huggingface/trl SFTTrainer
# Standard causal-LM collator: the loss is computed over the entire sequence
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
# Masked-LM collator (BERT-style masking; requires a mask token, not used for causal SFT)
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True)
# Completion-only collator: the loss is computed only on the tokens after response_template
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)
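For reference, here is a minimal, hedged sketch contrasting the labels produced by the plain causal-LM collator and the completion-only collator (the example text reuses the dolly-ja record shown below; the model name matches the one used later in this page):

import torch
from transformers import AutoTokenizer, DataCollatorForLanguageModeling
from trl import DataCollatorForCompletionOnlyLM

tokenizer = AutoTokenizer.from_pretrained("cyberagent/open-calm-large")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

response_template = "### 応答:\n"
text = "### 指示:\n魚の種類はどっち?イコクエイラクブカとロープ\n\n### 応答:\nイコクエイラクブカ<|endoftext|>"
features = [tokenizer(text)]

lm_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
completion_collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

# mlm=False: labels are a copy of input_ids, so the whole prompt contributes to the loss.
print(lm_collator(features)["labels"][0])
# Completion-only: everything up to and including "### 応答:\n" is masked to -100,
# so only the response tokens contribute to the loss.
print(completion_collator(features)["labels"][0])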
from datasets import load_dataset
dolly_dataset = load_dataset("kunishou/databricks-dolly-15k-ja")
# For simplicity, keep only the records whose "input" field is empty
# (following npaka's article: https://note.com/npaka/n/nc7a4c20f98f8)
dolly_train_dataset = dolly_dataset['train'].filter(lambda example: example['input'] == '')
print(dolly_train_dataset)
# 10,417 records in total
#Dataset({
# features: ['output', 'index', 'category', 'instruction', 'input'],
# num_rows: 10417
#})
print(dolly_train_dataset[0])
#{'output': 'イコクエイラクブカ',
# 'index': '1',
# 'category': 'classification',
# 'instruction': '魚の種類はどっち?イコクエイラクブカとロープ',
# 'input': ''}

Alternatively, a local JSONL file can be loaded:

train_file = "data/train.jsonl"
dataset = datasets.load_dataset("json", data_files=train_file, split="train") # data_filesが1つの場合、splitの指定は無意味
dataset = datasets.DatasetDict({'train': dataset}) # ココのtrainは、dictionaryのキーとして機能する。{"answer":"XXX","question":"YYY","context":"ZZZ"}
{"answer":"XXX","question":"YYY","context":"ZZZ"}
{"answer":"XXX","question":"YYY","context":"ZZZ"}
...

print(dataset) then shows:

DatasetDict({
train: Dataset({
features: ...,
num_rows: ...,
})
})

Note: the example above using kunishou/databricks-dolly-15k-ja does not use context, so it is filtered down to records whose input (≒ context) is empty.
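As a self-contained illustration of the JSONL workflow above (the file path and field values are the placeholders from the example), the following sketch writes a small train.jsonl and loads it back:

import json
import os

import datasets

# Write a few records in the {"answer", "question", "context"} format (values are placeholders).
os.makedirs("data", exist_ok=True)
records = [
    {"answer": "XXX", "question": "YYY", "context": "ZZZ"},
    {"answer": "XXX", "question": "YYY", "context": "ZZZ"},
]
with open("data/train.jsonl", "w", encoding="utf-8") as f:
    for record in records:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

# Load it back: one JSON object per line becomes one row of the 'train' split.
dataset = datasets.load_dataset("json", data_files="data/train.jsonl", split="train")
dataset = datasets.DatasetDict({"train": dataset})
print(dataset)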
| Option | Description |
| device_map="auto" | Automatically place the weights on the available GPUs/CPU |
| torch_dtype=torch.bfloat16 | Load the weights in bfloat16 to save memory and speed things up |
| trust_remote_code=True | Allow custom code shipped with the model definition to be executed |
| force_download=True | Ignore the cache and re-download the files |
from transformers import AutoModelForCausalLM, AutoTokenizer
# open-calm-large is a relatively small LLM with roughly 700 million parameters.
# Even so, running the code in this article as-is may be close to the limit of the A100 available on Colab Pro+ (untested).
# If your GPU resources are limited and you only need to confirm the code runs, consider switching to open-calm-small.
model_name = "cyberagent/open-calm-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
model_name,
device_map="auto",
    # torch_dtype=torch.float16,
    # With torch.float16 the training loss collapses to 0.0 and learning fails.
    # Setting bf16=True in TrainingArguments instead gave normal training behavior (this model was pretrained on TPU).
)

If save_pretrained has already been called, the model can be reloaded with model = AutoModelForCausalLM.from_pretrained('./output', device_map="auto").

Printing the model shows the GPT-NeoX architecture:

GPTNeoXForCausalLM(
(gpt_neox): GPTNeoXModel(
(embed_in): Embedding(52096, 1536)
(emb_dropout): Dropout(p=0.0, inplace=False)
(layers): ModuleList(
(0-23): 24 x GPTNeoXLayer(
(input_layernorm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
(post_attention_layernorm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
(post_attention_dropout): Dropout(p=0.0, inplace=False)
(post_mlp_dropout): Dropout(p=0.0, inplace=False)
(attention): GPTNeoXAttention(
(rotary_emb): GPTNeoXRotaryEmbedding()
(query_key_value): Linear(in_features=1536, out_features=4608, bias=True)
(dense): Linear(in_features=1536, out_features=1536, bias=True)
(attention_dropout): Dropout(p=0.0, inplace=False)
)
(mlp): GPTNeoXMLP(
(dense_h_to_4h): Linear(in_features=1536, out_features=6144, bias=True)
(dense_4h_to_h): Linear(in_features=6144, out_features=1536, bias=True)
(act): GELUActivation()
)
)
)
(final_layer_norm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
)
(embed_out): Linear(in_features=1536, out_features=52096, bias=False)
)

import transformers
from transformers import (
AutoTokenizer
)
import torch
MODEL_URI = "meta-llama/Llama-3.2-1B-Instruct"
MODEL_PATH = 'models/Llama-3.2-1B-Instruct'
TOKENIZER_PATH = 'models/Llama-3.2-1B-Instruct'
# model
model = transformers.AutoModelForCausalLM.from_pretrained(
MODEL_URI,
torch_dtype=torch.bfloat16,
trust_remote_code=True,
force_download=True
)
model.save_pretrained(MODEL_PATH)
# tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_URI)
tokenizer.save_pretrained(TOKENIZER_PATH)

# model
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH)
# tokenizer
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Llama tokenizers define no pad_token by default; batch padding during training needs one, so fall back to eos_token
Instruction tuning is a technique that improves a language model's ability to follow instructions by training it on pairs of instructions and their responses.
text = f"以下は、タスクを説明する指示です。要求を適切に満たす応答を書きなさい\n\n### 指示: \n{example['instruction'][i]} \n\n### 応答: \n{example['output'][i]}<|endoftext|>"text = f"Please answer the question based on the given context. \n\n### question\n{example['question'][i]}\n\n ### context\n{example['context'][i]}\n\n### answer\n{example['answer'][i]}<|endoftext|>"print(tokenizer.eos_token)
#'<|endoftext|>'
def formatting_prompts_func(example):
output_texts = []
for i in range(len(example['instruction'])):
text = f"以下は、タスクを説明する指示です。要求を適切に満たす応答を書きなさい\n\n### 指示:\n{example['instruction'][i]}\n\n### 応答:\n{example['output'][i]}<|endoftext|>"
output_texts.append(text)
    return output_texts

def formatting_prompts_func(example):
output_texts = []
for i in range(len(example['question'])):
text = f"Please answer the question based on the given context.\n\n### question\n{example['question'][i]}\n\n### context\n{example['context'][i]}\n\n### answer\n{example['answer'][i]}<|endoftext|>"
output_texts.append(text)
    return output_texts

response_template = "### 応答:\n"     # for the question/context/answer format: "### answer\n"
instruction_template = "### 指示:\n"  # for the question/context/answer format: "### question\n"

from trl import DataCollatorForCompletionOnlyLM

# response_template must be specified
response_template = "### 応答:\n"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)
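The instruction_template defined above is not actually passed to the collator in this example. As a hedged sketch of the trl API, it can be supplied alongside response_template, which is mainly useful for multi-turn data where every response segment after a "### 応答:\n" marker should contribute to the loss:

from trl import DataCollatorForCompletionOnlyLM

# Sketch: supplying both templates; with instruction_template set, the collator
# masks each instruction segment and keeps every response segment in the loss.
collator = DataCollatorForCompletionOnlyLM(
    response_template="### 応答:\n",
    instruction_template="### 指示:\n",
    tokenizer=tokenizer,
)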
from transformers import TrainingArguments
from trl import SFTTrainer
# SFTTrainer can take a TrainingArguments instance.
# If none is specified, the TrainingArguments defaults are used.
args = TrainingArguments(
output_dir='./output',
num_train_epochs=2,
gradient_accumulation_steps=8,
per_device_train_batch_size=8,
save_strategy="no",
logging_steps=20,
lr_scheduler_type="constant",
save_total_limit=1,
fp16=True,
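    # Note: if the training loss collapses to 0.0 with fp16 on this model, bf16=True may work instead (see the from_pretrained comment above).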
)
trainer = SFTTrainer(
model,
args=args,
train_dataset=dolly_train_dataset,
formatting_func=formatting_prompts_func,
max_seq_length=1024,
data_collator=collator,
)

print(trainer.train_dataset)
# Dataset({
# features: ['input_ids', 'attention_mask'],
# num_rows: 10417
# })
print(tokenizer.decode(trainer.train_dataset[0]['input_ids']))
# 以下は、タスクを説明する指示です。要求を適切に満たす応答を書きなさい
#
# ### 指示:
# XXXXXXXXXX
#
# ### 応答:
# YYYYYYYYYY<|endoftext|>

from torch.utils.data import DataLoader

loader = DataLoader(trainer.train_dataset, collate_fn=collator, batch_size=8)
batch = next(iter(loader))
print(batch['labels'][0])

#tensor([ -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
#        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
#        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
#        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
#        -100, -100, -100, -100,  275, 19052, 4044, 2048,  431,  367,
#           0, -100, -100, -100, -100, -100, -100, -100, -100, -100,
#        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
#        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
#        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
#        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
#        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
#        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
#        -100, -100, -100, -100, -100])
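To confirm that only the response survives the masking, the non-masked label positions can be decoded (a small sketch reusing the batch and tokenizer from above):

# Decode only the label positions that are not -100; this should print
# the response text followed by <|endoftext|>.
labels = batch['labels'][0]
print(tokenizer.decode(labels[labels != -100]))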
trainer.train()
trainer.save_model()
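As a quick sanity check right after training, the in-memory model can be prompted in the same format used for training. A hedged sketch (the instruction reuses the dolly-ja record shown earlier; generation settings are illustrative):

import torch

# Build a prompt in the training format and let the fine-tuned model complete it.
prompt = (
    "以下は、タスクを説明する指示です。要求を適切に満たす応答を書きなさい\n\n"
    "### 指示:\n魚の種類はどっち?イコクエイラクブカとロープ\n\n### 応答:\n"
)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    output_ids = model.generate(
        **inputs,
        max_new_tokens=64,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
    )
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))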
# If trainer.save_model() has been run, the fully fine-tuned model can be reloaded from the output directory.
trained_model = AutoModelForCausalLM.from_pretrained('./output', device_map="auto")

from peft import LoraConfig
peft_config = LoraConfig(
r=8,
lora_alpha=16,
lora_dropout=0.05,
bias="none",
task_type="CAUSAL_LM",
)
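No target_modules is given here, so peft falls back to its built-in default for GPT-NeoX models, which targets the query_key_value projection (consistent with the module dump shown further below); this is my understanding of the peft defaults. To adapt additional modules, they can be listed explicitly, e.g.:

# Alternative (not used below): explicitly choose which modules get LoRA adapters.
# Module names follow the GPTNeoX layer names printed earlier.
peft_config_explicit = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["query_key_value", "dense_h_to_4h", "dense_4h_to_h"],
)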
args = TrainingArguments(
output_dir='./output_lora',
    ...same as above...,
)
trainer = SFTTrainer(
    ...same as above...,
peft_config=peft_config,
)
trainer.train()
trainer.save_model()

lora_model = AutoModelForCausalLM.from_pretrained('./output_lora', device_map="auto")

# Inspect part of the FFN weight in the first layer for comparison below.
# (In the original run this snapshot was taken on the base model before LoRA wrapping, hence requires_grad=True in the output.)
first_ffn_param = model.gpt_neox.layers[0].mlp.dense_h_to_4h.weight
print(first_ffn_param)

#Parameter containing:
#tensor([[-0.1072,  0.0417, -0.0432,  ..., -0.0873, -0.1708, -0.1608],
#        [-0.0934,  0.0773,  0.0074,  ..., -0.2107,  0.0881, -0.0803],
#        [-0.0506, -0.1282, -0.1511,  ...,  0.1120, -0.0126, -0.1172],
#        ...,
#        [ 0.1274, -0.0688,  0.1787,  ...,  0.1432,  0.0266, -0.1370],
#        [-0.1108, -0.0758,  0.0035,  ..., -0.0404, -0.1801,  0.0338],
#        [ 0.0669,  0.0399, -0.0443,  ..., -0.2275, -0.1323,  0.0034]],
#       device='cuda:0', requires_grad=True)

Printing the PEFT-wrapped model (e.g. trainer.model) shows the LoRA adapters inserted into query_key_value:

PeftModelForCausalLM(
(base_model): LoraModel(
(model): GPTNeoXForCausalLM(
(gpt_neox): GPTNeoXModel(
(embed_in): Embedding(52096, 1536)
(emb_dropout): Dropout(p=0.0, inplace=False)
(layers): ModuleList(
(0-23): 24 x GPTNeoXLayer(
(input_layernorm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
(post_attention_layernorm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
(post_attention_dropout): Dropout(p=0.0, inplace=False)
(post_mlp_dropout): Dropout(p=0.0, inplace=False)
(attention): GPTNeoXAttention(
(rotary_emb): GPTNeoXRotaryEmbedding()
(query_key_value): Linear(
in_features=1536, out_features=4608, bias=True
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=1536, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=4608, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
)
(dense): Linear(in_features=1536, out_features=1536, bias=True)
(attention_dropout): Dropout(p=0.0, inplace=False)
)
(mlp): GPTNeoXMLP(
(dense_h_to_4h): Linear(in_features=1536, out_features=6144, bias=True)
(dense_4h_to_h): Linear(in_features=6144, out_features=1536, bias=True)
(act): GELUActivation()
)
)
)
(final_layer_norm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
)
(embed_out): Linear(in_features=1536, out_features=52096, bias=False)
)
)
)

trainer.model.print_trainable_parameters()
#trainable params: 1,179,648 || all params: 841,178,112 || trainable%: 0.14023760047622352
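The trainable-parameter count is consistent with the module dump above: each of the 24 layers gets one lora_A (1536×8) and one lora_B (8×4608) matrix on query_key_value.

# 24 layers x (lora_A: 1536*8 + lora_B: 8*4608) = 1,179,648 trainable LoRA parameters
print(24 * (1536 * 8 + 8 * 4608))  # 1179648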
# Layers without an inserted adapter keep exactly the same parameters as the base model.
(lora_model.gpt_neox.layers[0].mlp.dense_h_to_4h.weight == first_ffn_param).all()
#tensor(True, device='cuda:0')  ## all elements of the tensors on the GPU match
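If a single standalone checkpoint is preferred over a base model plus adapter, the adapter can be merged into the base weights. A hedged sketch using the peft API (the merged output directory name is illustrative):

from peft import PeftModel
from transformers import AutoModelForCausalLM

# Attach the saved adapter to a fresh copy of the base model, fold the LoRA
# deltas into the base weights, and save a plain (non-PEFT) checkpoint.
base_model = AutoModelForCausalLM.from_pretrained("cyberagent/open-calm-large", device_map="auto")
peft_model = PeftModel.from_pretrained(base_model, "./output_lora")
merged_model = peft_model.merge_and_unload()
merged_model.save_pretrained("./output_lora_merged")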