huggingface/trl SFTTrainer
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)  # causal LM: loss is computed on every token
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True)   # masked LM (BERT-style masking)
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)  # loss is computed only on the part after response_template
from datasets import load_dataset

dolly_dataset = load_dataset("kunishou/databricks-dolly-15k-ja")

# For simplicity, keep only the records whose input is empty
# Based on npaka's article (https://note.com/npaka/n/nc7a4c20f98f8)
dolly_train_dataset = dolly_dataset['train'].filter(lambda example: example['input'] == '')

print(dolly_train_dataset)
# 10,417 records in total
#Dataset({
#    features: ['output', 'index', 'category', 'instruction', 'input'],
#    num_rows: 10417
#})

print(dolly_train_dataset[0])
#{'output': 'イコクエイラクブカ',
# 'index': '1',
# 'category': 'classification',
# 'instruction': '魚の種類はどっち?イコクエイラクブカとロープ',
# 'input': ''}
import datasets

train_file = "data/train.jsonl"
dataset = datasets.load_dataset("json", data_files=train_file, split="train")  # with a single data_files entry, the split argument has no practical effect
dataset = datasets.DatasetDict({'train': dataset})  # 'train' here simply acts as the dictionary key
{"answer":"XXX","question":"YYY","context":"ZZZ"} {"answer":"XXX","question":"YYY","context":"ZZZ"} {"answer":"XXX","question":"YYY","context":"ZZZ"} ...
DatasetDict({
    train: Dataset({
        features: ...,
        num_rows: ...,
    })
})
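If an evaluation split is also wanted, one option (a sketch, not part of the original steps; the 0.1 ratio is just an example) is train_test_split, which returns a DatasetDict with 'train' and 'test' keys directly:

import datasets

dataset = datasets.load_dataset("json", data_files="data/train.jsonl", split="train")
# Split off 10% of the records as a test set; the result is a DatasetDict
dataset = dataset.train_test_split(test_size=0.1, seed=42)
print(dataset)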
※ In the example that uses kunishou/databricks-dolly-15k-ja, context is not used, so the data is filtered down to records whose input (≈ context) is empty.
Option | Description |
device_map="auto" | Automatically assigns the model weights to GPU/CPU |
torch_dtype=torch.bfloat16 | Explicitly uses the bfloat16 data type to save memory and speed things up |
trust_remote_code=True | Allows custom code bundled with the model definition to run |
force_download=True | Ignores the cache and forces the files to be downloaded again |
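For reference, a minimal sketch combining these options (the model name is the one used later in this page; adjust it to your environment):

import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-1B-Instruct",
    device_map="auto",           # assign weights to GPU/CPU automatically
    torch_dtype=torch.bfloat16,  # load weights as bfloat16
    trust_remote_code=True,      # allow custom modeling code if the repo ships any
    force_download=True,         # ignore the local cache and re-download
)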
from transformers import AutoModelForCausalLM, AutoTokenizer

# open-calm-large is a relatively small LLM with roughly 0.7B parameters.
# Even so, running the code in this article as-is may just barely fit on the A100 available with Colab Pro+ (untested).
# If your GPU resources are limited and you only need to verify that things run, switching to open-calm-small may be a good idea.
model_name = "cyberagent/open-calm-large"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    # torch_dtype=torch.float16,
    # Specifying torch.float16 makes the training loss go to 0.0 and training fails.
    # Setting bf16=True in TrainingArguments made evaluation work correctly (the model was pre-trained on TPU).
)
If save_pretrained has been called, the model can be reloaded with model = AutoModelForCausalLM.from_pretrained('./output', device_map="auto").
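A minimal save/load round trip, assuming the model and tokenizer objects from the snippet above (the './output' path is just an example):

# Save the model and tokenizer to a local directory
model.save_pretrained('./output')
tokenizer.save_pretrained('./output')

# Reload them later from the same directory
model = AutoModelForCausalLM.from_pretrained('./output', device_map="auto")
tokenizer = AutoTokenizer.from_pretrained('./output')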
GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(52096, 1536)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=1536, out_features=4608, bias=True)
          (dense): Linear(in_features=1536, out_features=1536, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=1536, out_features=6144, bias=True)
          (dense_4h_to_h): Linear(in_features=6144, out_features=1536, bias=True)
          (act): GELUActivation()
        )
      )
    )
    (final_layer_norm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
  )
  (embed_out): Linear(in_features=1536, out_features=52096, bias=False)
)
import transformers
from transformers import (
    AutoTokenizer
)
import torch

MODEL_URI = "meta-llama/Llama-3.2-1B-Instruct"
MODEL_PATH = 'models/Llama-3.2-1B-Instruct'
TOKENIZER_PATH = 'models/Llama-3.2-1B-Instruct'

# model
model = transformers.AutoModelForCausalLM.from_pretrained(
    MODEL_URI,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    force_download=True
)
model.save_pretrained(MODEL_PATH)

# tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_URI)
tokenizer.save_pretrained(TOKENIZER_PATH)
# model
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH)

# tokenizer
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)
if tokenizer.pad_token is None:
    # Is this really needed? (Llama tokenizers define no pad token, so batch padding fails without it)
    tokenizer.pad_token = tokenizer.eos_token
...
Instruction Tuning is a technique that improves a language model's ability to follow instructions by training it on pairs of instructions (Instruction) and their responses (Response).
text = f"以下は、タスクを説明する指示です。要求を適切に満たす応答を書きなさい\n\n### 指示: \n{example['instruction'][i]} \n\n### 応答: \n{example['output'][i]}<|endoftext|>"
text = f"Please answer the question based on the given context. \n\n### question\n{example['question'][i]}\n\n ### context\n{example['context'][i]}\n\n### answer\n{example['answer'][i]}<|endoftext|>"
print(tokenizer.eos_token)
#'<|endoftext|>'

def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['instruction'])):
        text = f"以下は、タスクを説明する指示です。要求を適切に満たす応答を書きなさい\n\n### 指示:\n{example['instruction'][i]}\n\n### 応答:\n{example['output'][i]}<|endoftext|>"
        output_texts.append(text)
    return output_texts
def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['question'])):
        text = f"Please answer the question based on the given context.\n\n### question\n{example['question'][i]}\n\n### context\n{example['context'][i]}\n\n### answer\n{example['answer'][i]}<|endoftext|>"
        output_texts.append(text)
    return output_texts
response_template = "### 応答:\n" # "### answer\n"
instruction_template = "### 指示:\n" # "### question\n"
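If instruction_template is passed as well, DataCollatorForCompletionOnlyLM can also handle data that contains multiple instruction/response turns, still computing the loss only on the response segments. A minimal sketch using the templates defined above:

from trl import DataCollatorForCompletionOnlyLM

collator = DataCollatorForCompletionOnlyLM(
    instruction_template=instruction_template,
    response_template=response_template,
    tokenizer=tokenizer,
)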
from trl import DataCollatorForCompletionOnlyLM

# response_template is a required argument
response_template = "### 応答:\n"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)
from transformers import TrainingArguments
from trl import SFTTrainer

# SFTTrainer accepts a TrainingArguments instance.
# If none is given, the TrainingArguments defaults are used.
args = TrainingArguments(
    output_dir='./output',
    num_train_epochs=2,
    gradient_accumulation_steps=8,
    per_device_train_batch_size=8,
    save_strategy="no",
    logging_steps=20,
    lr_scheduler_type="constant",
    save_total_limit=1,
    fp16=True,
)

trainer = SFTTrainer(
    model,
    args=args,
    train_dataset=dolly_train_dataset,
    formatting_func=formatting_prompts_func,
    max_seq_length=1024,
    data_collator=collator,
)
print(trainer.train_dataset)
# Dataset({
#     features: ['input_ids', 'attention_mask'],
#     num_rows: 10417
# })

print(tokenizer.decode(trainer.train_dataset[0]['input_ids']))
# 以下は、タスクを説明する指示です。要求を適切に満たす応答を書きなさい
#
# ### 指示:
# XXXXXXXXXX
#
# ### 応答:
# YYYYYYYYYY<|endoftext|>
from torch.utils.data import DataLoader

loader = DataLoader(trainer.train_dataset, collate_fn=collator, batch_size=8)
batch = next(iter(loader))
print(batch['labels'][0])
#tensor([ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
#         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
#         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
#         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
#         -100,  -100,  -100,  -100,   275, 19052,  4044,  2048,   431,   367,
#            0,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
#         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
#         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
#         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
#         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
#         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
#         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
#         -100,  -100,  -100,  -100,  -100])
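As a sanity check, a small sketch (assuming the batch from above) that decodes only the positions whose label is not -100; it should print just the response text:

# Decode only the tokens that actually contribute to the loss (label != -100)
labels = batch['labels'][0]
target_ids = [t.item() for t in labels if t.item() != -100]
print(tokenizer.decode(target_ids))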
trainer.train()
trainer.save_model()
lora_model = AutoModelForCausalLM.from_pretrained('./output', device_map="auto")
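To check the result, a minimal generation sketch (not part of the original steps; the question is just an example, and the prompt must use the same format as during training):

# Build a prompt in the training format and generate a response with the reloaded model
prompt = "以下は、タスクを説明する指示です。要求を適切に満たす応答を書きなさい\n\n### 指示:\n日本で一番高い山は?\n\n### 応答:\n"
inputs = tokenizer(prompt, return_tensors="pt").to(lora_model.device)
outputs = lora_model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))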
from peft import LoraConfig

peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

args = TrainingArguments(
    output_dir='./output_lora',
    ...,  # same as above
)

trainer = SFTTrainer(
    ...,  # same as above
    peft_config=peft_config,
)

trainer.train()
trainer.save_model()
lora_model = AutoModelForCausalLM.from_pretrained('./output_lora', device_map="auto")
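Since trainer.save_model() on a PEFT model saves the LoRA adapter, the same directory can also be loaded explicitly with peft (a sketch, assuming the base model name used above):

from peft import PeftModel
from transformers import AutoModelForCausalLM

# Load the base model first, then attach the saved LoRA adapter
base_model = AutoModelForCausalLM.from_pretrained("cyberagent/open-calm-large", device_map="auto")
lora_model = PeftModel.from_pretrained(base_model, './output_lora')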
# Inspect part of the FFN in the first layer
# At this point requires_grad=True
first_ffn_param = model.gpt_neox.layers[0].mlp.dense_h_to_4h.weight
print(first_ffn_param)
#Parameter containing:
#tensor([[-0.1072,  0.0417, -0.0432,  ..., -0.0873, -0.1708, -0.1608],
#        [-0.0934,  0.0773,  0.0074,  ..., -0.2107,  0.0881, -0.0803],
#        [-0.0506, -0.1282, -0.1511,  ...,  0.1120, -0.0126, -0.1172],
#        ...,
#        [ 0.1274, -0.0688,  0.1787,  ...,  0.1432,  0.0266, -0.1370],
#        [-0.1108, -0.0758,  0.0035,  ..., -0.0404, -0.1801,  0.0338],
#        [ 0.0669,  0.0399, -0.0443,  ..., -0.2275, -0.1323,  0.0034]],
#       device='cuda:0', requires_grad=True)
PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPTNeoXForCausalLM(
      (gpt_neox): GPTNeoXModel(
        (embed_in): Embedding(52096, 1536)
        (emb_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-23): 24 x GPTNeoXLayer(
            (input_layernorm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
            (post_attention_layernorm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
            (post_attention_dropout): Dropout(p=0.0, inplace=False)
            (post_mlp_dropout): Dropout(p=0.0, inplace=False)
            (attention): GPTNeoXAttention(
              (rotary_emb): GPTNeoXRotaryEmbedding()
              (query_key_value): Linear(
                in_features=1536, out_features=4608, bias=True
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1536, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4608, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (dense): Linear(in_features=1536, out_features=1536, bias=True)
              (attention_dropout): Dropout(p=0.0, inplace=False)
            )
            (mlp): GPTNeoXMLP(
              (dense_h_to_4h): Linear(in_features=1536, out_features=6144, bias=True)
              (dense_4h_to_h): Linear(in_features=6144, out_features=1536, bias=True)
              (act): GELUActivation()
            )
          )
        )
        (final_layer_norm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
      )
      (embed_out): Linear(in_features=1536, out_features=52096, bias=False)
    )
  )
)
trainer.model.print_trainable_parameters()
#trainable params: 1,179,648 || all params: 841,178,112 || trainable%: 0.14023760047622352
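The trainable-parameter count can be verified by hand: with r=8, LoRA is inserted only into query_key_value (1536 in, 4608 out) in each of the 24 layers, so 24 × 8 × (1536 + 4608) = 1,179,648.

# lora_A has shape (r, in_features), lora_B has shape (out_features, r), per adapted layer
r, d_in, d_out, n_layers = 8, 1536, 4608, 24
print(n_layers * (r * d_in + d_out * r))  # 1179648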
# In layers where no adapter was inserted, the parameters are exactly the same as in the base model
(lora_model.gpt_neox.layers[0].mlp.dense_h_to_4h.weight == first_ffn_param).all()
#tensor(True, device='cuda:0')  # every element of the tensor on the GPU matches
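If a single merged checkpoint is preferred for deployment, the adapter can be folded into the base weights. A sketch using peft's merge_and_unload (it assumes the model was loaded as a PeftModel as shown earlier; the './output_lora_merged' path is just an example):

from peft import PeftModel
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained("cyberagent/open-calm-large")
peft_model = PeftModel.from_pretrained(base, './output_lora')

# Fold the LoRA weights into the base weights and drop the adapter modules
merged = peft_model.merge_and_unload()
merged.save_pretrained('./output_lora_merged')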