from datasets import load_dataset
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import os

base_model_id = "mistralai/Mistral-7B-Instruct-v0.1"

WORK = "vn_v2"
new_model_id = f"kmichiru/Nikaido-7B-mistral-instruct-v0.3-{WORK}"
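
# The trained LoRA adapter is written locally under this id at the end of the script
# (see trainer.model.save_pretrained below).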

DSET = {
    "train": f"dataset_iroseka/{WORK}_train.jsonl",
    "eval": f"dataset_iroseka/{WORK}_val.jsonl",
}

dataset = load_dataset("json", data_files=DSET)
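
# Each JSONL record is expected (inferred from the formatting code below) to hold a
# chat transcript: {"messages": [{"role": ..., "content": "speaker: utterance"}, ...]}.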

tokenizer = AutoTokenizer.from_pretrained(base_model_id)
# Mistral ships without a dedicated pad token; reuse EOS and pad on the right for training.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

def dialogue(role, content):
    return {"role": role, "content": content}


def format_chat_history(example):
    # v1 (superseded by format_chat_history_v2 below): merge every user turn into a
    # single user message, keep the final turn, and let the chat template render it.
    user_msgs = []
    for msg in example["messages"]:
        if msg["role"] == "user":
            user_msgs.append(msg["content"])
    messages = [
        dialogue("user", "\n".join(user_msgs)),
        example["messages"][-1],
    ]
    encodeds = tokenizer.apply_chat_template(messages, tokenize=False)
    return encodeds

def prep_speaker(msg: str):
    # Normalize full-width spaces and "speaker: utterance" formatting; lines with an
    # empty speaker field are treated as narration (傍白).
    msg = msg.replace("\u3000", " ")
    speaker, content = msg.split(":", 1)
    speaker = speaker.strip()
    content = content.strip()
    if len(speaker) == 0:
        speaker = "傍白"
    return f"{speaker}: {content}"


def format_chat_history_v2(example):
    # v2: build the Mistral-instruct prompt by hand, wrapping every non-system turn
    # in [INST] ... [/INST] after the leading <s> token.
    user_msg = []
    user_msg.append("<s>")
    for msg in example["messages"]:
        if msg["role"] != "system":
            user_msg.append(f"[INST] {prep_speaker(msg['content'])} [/INST]")
    return " ".join(user_msg)
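
# Illustrative shape of the rendered prompt (speaker names here are placeholders;
# the real ones come from the dataset):
#   <s> [INST] SpeakerA: line one [/INST] [INST] SpeakerB: line two [/INST]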

# Sanity check: render one training example as a prompt string.
print(format_chat_history_v2(dataset['train'][0]))

def generate_and_tokenize_prompt(prompt, max_length=2048):
    # Tokenize the rendered prompt, truncating and padding to a fixed length.
    result = tokenizer(
        format_chat_history_v2(prompt),
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )
    result["labels"] = result["input_ids"]
    return result


tokenized_dataset = dataset.map(generate_and_tokenize_prompt)
print(tokenized_dataset['train'][0])
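
# The labels above mirror input_ids, i.e. standard causal-LM fine-tuning where the
# model is trained to reproduce the full prompt. A common variant (sketch only, not
# used here) excludes padding from the loss by masking it with -100:
#   result["labels"] = [
#       tok if mask == 1 else -100
#       for tok, mask in zip(result["input_ids"], result["attention_mask"])
#   ]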

from peft import LoraConfig, get_peft_model


def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params:,} || all params: {all_param:,} || trainable%: {100 * trainable_params / all_param}"
    )


model = AutoModelForCausalLM.from_pretrained(base_model_id, torch_dtype=torch.bfloat16)

# LoRA on all attention and MLP projection layers of the Mistral blocks.
peft_config = LoraConfig(
    r=64,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
)
model = get_peft_model(model, peft_config)
print_trainable_parameters(model)
print(model)

import wandb

wandb_project = "NikaidoLM"
if len(wandb_project) > 0:
    os.environ["WANDB_PROJECT"] = wandb_project

import transformers
from datetime import datetime

project = wandb_project
base_model_name = "mistral"
run_name = base_model_name + "-" + project
output_name = f"{run_name}-{WORK}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"
output_dir = "/scratch/generalvision/mowentao/mistral-out/" + output_name

trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["eval"],
    args=transformers.TrainingArguments(
        output_dir=output_dir,
        warmup_steps=500,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=2,
        num_train_epochs=3,
        weight_decay=5e-4,
        learning_rate=1e-4,
        bf16=True,
        optim="paged_adamw_32bit",
        logging_steps=100,
        logging_dir=output_dir,
        save_strategy="steps",
        save_steps=500,
        evaluation_strategy="steps",
        eval_steps=100,
        do_eval=True,
        report_to="wandb",
        run_name=output_name,
        lr_scheduler_type="cosine",
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
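
# Caveats: the "paged_adamw_32bit" optimizer requires bitsandbytes, and recent
# transformers releases rename evaluation_strategy to eval_strategy.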

# The KV cache is only useful for generation; disable it during training.
model.config.use_cache = False
trainer.train()
trainer.model.save_pretrained(new_model_id)
wandb.finish()
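
# To reuse the adapter for inference, it can be loaded back onto the base model.
# Sketch only, assuming the local adapter directory written by save_pretrained above:
#
# from peft import PeftModel
# base = AutoModelForCausalLM.from_pretrained(base_model_id, torch_dtype=torch.bfloat16)
# inference_model = PeftModel.from_pretrained(base, new_model_id)
# inference_model.config.use_cache = True  # re-enable the KV cache for generation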