import os

import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

# Base checkpoint to fine-tune and the identifier the result is saved under.
base_model_id = "mistralai/Mistral-7B-Instruct-v0.1"
WORK = "vn_v2"
new_model_id = f"kmichiru/Nikaido-7B-mistral-instruct-v0.3-{WORK}"

# JSON-lines files backing the "train" and "eval" splits.
DSET = {
    "train": f"dataset_iroseka/{WORK}_train.jsonl",
    "eval": f"dataset_iroseka/{WORK}_val.jsonl",
}
dataset = load_dataset("json", data_files=DSET)

# The tokenizer reuses its EOS token for padding and pads on the right.
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
def dialogue(role, content):
    """Build one chat message mapping with ``role`` and ``content`` keys."""
    message = dict(role=role, content=content)
    return message
def format_chat_history(example):
    """Render an example as a two-turn conversation via the chat template.

    All messages whose role is ``"user"`` are merged, newline-separated, into
    a single user turn; the last message of ``example["messages"]`` is
    appended unchanged as the second turn.

    Returns:
        The result of ``tokenizer.apply_chat_template`` called with
        ``tokenize=False``.
    """
    history = example["messages"]
    merged_user_text = "\n".join(
        turn["content"] for turn in history if turn["role"] == "user"
    )
    conversation = [dialogue("user", merged_user_text), history[-1]]
    return tokenizer.apply_chat_template(conversation, tokenize=False)
def prep_speaker(msg: str) -> str:
    """Normalize a ``speaker: text`` line.

    Ideographic spaces (U+3000) are replaced with ASCII spaces, the line is
    split at its first ``:``, and both parts are stripped. An empty speaker
    is replaced by the placeholder ``"傍白"``.

    A line containing no ``:`` is treated as having no speaker (the whole
    line becomes the content) instead of raising ``ValueError``.

    Args:
        msg: Raw message text.

    Returns:
        The line re-assembled as ``"<speaker>: <content>"``.
    """
    msg = msg.replace("\u3000", " ")
    speaker, separator, content = msg.partition(":")
    if not separator:
        # No speaker prefix at all: keep the whole line as content.
        speaker, content = "", msg
    speaker = speaker.strip()
    content = content.strip()
    if not speaker:
        speaker = "傍白"
    return f"{speaker}: {content}"
def format_chat_history_v2(example):
    """Flatten an example's messages into one ``[INST] ... [/INST]`` string.

    The output starts with ``"<s>"``; every message whose role is not
    ``"system"`` is passed through ``prep_speaker`` and wrapped in
    ``[INST] ... [/INST]``. All pieces are joined with single spaces.
    """
    segments = ["<s>"]
    segments.extend(
        f"[INST] {prep_speaker(turn['content'])} [/INST]"
        for turn in example["messages"]
        if turn["role"] != "system"
    )
    return " ".join(segments)
# Sanity check: show how the first training example is rendered.
sample_prompt = format_chat_history_v2(dataset["train"][0])
print(sample_prompt)
def generate_and_tokenize_prompt(prompt, max_length=2048):
    """Tokenize one dataset example into fixed-length causal-LM features.

    Args:
        prompt: A dataset example accepted by ``format_chat_history_v2``.
        max_length: Length every sequence is truncated or padded to.

    Returns:
        The tokenizer output with an added ``labels`` entry holding a copy
        of ``input_ids``.
    """
    result = tokenizer(
        format_chat_history_v2(prompt),
        truncation=True,
        max_length=max_length,
        padding="max_length",
        # The formatted text already begins with a literal "<s>"; letting the
        # tokenizer insert its own BOS as well would duplicate that token.
        add_special_tokens=False,
    )
    # Copy so that labels and input_ids are independent lists.
    result["labels"] = list(result["input_ids"])
    return result
# Apply the tokenization function to the loaded dataset.
tokenized_dataset = dataset.map(generate_and_tokenize_prompt)

# Print one tokenized training example as a sanity check.
print(tokenized_dataset["train"][0])

from peft import LoraConfig, get_peft_model
def print_trainable_parameters(model):
    """Print how many of the model's parameters are trainable.

    Args:
        model: Any object whose ``named_parameters()`` yields ``(name, param)``
            pairs where ``param`` offers ``numel()`` and ``requires_grad``.

    The printed line reports the trainable count, the total count and the
    trainable percentage. A model without parameters is reported as 0.0%
    rather than raising ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the ratio: an empty model has nothing to divide by.
    percent = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params:,} || all params: {all_param:,} || trainable%: {percent}"
    )
# Load the base checkpoint with bfloat16 weights.
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=torch.bfloat16,
)

# Module names handed to LoraConfig as adapter targets.
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_target_modules,
)

# Wrap the base model with the LoRA adapters and report what is trainable.
model = get_peft_model(model, peft_config)
print_trainable_parameters(model)
print(model)
import os
import wandb

# Point wandb at the project through the environment (skipped if the name is empty).
wandb_project = "NikaidoLM"
if wandb_project:
    os.environ["WANDB_PROJECT"] = wandb_project

import transformers
from datetime import datetime

# Run name: <base model>-<project>-<work>-<timestamp to the minute>.
project = wandb_project
base_model_name = "mistral"
run_name = "-".join((base_model_name, project))
timestamp = datetime.now().strftime('%Y-%m-%d-%H-%M')
output_name = f"{run_name}-{WORK}-{timestamp}"
output_dir = "/scratch/generalvision/mowentao/mistral-out/" + output_name
# Hyper-parameters and bookkeeping for the training run.
training_args = transformers.TrainingArguments(
    output_dir=output_dir,
    logging_dir=output_dir,
    run_name=output_name,
    report_to="wandb",
    # Optimisation.
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    learning_rate=1e-4,
    weight_decay=5e-4,
    warmup_steps=500,
    lr_scheduler_type="cosine",
    optim="paged_adamw_32bit",
    bf16=True,
    # Logging, checkpointing and evaluation cadence.
    logging_steps=100,
    save_strategy="steps",
    save_steps=500,
    evaluation_strategy="steps",
    eval_steps=100,
    do_eval=True,
)

# Language-modeling collator with masked-LM mode switched off.
collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["eval"],
    data_collator=collator,
)

# Turn off `use_cache` on the model config before training starts.
model.config.use_cache = False
trainer.train()

# Save the trained model under `new_model_id`, then close the wandb run.
trainer.model.save_pretrained(new_model_id)
wandb.finish()