import os

from dotenv import load_dotenv
from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from huggingface_hub import login
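
# Load the Hugging Face Hub token from .env (or the environment) and log in.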
load_dotenv()

HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    raise ValueError("Hugging Face token not found. Check your `.env` file or environment variables.")

login(HF_TOKEN)
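
# Base model to fine-tune and the Hub repo to push the fine-tuned model to.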
BASE_MODEL = "Sakalti/template-4"
HF_REPO = "Sakalti/template-16"
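
# Training corpora: an English-Japanese parallel corpus plus Japanese instruction data for Python and math.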
dataset1 = load_dataset("Verah/JParaCrawl-Filtered-English-Japanese-Parallel-Corpus", split="train")
dataset2 = load_dataset("HachiML/alpaca_jp_python", split="train")
dataset3 = load_dataset("HachiML/alpaca_jp_math", split="train")

# The three datasets use different column layouts, and concatenate_datasets requires identical
# features, so reduce each one to a single "text" column first. The "instruction"/"input"/"output"
# names follow the usual alpaca schema and are an assumption about the two HachiML datasets.
dataset1 = dataset1.map(lambda ex: {"text": ex["english"] + " " + ex["japanese"]}, remove_columns=dataset1.column_names)
dataset2 = dataset2.map(lambda ex: {"text": ex["instruction"] + "\n" + ex["input"] + "\n" + ex["output"]}, remove_columns=dataset2.column_names)
dataset3 = dataset3.map(lambda ex: {"text": ex["instruction"] + "\n" + ex["input"] + "\n" + ex["output"]}, remove_columns=dataset3.column_names)

dataset = concatenate_datasets([dataset1, dataset2, dataset3])
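
# Load the base model and its tokenizer.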
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL)
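
# Some causal-LM tokenizers ship without a pad token; the collator used below needs one
# for batch padding, so fall back to the EOS token (a common default; assumption that it
# is appropriate for this base model).
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token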


def preprocess(examples):
    # Tokenize the unified "text" column. Labels are created per batch by the
    # language-modeling collator, so only input_ids/attention_mask are needed here.
    return tokenizer(examples["text"], max_length=256, truncation=True)


tokenized_dataset = dataset.map(preprocess, batched=True, remove_columns=dataset.column_names)
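
# Training hyperparameters; checkpoints and the final model are pushed to HF_REPO on the Hub.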
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    num_train_epochs=3,
    save_total_limit=2,
    save_steps=500,
    push_to_hub=True,
    hub_model_id=HF_REPO,
    hub_token=HF_TOKEN,
    logging_steps=100,
)
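
# The default collator cannot batch variable-length sequences; DataCollatorForLanguageModeling
# with mlm=False pads each batch and builds causal-LM labels from input_ids (pad positions -> -100).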
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)
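
# Fine-tune, then upload the trained model and the tokenizer to the Hub repo.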
trainer.train()

trainer.push_to_hub()
tokenizer.push_to_hub(HF_REPO)

print("Upload complete!")