import os

from dotenv import load_dotenv
from datasets import load_dataset, concatenate_datasets
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from huggingface_hub import login

# === Load the token ===
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    raise ValueError("Hugging Face token not found. Check your `.env` file or environment variables.")
login(HF_TOKEN)

# === Settings ===
BASE_MODEL = "Sakalti/template-4"
HF_REPO = "Sakalti/template-16"

# === Load the data ===
dataset1 = load_dataset("Verah/JParaCrawl-Filtered-English-Japanese-Parallel-Corpus", split="train")
dataset2 = load_dataset("HachiML/alpaca_jp_python", split="train")
dataset3 = load_dataset("HachiML/alpaca_jp_math", split="train")

# concatenate_datasets requires identical features, so map each dataset to a single
# "text" column first. The alpaca-style datasets are assumed to expose
# instruction / input / output columns; adjust if their actual schema differs.
def to_text_parallel(example):
    return {"text": example["english"] + " " + example["japanese"]}

def to_text_alpaca(example):
    return {"text": example["instruction"] + "\n" + example["input"] + "\n" + example["output"]}

dataset1 = dataset1.map(to_text_parallel, remove_columns=dataset1.column_names)
dataset2 = dataset2.map(to_text_alpaca, remove_columns=dataset2.column_names)
dataset3 = dataset3.map(to_text_alpaca, remove_columns=dataset3.column_names)
dataset = concatenate_datasets([dataset1, dataset2, dataset3])

# === Prepare the tokenizer & model ===
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL)
if tokenizer.pad_token is None:
    # Causal-LM tokenizers often ship without a pad token; reuse EOS for padding.
    tokenizer.pad_token = tokenizer.eos_token

# === Tokenization function (revised) ===
def preprocess(examples):
    return tokenizer(examples["text"], max_length=256, truncation=True)

tokenized_dataset = dataset.map(preprocess, batched=True, remove_columns=dataset.column_names)

# The collator pads each batch and builds the causal-LM labels from input_ids.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# === Training configuration ===
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    num_train_epochs=3,
    save_total_limit=2,
    save_steps=500,
    push_to_hub=True,
    hub_model_id=HF_REPO,
    hub_token=HF_TOKEN,
    logging_steps=100,
)

# === Train with Trainer & upload ===
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)
trainer.train()
trainer.push_to_hub()
tokenizer.push_to_hub(HF_REPO)
print("Upload complete!")
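
# --- Optional sanity check (a minimal sketch, not part of the original script) ---
# Reload the pushed checkpoint from the Hub and run a short generation to confirm the
# upload worked. Assumes the push above succeeded and HF_REPO is accessible with HF_TOKEN;
# the prompt below is only illustrative.
check_tokenizer = AutoTokenizer.from_pretrained(HF_REPO, token=HF_TOKEN)
check_model = AutoModelForCausalLM.from_pretrained(HF_REPO, token=HF_TOKEN)
prompt = "Translate into Japanese: Hello, world."
inputs = check_tokenizer(prompt, return_tensors="pt")
outputs = check_model.generate(**inputs, max_new_tokens=32)
print(check_tokenizer.decode(outputs[0], skip_special_tokens=True))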