File size: 5,728 Bytes
77aed32 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 |
# %%
import torch
# GPUが使用可能か判断
if torch.cuda.is_available():
print('gpu is available')
else:
raise Exception('gpu is NOT available')
# %%
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import pandas as pd
import torch
import random
# %%
from pprint import pprint
from datasets import load_dataset
# Hugging Face Hub上のllm-book/wrime-sentimentのリポジトリから
# データを読み込む
train_dataset = load_dataset("llm-book/wrime-sentiment", split="train", remove_neutral=False)
valid_dataset = load_dataset("llm-book/wrime-sentiment", split="validation", remove_neutral=False)
# pprintで見やすく表示する
pprint(train_dataset)
pprint(valid_dataset)
# %%
# トークナイザのロード
model_name = "cl-tohoku/bert-base-japanese-whole-word-masking"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# %%
# トークナイズ処理
def preprocess_text(batch):
encoded_batch = tokenizer(batch['sentence'], max_length=512)
encoded_batch['labels'] = batch['label']
return encoded_batch
encoded_train_dataset = train_dataset.map(
preprocess_text,
remove_columns=train_dataset.column_names,
)
encoded_valid_dataset = valid_dataset.map(
preprocess_text,
remove_columns=valid_dataset.column_names,
)
# ミニバッチ構築
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# %%
# モデルの準備
from transformers import AutoModelForSequenceClassification
class_label = train_dataset.features["label"]
label2id = {label: id for id, label in enumerate(class_label.names)}
id2label = {id: label for id, label in enumerate(class_label.names)}
model = AutoModelForSequenceClassification.from_pretrained(
model_name,
num_labels=class_label.num_classes,
label2id=label2id, # ラベル名からIDへの対応を指定
id2label=id2label, # IDからラベル名への対応を指定
)
print(type(model).__name__)
# %%
# 訓練の準備
from transformers import TrainingArguments
# 保存ディレクトリ
save_dir = f'bert-finetuned-wrime-base'
training_args = TrainingArguments(
output_dir=save_dir, # 結果の保存フォルダ
per_device_train_batch_size=32, # 訓練時のバッチサイズ
per_device_eval_batch_size=32, # 評価時のバッチサイズ
learning_rate=2e-5, # 学習率
lr_scheduler_type="constant", # 学習率スケジューラの種類
warmup_ratio=0.1, # 学習率のウォームアップの長さを指定
num_train_epochs=100, # エポック数
save_strategy="epoch", # チェックポイントの保存タイミング
logging_strategy="epoch", # ロギングのタイミング
evaluation_strategy="epoch", # 検証セットによる評価のタイミング
load_best_model_at_end=True, # 訓練後に開発セットで最良のモデルをロード
metric_for_best_model="accuracy", # 最良のモデルを決定する評価指標
fp16=True, # 自動混合精度演算の有効化
)
# %%
# メトリクスの定義
def compute_metrics(pred):
labels = pred.label_ids
preds = pred.predictions.argmax(-1)
f1 = f1_score(labels, preds, average="weighted")
acc = accuracy_score(labels, preds)
return {"accuracy": acc, "f1": f1}
# %%
# 訓練の実行
from transformers import Trainer
from transformers import EarlyStoppingCallback
trainer = Trainer(
model=model,
train_dataset=encoded_train_dataset,
eval_dataset=encoded_valid_dataset,
data_collator=data_collator,
args=training_args,
compute_metrics=compute_metrics,
callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
trainer.train()
# %%
# モデルの保存
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)
# 履歴の保存
history_df = pd.DataFrame(trainer.state.log_history)
history_df.to_csv('base_line/wrime_baseline_history.csv')
# %%
import matplotlib.pyplot as plt
def show_graph(df, suptitle, output='output.png'):
suptitle_size = 23
graph_title_size = 20
legend_size = 18
ticks_size = 13
# 学習曲線
fig = plt.figure(figsize=(20, 5))
plt.suptitle(suptitle, fontsize=suptitle_size)
# Train Loss
plt.subplot(131)
plt.title('Train Loss', fontsize=graph_title_size)
plt.plot(df['loss'].dropna(), label='train')
plt.legend(fontsize=legend_size)
plt.yticks(fontsize=ticks_size)
# Validation Loss
plt.subplot(132)
# reg_str = f'$y={round(regression.coef_[0],5)}*x+{round(regression.intercept_,3)}$'
plt.title(f'Val Loss', fontsize=graph_title_size)
y = df['eval_loss'].dropna().values
x = np.arange(len(y)).reshape(-1, 1)
# pred = regression.coef_ * x.ravel() + regression.intercept_ # 線形回帰直線
plt.plot(y, color='tab:orange', label='val')
# plt.plot(pred, color='green', label='pred')
plt.legend(fontsize=legend_size)
# plt.xlabel(reg_str, fontsize=ticks_size)
plt.yticks(fontsize=ticks_size)
# Accuracy/F1
plt.subplot(133)
plt.title('eval Accuracy/F1', fontsize=graph_title_size)
plt.plot(df['eval_accuracy'].dropna(), label='accuracy')
plt.plot(df['eval_f1'].dropna(), label='F1')
plt.legend(fontsize=legend_size)
plt.yticks(fontsize=ticks_size)
plt.tight_layout()
# plt.show()
plt.savefig(output)
# %%
# 結果を表示
suptitle = 'batch:32, lr:2e-5, type:constant'
show_graph(history_df, suptitle, 'base_line/wrime_baseline_output.png')
|