A-Funakoshi committed
Commit 0b6d57c · 1 Parent(s): f7cd8a6

Upload 9 files

config.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "_name_or_path": "cl-tohoku/bert-base-japanese-whole-word-masking",
+   "architectures": [
+     "BertForSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "id2label": {
+     "0": "positive",
+     "1": "neutral",
+     "2": "negative"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "label2id": {
+     "negative": 2,
+     "neutral": 1,
+     "positive": 0
+   },
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "problem_type": "single_label_classification",
+   "tokenizer_class": "BertJapaneseTokenizer",
+   "torch_dtype": "float32",
+   "transformers_version": "4.33.2",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 32000
+ }
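
The id2label / label2id maps in this config are what turn raw logits into the positive / neutral / negative label strings at inference time. A minimal sketch of that round trip (not part of the commit), assuming the files from this commit sit in the local save directory used by the training script below, and that the fugashi and ipadic packages are installed for BertJapaneseTokenizer:

```python
from transformers import pipeline

# Assumed local directory holding config.json, pytorch_model.bin and the tokenizer files
classifier = pipeline(
    "text-classification",
    model="bert-finetuned-multilingual-sentiments-base",
)
print(classifier("このレストランはとても美味しかったです。"))
# -> [{'label': 'positive', 'score': ...}] ; the label string comes from id2label above
```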
finetuning_multilingual_01_base.py ADDED
@@ -0,0 +1,186 @@
+ # %%
+ import torch
+ # Check whether a GPU is available
+ if torch.cuda.is_available():
+     print('gpu is available')
+ else:
+     raise Exception('gpu is NOT available')
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ device
+
+ # %%
+ from datasets import load_dataset, DatasetDict
+ from transformers import AutoTokenizer
+ from transformers import AutoModelForSequenceClassification
+ from transformers import TrainingArguments
+ from transformers import Trainer
+ from sklearn.metrics import accuracy_score, f1_score
+ import numpy as np
+ import pandas as pd
+ import torch
+ import random
+
+ # %%
+ from transformers.trainer_utils import set_seed
+
+ # Fix the random seed to 42
+ set_seed(42)
+
+ # %%
+ from pprint import pprint
+ from datasets import load_dataset
+
+ # Load the data from the tyqiangz/multilingual-sentiments
+ # repository on the Hugging Face Hub
+ train_dataset = load_dataset("tyqiangz/multilingual-sentiments", "japanese", split="train")
+ valid_dataset = load_dataset("tyqiangz/multilingual-sentiments", "japanese", split="validation")
+ # Display them in a readable form with pprint
+ pprint(train_dataset)
+ pprint(valid_dataset)
+
+ # %%
+ train_dataset[0]
+
+ # %%
+ # Load the tokenizer
+ model_name = "cl-tohoku/bert-base-japanese-whole-word-masking"
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+ # %%
+ # Tokenization function:
+ # tokenize the text column and
+ # copy the label column into labels
+ def preprocess_text(batch):
+     encoded_batch = tokenizer(batch['text'], max_length=512, truncation=True)
+     encoded_batch['labels'] = batch['label']
+     return encoded_batch
+
+ # %%
+ # Tokenize the datasets (drop the original columns)
+ encoded_train_dataset = train_dataset.map(
+     preprocess_text,
+     remove_columns=train_dataset.column_names,
+ )
+ encoded_valid_dataset = valid_dataset.map(
+     preprocess_text,
+     remove_columns=valid_dataset.column_names,
+ )
+
+ # %%
+ # Mini-batch construction
+ from transformers import DataCollatorWithPadding
+
+ data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+
+ # %%
+ # Prepare the model
+ from transformers import AutoModelForSequenceClassification
+
+ class_label = train_dataset.features["label"]
+ label2id = {label: id for id, label in enumerate(class_label.names)}
+ id2label = {id: label for id, label in enumerate(class_label.names)}
+ model = AutoModelForSequenceClassification.from_pretrained(
+     model_name,
+     num_labels=class_label.num_classes,
+     label2id=label2id,  # mapping from label names to IDs
+     id2label=id2label,  # mapping from IDs to label names
+ )
+ print(type(model).__name__)
+
+ # %%
+ # Define the evaluation metrics
+ def compute_metrics(pred):
+     labels = pred.label_ids
+     preds = pred.predictions.argmax(-1)
+     f1 = f1_score(labels, preds, average="weighted")
+     acc = accuracy_score(labels, preds)
+     return {"accuracy": acc, "f1": f1}
+
+ # %%
+ # epochs: 100, with early stopping
+ # Run training
+ from transformers import TrainingArguments
+ # Save directory
+ save_dir = 'bert-finetuned-multilingual-sentiments-base'
+
+ training_args = TrainingArguments(
+     output_dir=save_dir,  # folder for results
+     per_device_train_batch_size=32,  # training batch size
+     per_device_eval_batch_size=32,  # evaluation batch size
+     learning_rate=2e-5,  # learning rate
+     lr_scheduler_type="constant",  # learning-rate scheduler type
+     warmup_ratio=0.1,  # length of the learning-rate warmup
+     num_train_epochs=100,  # number of epochs
+     save_strategy="epoch",  # when to save checkpoints
+     logging_strategy="epoch",  # when to log
+     evaluation_strategy="epoch",  # when to evaluate on the validation set
+     load_best_model_at_end=True,  # reload the best validation-set model after training
+     metric_for_best_model="eval_loss",  # metric used to select the best model
+     greater_is_better=False,  # lower eval_loss is better
+     fp16=True,  # enable automatic mixed-precision training
+ )
+
+ # %%
+ from transformers import Trainer
+ from transformers import EarlyStoppingCallback
+
+ trainer = Trainer(
+     model=model,
+     train_dataset=encoded_train_dataset,
+     eval_dataset=encoded_valid_dataset,
+     data_collator=data_collator,
+     args=training_args,
+     compute_metrics=compute_metrics,
+     callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
+ )
+ trainer.train()
+
+ # %%
+ # Save the model
+ trainer.save_model(save_dir)
+ tokenizer.save_pretrained(save_dir)
+ # Save the training history
+ history_df = pd.DataFrame(trainer.state.log_history)
+ history_df.to_csv('base_line/mullingual_baseline_history.csv')
+
+ # %%
+ import matplotlib.pyplot as plt
+
+ def show_graph(df, suptitle, output='output.png'):
+     suptitle_size = 23
+     graph_title_size = 20
+     legend_size = 18
+     ticks_size = 13
+     # Learning curves
+     plt.figure(figsize=(20, 5))
+     plt.suptitle(suptitle, fontsize=suptitle_size)
+     # Train Loss
+     plt.subplot(131)
+     plt.title('Train Loss', fontsize=graph_title_size)
+     plt.plot(df['loss'].dropna(), label='train')
+     plt.legend(fontsize=legend_size)
+     plt.yticks(fontsize=ticks_size)
+     # Validation Loss
+     plt.subplot(132)
+     plt.title('Val Loss', fontsize=graph_title_size)
+     y = df['eval_loss'].dropna().values
+     x = np.arange(len(y)).reshape(-1, 1)
+     plt.plot(y, color='tab:orange', label='val')
+     plt.legend(fontsize=legend_size)
+     plt.yticks(fontsize=ticks_size)
+     # Accuracy/F1
+     plt.subplot(133)
+     plt.title('eval Accuracy/F1', fontsize=graph_title_size)
+     plt.plot(df['eval_accuracy'].dropna(), label='accuracy')
+     plt.plot(df['eval_f1'].dropna(), label='F1')
+     plt.legend(fontsize=legend_size)
+     plt.yticks(fontsize=ticks_size)
+     plt.tight_layout()
+     # plt.show()
+     plt.savefig(output)
+
+ # %%
+ # Plot the results
+ suptitle = 'batch:32, lr:2e-5, type:constant'
+ show_graph(history_df, suptitle, 'base_line/mullingual_baseline_output.png')
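
The script above trains on the train split and early-stops on the validation split, but never scores a held-out set. A small follow-up sketch (not in the original script), assuming tyqiangz/multilingual-sentiments also ships a test split; it reuses preprocess_text and trainer defined above:

```python
# Evaluate the best checkpoint (reloaded by load_best_model_at_end) on the test split
test_dataset = load_dataset("tyqiangz/multilingual-sentiments", "japanese", split="test")
encoded_test_dataset = test_dataset.map(
    preprocess_text,
    remove_columns=test_dataset.column_names,
)
print(trainer.evaluate(encoded_test_dataset))  # reports eval_loss, eval_accuracy, eval_f1
```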
mullingual_baseline_history.csv ADDED
@@ -0,0 +1,10 @@
+ ,loss,learning_rate,epoch,step,eval_loss,eval_accuracy,eval_f1,eval_runtime,eval_samples_per_second,eval_steps_per_second,train_runtime,train_samples_per_second,train_steps_per_second,total_flos,train_loss
+ 0,0.4794,2e-05,1.0,3750,,,,,,,,,,,
+ 1,,,1.0,3750,0.41642841696739197,0.8346666666666667,0.8354288225130301,4.8505,618.487,19.379,,,,,
+ 2,0.3653,2e-05,2.0,7500,,,,,,,,,,,
+ 3,,,2.0,7500,0.4315239191055298,0.8363333333333334,0.8331066301394175,4.8532,618.146,19.369,,,,,
+ 4,0.2753,2e-05,3.0,11250,,,,,,,,,,,
+ 5,,,3.0,11250,0.4639851152896881,0.8326666666666667,0.8307922198552634,4.8504,618.504,19.38,,,,,
+ 6,0.1993,2e-05,4.0,15000,,,,,,,,,,,
+ 7,,,4.0,15000,0.5914837718009949,0.8223333333333334,0.8192726925927007,4.8531,618.167,19.369,,,,,
+ 8,,,4.0,15000,,,,,,,2569.2881,4670.555,145.955,6.146684641240013e+16,0.3298242228190104
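
Each row of this CSV is one trainer.state.log_history entry: per-epoch training-loss rows and evaluation rows alternate, and the final row carries the run totals. A small sketch (not part of the commit) for pulling the best epoch out of this log, assuming the CSV is read from the current directory:

```python
import pandas as pd

history = pd.read_csv("mullingual_baseline_history.csv", index_col=0)
eval_rows = history.dropna(subset=["eval_loss"])       # keep only evaluation rows
best = eval_rows.loc[eval_rows["eval_loss"].idxmin()]  # lowest validation loss
print(best[["epoch", "eval_loss", "eval_accuracy", "eval_f1"]])
# With the values above: epoch 1.0, eval_loss ~0.416, accuracy ~0.835, F1 ~0.835
```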
mullingual_baseline_output.png ADDED
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:093bf6114a2b698ac241356bf234a1f418db0c98b78472b00ff63ba536ca7a73
+ size 442545135
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,19 @@
+ {
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "[CLS]",
+   "do_lower_case": false,
+   "do_subword_tokenize": true,
+   "do_word_tokenize": true,
+   "jumanpp_kwargs": null,
+   "mask_token": "[MASK]",
+   "mecab_kwargs": null,
+   "model_max_length": 512,
+   "never_split": null,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "subword_tokenizer_type": "wordpiece",
+   "sudachi_kwargs": null,
+   "tokenizer_class": "BertJapaneseTokenizer",
+   "unk_token": "[UNK]",
+   "word_tokenizer_type": "mecab"
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d634e3e980131e9cee849ecf9be40761538af01fbac073173af599a6d198b9d9
+ size 4079
vocab.txt ADDED
The diff for this file is too large to render. See raw diff