6023oji committed on
Commit
9a2142b
·
1 Parent(s): d2e814a

Upload krfinbert_esg (1).py

Browse files
Files changed (1) hide show
  1. krfinbert_esg (1).py +353 -0
krfinbert_esg (1).py ADDED
@@ -0,0 +1,353 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """krfinbert_esg.ipynb
3
+
4
+ Automatically generated by Colaboratory.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1_cVBwxsa7LcHzjzCcS4l1ds0wxNPQrjm
8
+ """
9
+
10
+ from google.colab import drive
11
+ drive.mount('/content/drive')
12
+
13
+ import pandas as pd
14
+ import numpy as np
15
+
16
+ import warnings
17
+ warnings.filterwarnings('ignore') # to avoid warnings
18
+
19
+ import random
20
+ import pandas as pd
21
+ from tqdm import tqdm
22
+ import seaborn as sns
23
+ import matplotlib.pyplot as plt
24
+
25
+ """
26
+ Sklearn Libraries
27
+ """
28
+ from sklearn.metrics import f1_score
29
+ from sklearn.model_selection import train_test_split
30
+
31
+ """
32
+ Transformer Libraries
33
+ """
34
+ !pip install transformers
35
+ from transformers import BertTokenizer, AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup
36
+
37
+ """
38
+ Pytorch Libraries
39
+ """
40
+ import torch
41
+ from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
42
+
43
# Read the ESG corpus from the mounted Drive folder (UTF-8 encoded CSV).
esg_data = pd.read_csv(
    "/content/drive/MyDrive/kpmg_personal/concat.csv", encoding='utf-8'
)

# Notebook leftover: a bare expression that only renders the frame in Colab.
esg_data

# Bar chart of the number of rows per value of the 'category' column.
plt.figure(figsize=(15, 8))

sns.set(style='darkgrid')

# Larger fonts so the axis labels stay readable on the wide figure.
sns.set(font_scale=1.3)
sns.countplot(data=esg_data, x='category')
plt.title('ESG Category Distribution')
plt.xlabel('E,S,G,N')
plt.ylabel('Number of Contents')
58
+
59
def show_random_contents(total_number, df):
    """Print a random sample of rows from ``df``.

    Args:
        total_number: Number of rows to draw with ``DataFrame.sample``.
        df: DataFrame with at least the columns ``category`` and
            ``contents``.

    Each sampled row is printed with its index label, its category and its
    contents, followed by an empty line.
    """
    sampled = df.sample(total_number)

    # Read the values from the sampled rows themselves. ``sample().index``
    # holds index *labels*, so looking them up positionally with ``iloc``
    # would print the wrong row (or raise) for any non-default index.
    rows = zip(sampled.index, sampled["category"], sampled["contents"])
    for row_label, category, contents in rows:
        print("Contents #°{}".format(row_label))
        print(" - Category: {}".format(category))
        print(" - Contents: {}".format(contents))
        print("")
70
+
71
+ # Show 5 random headlines
72
+ show_random_contents(5, esg_data)
73
+
74
def encode_categories_values(df):
    """Add an integer ``label`` column derived from ``df['category']``.

    Ids are assigned in order of first appearance of each distinct category
    (the order returned by ``Series.unique``).

    Args:
        df: DataFrame with a ``category`` column. It is modified in place:
            a ``label`` column is added or overwritten.

    Returns:
        A ``(df, category_dict)`` tuple, where ``df`` is the same object that
        was passed in and ``category_dict`` maps each category value to its
        integer id.
    """
    category_dict = {
        category: index
        for index, category in enumerate(df["category"].unique())
    }

    # ``map`` does a plain per-value lookup and yields an integer column
    # directly. ``replace`` with a dict depends on implicit object->int
    # downcasting, which newer pandas versions deprecate.
    df["label"] = df["category"].map(category_dict)

    return df, category_dict
86
+
87
# Attach integer labels to the data set and keep the category -> id mapping.
esg_data, category_dict = encode_categories_values(esg_data)

# Stratified 85/15 split over the row index so both parts keep the same
# label proportions; the fixed random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    esg_data.index.values,
    esg_data.label.values,
    test_size=0.15,
    random_state=2022,
    stratify=esg_data.label.values,
)

# Tag every row with the split it landed in.
esg_data.loc[X_train, 'data_type'] = 'train'
esg_data.loc[X_val, 'data_type'] = 'val'

# Visualize the number of occurrences of each category in each split.
esg_data.groupby(['category', 'label', 'data_type']).count()

# Load the tokenizer that ships with the KR-FinBert-SC checkpoint.
finbert_tokenizer = BertTokenizer.from_pretrained(
    'snunlp/KR-FinBert-SC', do_lower_case=True
)
105
+
106
def get_contents_len(df):
    """Return the tokenized length of every entry in ``df.contents``.

    Each text is encoded with the module-level ``finbert_tokenizer`` with
    special tokens included, and the number of resulting token ids is
    recorded. Progress is shown with a tqdm bar, bracketed by a start and an
    end message on stdout.

    Returns:
        A list of ints, one per row, in row order.
    """
    print("Encoding in progress...")
    token_counts = [
        len(finbert_tokenizer.encode(text, add_special_tokens=True))
        for text in tqdm(df.contents)
    ]
    print("End of Task.")

    return token_counts
120
+
121
def show_contents_distribution(sequence_lengths, figsize = (15,8)):
    """Plot a histogram of token counts and report how many exceed 512.

    Args:
        sequence_lengths: Iterable of integer sequence lengths.
        figsize: Matplotlib figure size passed to ``plt.figure``.

    Prints the maximum length and the percentage of entries longer than 512
    tokens, then draws the histogram. If ``sequence_lengths`` is empty, a
    short message is printed and nothing is plotted.
    """
    sequence_lengths = list(sequence_lengths)
    if not sequence_lengths:
        # Nothing to summarise; avoids max() / division on an empty input.
        print("No contents to plot.")
        return

    # Share of entries longer than 512 tokens.
    over_limit = sum(1 for length in sequence_lengths if length > 512)
    percent = over_limit / len(sequence_lengths) * 100

    print("Maximum Sequence Length is {}".format(max(sequence_lengths)))
    print("Contents longer than 512 tokens: {:.2f}%".format(percent))

    # Configure the plot size
    plt.figure(figsize = figsize)

    sns.set(style='darkgrid')

    # Larger fonts so the labels stay readable on the wide figure.
    sns.set(font_scale=1.3)

    # histplot is the maintained replacement for the deprecated distplot.
    sns.histplot(sequence_lengths, kde = False)
    plt.title('Contents Lengths Distribution')
    plt.xlabel('Contents Length')
    plt.ylabel('Number of Contents')
142
+
143
show_contents_distribution(get_contents_len(esg_data))


def _encode_contents(data_type):
    """Tokenize the ``contents`` of every row whose ``data_type`` matches.

    Every sequence is padded to exactly 200 tokens, and longer ones are
    truncated to 200, so the result can be returned as rectangular PyTorch
    tensors.
    """
    texts = esg_data[esg_data.data_type == data_type].contents.values.tolist()
    return finbert_tokenizer.batch_encode_plus(
        texts,
        return_tensors='pt',
        add_special_tokens=True,
        return_attention_mask=True,
        # Explicit replacement for the deprecated ``pad_to_max_length=True``.
        padding='max_length',
        # Make truncation explicit instead of relying on the tokenizer's
        # fallback when only ``max_length`` is given.
        truncation=True,
        # NOTE(review): 200 is described by the original author as the
        # maximum observed length; confirm against the length histogram.
        max_length=200,
    )


# Encode the Training and Validation Data
encoded_data_train = _encode_contents('train')
encoded_data_val = _encode_contents('val')
163
+
164
+
165
# Pull the tensors out of the tokenizer output and pair them with labels.
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(
    esg_data.loc[esg_data.data_type == 'train', 'label'].values
)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
sentiments_val = torch.tensor(
    esg_data.loc[esg_data.data_type == 'val', 'label'].values
)

# Each dataset item is an (input_ids, attention_mask, label) triple.
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, sentiments_val)

# Start from the KR-FinBert-SC checkpoint with one output per category.
# ignore_mismatched_sizes lets the classification head differ in size from
# the one stored in the checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    "snunlp/KR-FinBert-SC",
    num_labels=len(category_dict),
    output_attentions=False,
    output_hidden_states=False,
    ignore_mismatched_sizes=True,
)

batch_size = 5

# Shuffle the training batches; keep validation in a fixed order.
dataloader_train = DataLoader(
    dataset_train,
    batch_size=batch_size,
    sampler=RandomSampler(dataset_train),
)

dataloader_validation = DataLoader(
    dataset_val,
    batch_size=batch_size,
    sampler=SequentialSampler(dataset_val),
)

optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)

epochs = 5

# Linear decay of the learning rate over all optimizer steps, no warm-up.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(dataloader_train) * epochs,
)
202
+
203
def f1_score_func(preds, labels):
    """Return the weighted F1 score of ``preds`` against ``labels``.

    Args:
        preds: 2-D array of per-class scores; the predicted class of each
            row is the argmax along axis 1.
        labels: Array of true class ids; it is flattened before scoring.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    return f1_score(labels.flatten(), predicted_classes, average='weighted')
207
+
208
def accuracy_per_class(preds, labels):
    """Print, for every class present in ``labels``, correct/total counts.

    Args:
        preds: 2-D array of per-class scores; the predicted class of each
            row is the argmax along axis 1.
        labels: Array of true class ids.

    Class ids are translated back to category names through the module-level
    ``category_dict``.
    """
    id_to_category = {idx: name for name, idx in category_dict.items()}

    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()

    for class_id in np.unique(actual):
        in_class = actual == class_id
        n_total = len(actual[in_class])
        n_correct = int(np.count_nonzero(predicted[in_class] == class_id))
        print(f'Class: {id_to_category[class_id]}')
        print(f'Accuracy: {n_correct}/{n_total}\n')
219
+
220
# Seed Python, NumPy and PyTorch (CPU and all CUDA devices) with one value.
seed_val = 2022
for seed_fn in (random.seed,
                np.random.seed,
                torch.manual_seed,
                torch.cuda.manual_seed_all):
    seed_fn(seed_val)

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
228
+
229
+
230
def evaluate(dataloader_val):
    """Run the module-level ``model`` over ``dataloader_val`` without gradients.

    Args:
        dataloader_val: Iterable of batches, each indexable as
            (input_ids, attention_mask, labels) tensors.

    Returns:
        A ``(average_loss, predictions, true_vals)`` tuple: the mean of the
        per-batch losses, the concatenated logits as a NumPy array, and the
        concatenated true labels as a NumPy array.
    """
    model.eval()

    running_loss = 0
    logit_chunks, label_chunks = [], []

    for batch in dataloader_val:
        # Move every tensor of the batch onto the model's device.
        on_device = tuple(tensor.to(device) for tensor in batch)

        with torch.no_grad():
            outputs = model(input_ids=on_device[0],
                            attention_mask=on_device[1],
                            labels=on_device[2])

        # With labels supplied, the first output is the loss, the second the logits.
        running_loss += outputs[0].item()

        logit_chunks.append(outputs[1].detach().cpu().numpy())
        label_chunks.append(on_device[2].cpu().numpy())

    return (
        running_loss / len(dataloader_val),
        np.concatenate(logit_chunks, axis=0),
        np.concatenate(label_chunks, axis=0),
    )
264
+
265
+
266
# Fine-tune for `epochs` passes over the training data. After every epoch the
# weights are checkpointed and the validation loss / weighted F1 are reported.
for epoch in tqdm(range(1, epochs+1)):

    model.train()

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        # Clear gradients left over from the previous step.
        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2],
                  }

        outputs = model(**inputs)

        # With labels supplied, the first output is the loss.
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        # Clip the gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Show the batch loss as-is. `batch` is the 3-tuple
        # (input_ids, attention_mask, labels), so dividing by len(batch)
        # would divide by 3 rather than by the number of samples.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item())})

    # One checkpoint per epoch, written to the current working directory.
    torch.save(model.state_dict(), f'finetuned_finBERT_epoch_{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')
308
+
309
# Rebuild the classifier architecture, then overwrite its weights with a
# saved checkpoint and report per-class accuracy on the validation split.
model = AutoModelForSequenceClassification.from_pretrained(
    "snunlp/KR-FinBert-SC",
    num_labels=len(category_dict),
    output_attentions=False,
    output_hidden_states=False,
    ignore_mismatched_sizes=True,
)

model.to(device)

# NOTE(review): the epoch-4 checkpoint is hard-coded although the loop above
# saves one file per epoch for 5 epochs; confirm this is the intended one.
checkpoint_state = torch.load('finetuned_finBERT_epoch_4.model',
                              map_location=torch.device('cpu'))
model.load_state_dict(checkpoint_state)

_, predictions, true_vals = evaluate(dataloader_validation)

accuracy_per_class(predictions, true_vals)
323
+
324
+ # max_length = 200
325
+
326
+
327
+
328
+
329
+
330
+
331
+
332
+
333
+
334
+
335
+
336
+
337
+
338
+
339
+
340
+
341
+
342
+
343
+
344
+
345
+
346
+
347
+
348
+
349
+
350
+
351
+
352
+
353
+