ducdatit2002 committed
Commit e09333c · verified · 1 Parent(s): 8fe0454

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. .gitattributes +10 -0
  2. .vector_cache/word2vec_vi_syllables_100dims.txt.pt +3 -0
  3. abbreviations.json +363 -0
  4. bilstm_best.keras +3 -0
  5. bilstm_emotion_model/bilstm_model.keras +3 -0
  6. bilstm_emotion_model/classification_report.txt +33 -0
  7. bilstm_emotion_model/label_mapping.json +9 -0
  8. bilstm_emotion_model/vocabulary.json +0 -0
  9. cnn_lstm_best.keras +3 -0
  10. cnn_lstm_emotion_model/classification_report.txt +33 -0
  11. cnn_lstm_emotion_model/cnn_lstm_model.keras +3 -0
  12. cnn_lstm_model.keras +3 -0
  13. flagged/log.csv +2 -0
  14. logs/events.out.tfevents.1736834439.ai1gpu-virtual-machine.52042.0 +3 -0
  15. logs/events.out.tfevents.1736835355.ai1gpu-virtual-machine.52042.1 +3 -0
  16. logs/events.out.tfevents.1736835689.ai1gpu-virtual-machine.52955.0 +3 -0
  17. logs/events.out.tfevents.1736835769.ai1gpu-virtual-machine.53242.0 +3 -0
  18. logs/events.out.tfevents.1736835850.ai1gpu-virtual-machine.53528.0 +3 -0
  19. logs/events.out.tfevents.1736835995.ai1gpu-virtual-machine.53982.0 +3 -0
  20. logs/events.out.tfevents.1736836066.ai1gpu-virtual-machine.54029.0 +3 -0
  21. logs/events.out.tfevents.1736836768.ai1gpu-virtual-machine.55099.0 +3 -0
  22. logs/events.out.tfevents.1736841979.ai1gpu-virtual-machine.55099.1 +3 -0
  23. logs/events.out.tfevents.1736844609.ai1gpu-virtual-machine.66743.0 +3 -0
  24. logs/events.out.tfevents.1736852947.ai1gpu-virtual-machine.76812.0 +3 -0
  25. logs/events.out.tfevents.1736858105.ai1gpu-virtual-machine.76812.1 +3 -0
  26. logs/events.out.tfevents.1736858545.ai1gpu-virtual-machine.87908.0 +3 -0
  27. logs/events.out.tfevents.1736858698.ai1gpu-virtual-machine.88011.0 +3 -0
  28. logs/events.out.tfevents.1736864229.ai1gpu-virtual-machine.88011.1 +3 -0
  29. logs/events.out.tfevents.1736907563.ai1gpu-virtual-machine.145430.0 +3 -0
  30. logs/events.out.tfevents.1736908155.ai1gpu-virtual-machine.146675.0 +3 -0
  31. logs/events.out.tfevents.1736911863.ai1gpu-virtual-machine.152249.0 +3 -0
  32. logs/events.out.tfevents.1736916063.ai1gpu-virtual-machine.152249.1 +3 -0
  33. main_BILSTM.py +573 -0
  34. main_RNN_CNN-LSTM.py +738 -0
  35. main_lstm.py +289 -0
  36. main_phobert.py +349 -0
  37. main_svm.py +261 -0
  38. main_v1.py +494 -0
  39. phobert_emotion_model/classification_report.txt +23 -0
  40. phobert_emotion_model/confusion_matrix.png +0 -0
  41. phobert_emotion_model/id2label.json +9 -0
  42. phobert_emotion_model/phobert_emotion_model/added_tokens.json +3 -0
  43. phobert_emotion_model/phobert_emotion_model/bpe.codes +0 -0
  44. phobert_emotion_model/phobert_emotion_model/config.json +48 -0
  45. phobert_emotion_model/phobert_emotion_model/model.safetensors +3 -0
  46. phobert_emotion_model/phobert_emotion_model/special_tokens_map.json +9 -0
  47. phobert_emotion_model/phobert_emotion_model/tokenizer_config.json +54 -0
  48. phobert_emotion_model/phobert_emotion_model/vocab.txt +0 -0
  49. phobert_results/checkpoint-10410/added_tokens.json +3 -0
  50. phobert_results/checkpoint-10410/bpe.codes +0 -0
.gitattributes CHANGED
@@ -33,3 +33,13 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ bilstm_best.keras filter=lfs diff=lfs merge=lfs -text
+ bilstm_emotion_model/bilstm_model.keras filter=lfs diff=lfs merge=lfs -text
+ cnn_lstm_best.keras filter=lfs diff=lfs merge=lfs -text
+ cnn_lstm_emotion_model/cnn_lstm_model.keras filter=lfs diff=lfs merge=lfs -text
+ cnn_lstm_model.keras filter=lfs diff=lfs merge=lfs -text
+ processed.xlsx filter=lfs diff=lfs merge=lfs -text
+ processed_phobert.xlsx filter=lfs diff=lfs merge=lfs -text
+ processed_svm.xlsx filter=lfs diff=lfs merge=lfs -text
+ train.xlsx filter=lfs diff=lfs merge=lfs -text
+ word2vec_vi_syllables_100dims.txt filter=lfs diff=lfs merge=lfs -text
.vector_cache/word2vec_vi_syllables_100dims.txt.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3390520329ebe14cddb38384d80bd8b6e4948e023977ba5dbe32235b4a3503e7
+ size 418631353
abbreviations.json ADDED
@@ -0,0 +1,363 @@
+ {
+     "ad": [
+         "admin",
+         "quản trị viên"
+     ],
+     "bb": [
+         "bye bye",
+         "tạm biệt"
+     ],
+     "bl": [
+         "bình luận"
+     ],
+     "bth": [
+         "bình thường"
+     ],
+     "bmn": [
+         "bạn muốn"
+     ],
+     "cxk": [
+         "cũng không"
+     ],
+     "đm": [
+         "đ** m**"
+     ],
+     "gg": [
+         "good game",
+         "Google"
+     ],
+     "hc": [
+         "học"
+     ],
+     "kq": [
+         "kết quả"
+     ],
+     "kb": [
+         "kết bạn"
+     ],
+     "khá": [
+         "khá là"
+     ],
+     "lq": [
+         "liên quan"
+     ],
+     "lmh": [
+         "làm gì thế"
+     ],
+     "ng": [
+         "người"
+     ],
+     "nsao": [
+         "nói sao"
+     ],
+     "nv": [
+         "nhân vật"
+     ],
+     "nvay": [
+         "như vậy"
+     ],
+     "nxk": [
+         "nói không"
+     ],
+     "ob": [
+         "ông bà"
+     ],
+     "pc": [
+         "phải không"
+     ],
+     "ph": [
+         "phim"
+     ],
+     "ql": [
+         "quản lý"
+     ],
+     "qt": [
+         "quá trời"
+     ],
+     "sdt": [
+         "số điện thoại"
+     ],
+     "sk": [
+         "sức khỏe"
+     ],
+     "tc": [
+         "tài chính"
+     ],
+     "td": [
+         "tâm điểm",
+         "tập đoàn"
+     ],
+     "th": [
+         "thôi"
+     ],
+     "tl": [
+         "trả lời"
+     ],
+     "ty": [
+         "tình yêu"
+     ],
+     "up": [
+         "cập nhật",
+         "update"
+     ],
+     "xđ": [
+         "xác định"
+     ],
+     "zui": [
+         "vui"
+     ],
+     "zời": [
+         "trời"
+     ],
+     "hdsd": [
+         "hướng dẫn sử dụng"
+     ],
+     "bbq": [
+         "barbecue",
+         "tiệc nướng"
+     ],
+     "cx": [
+         "chắc chắn",
+         "cũng"
+     ],
+     "vkc": [
+         "vãi kinh"
+     ],
+     "kt": [
+         "kiểm tra",
+         "không thèm"
+     ],
+     "tks": [
+         "thanks",
+         "cảm ơn"
+     ],
+     "đg": [
+         "đang"
+     ],
+     "qa": [
+         "quá"
+     ],
+     "ht": [
+         "học tập",
+         "hoàn tất"
+     ],
+     "clgt": [
+         "cái l** gì thế"
+     ],
+     "pls": [
+         "please",
+         "làm ơn"
+     ],
+     "qtqđ": [
+         "quá trời quá đất"
+     ],
+     "klq": [
+         "không liên quan"
+     ],
+     "mn": [
+         "mọi người"
+     ],
+     "vc": [
+         "vãi chưởng",
+         "vợ chồng"
+     ],
+     "vch": [
+         "vãi chưởng"
+     ],
+     "cđ": [
+         "cuộc đời"
+     ],
+     "đhs": [
+         "đ** hiểu sao"
+     ],
+     "ib": [
+         "inbox",
+         "nhắn tin"
+     ],
+     "ttyl": [
+         "talk to you later",
+         "nói chuyện sau"
+     ],
+     "stt": [
+         "status",
+         "trạng thái"
+     ],
+     "sr": [
+         "sorry",
+         "xin lỗi"
+     ],
+     "bn": [
+         "bao nhiêu",
+         "bạn"
+     ],
+     "ckmnl": [
+         "chào cả nhà mình nha l"
+     ],
+     "cr": [
+         "crush"
+     ],
+     "mng": [
+         "mọi người"
+     ],
+     "vl": [
+         "vãi l",
+         "rất"
+     ],
+     "khbn": [
+         "không biết nữa"
+     ],
+     "qtq": [
+         "quá trời quá"
+     ],
+     "sml": [
+         "sấp mặt luôn"
+     ],
+     "ns": [
+         "nói"
+     ],
+     "ăn h": [
+         "ăn hành"
+     ],
+     "qh": [
+         "quan hệ"
+     ],
+     "ăn b": [
+         "ăn bánh"
+     ],
+     "hph": [
+         "hạnh phúc"
+     ],
+     "ngta": [
+         "người ta"
+     ],
+     "mnk": [
+         "mọi người không"
+     ],
+     "ahihi": [
+         "cười đùa"
+     ],
+     "chz": [
+         "chuyện"
+     ],
+     "vđ": [
+         "vấn đề"
+     ],
+     "pp": [
+         "bye bye",
+         "tạm biệt"
+     ],
+     "dc": [
+         "được"
+     ],
+     "nt": [
+         "nhắn tin"
+     ],
+     "thik": [
+         "thích"
+     ],
+     "bt": [
+         "biết",
+         "bình thường"
+     ],
+     "kp": [
+         "không phải"
+     ],
+     "mik": [
+         "mình"
+     ],
+     "lm": [
+         "làm"
+     ],
+     "nx": [
+         "nữa"
+     ],
+     "mk": [
+         "mình",
+         "mày"
+     ],
+     "cmt": [
+         "comment",
+         "bình luận"
+     ],
+     "rep": [
+         "trả lời",
+         "phản hồi"
+     ],
+     "fa": [
+         "độc thân",
+         "forever alone"
+     ],
+     "chx": [
+         "chưa"
+     ],
+     "qlq": [
+         "quản lý quán"
+     ],
+     "a": [
+         "anh"
+     ],
+     "e": [
+         "em"
+     ],
+     "ko": [
+         "không"
+     ],
+     "kh": [
+         "không"
+     ],
+     "z": [
+         "vậy"
+     ],
+     "ny": [
+         "người yêu"
+     ],
+     "l": [
+         "là"
+     ],
+     "sn": [
+         "sinh nhật"
+     ],
+     "ckk": [
+         "chúc ngủ ngon"
+     ],
+     "hpbd": [
+         "happy birthday"
+     ],
+     "tt": [
+         "thông tin",
+         "tương tác"
+     ],
+     "ms": [
+         "mới"
+     ],
+     "k": [
+         "không"
+     ],
+     "vk": [
+         "vợ"
+     ],
+     "ck": [
+         "chồng"
+     ],
+     "j": [
+         "gì"
+     ],
+     "m": [
+         "mày"
+     ],
+     "t": [
+         "tao"
+     ],
+     "sgk": [
+         "sách giáo khoa"
+     ],
+     "cv": [
+         "công việc"
+     ],
+     "pv": [
+         "phục vụ"
+     ],
+     "dth": [
+         "dễ thương"
+     ],
+     "gato": [
+         "ghen ăn tức ở"
+     ]
+ }
bilstm_best.keras ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:085cb3b7394a3db69287c6ede56834dfc9d6e56e2f169c5a05e49ffb5267fb6a
+ size 13203552
bilstm_emotion_model/bilstm_model.keras ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:40715c89bc3bc193a953c792527898450dd10979bd0bcd62ed32b8df471fa2bb
+ size 13203552
bilstm_emotion_model/classification_report.txt ADDED
@@ -0,0 +1,33 @@
+ ========== BiLSTM Classification Report ==========
+               precision    recall  f1-score   support
+
+    Enjoyment     0.6490    0.7296    0.6869       991
+         Fear     0.5580    0.4709    0.5108       327
+      Sadness     0.4580    0.4747    0.4662       356
+        Anger     0.6587    0.6748    0.6667       369
+        Other     0.6601    0.6733    0.6667       600
+      Disgust     0.4967    0.4488    0.4715       332
+     Surprise     0.4683    0.3620    0.4083       326
+
+     accuracy                         0.5956      3301
+    macro avg     0.5641    0.5477    0.5539      3301
+ weighted avg     0.5893    0.5956    0.5905      3301
+
+ ========== Additional Metrics ==========
+ Test Loss: 2.0363
+ Test Accuracy: 0.5956
+ Precision (Macro): 0.5641
+ Precision (Weighted): 0.5893
+ Recall (Macro): 0.5477
+ Recall (Weighted): 0.5956
+ F1-Score (Macro): 0.5539
+ F1-Score (Weighted): 0.5905
+
+ ========== Confusion Matrix ==========
+ [[723  23  83   3  81  29  49]
+  [ 38 154  26  72  10  14  13]
+  [108  14 169   2  30  23  10]
+  [ 13  42  12 249  14  29  10]
+  [110   9  30   9 404  18  20]
+  [ 32  25  26  30  38 149  32]
+  [ 90   9  23  13  35  38 118]]
bilstm_emotion_model/label_mapping.json ADDED
@@ -0,0 +1,9 @@
+ {
+     "Enjoyment": 0,
+     "Fear": 1,
+     "Sadness": 2,
+     "Anger": 3,
+     "Other": 4,
+     "Disgust": 5,
+     "Surprise": 6
+ }
bilstm_emotion_model/vocabulary.json ADDED
The diff for this file is too large to render. See raw diff
 
cnn_lstm_best.keras ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e98590341cdfcc831873ee3fddc3c17f16a350085df1e302e2e22a4eda0c03ad
+ size 13535600
cnn_lstm_emotion_model/classification_report.txt ADDED
@@ -0,0 +1,33 @@
+ ========== CNN-LSTM Classification Report ==========
+               precision    recall  f1-score   support
+
+    Enjoyment     0.6977    0.7265    0.7118       991
+         Fear     0.5526    0.6269    0.5874       327
+      Sadness     0.4955    0.4663    0.4805       356
+        Anger     0.7022    0.6070    0.6512       369
+        Other     0.6740    0.7650    0.7166       600
+      Disgust     0.5194    0.4849    0.5016       332
+     Surprise     0.5020    0.3896    0.4387       326
+
+     accuracy                         0.6247      3301
+    macro avg     0.5919    0.5809    0.5840      3301
+ weighted avg     0.6204    0.6247    0.6205      3301
+
+ ========== Additional Metrics ==========
+ Test Loss: 1.6124
+ Test Accuracy: 0.6247
+ Precision (Macro): 0.5919
+ Precision (Weighted): 0.6204
+ Recall (Macro): 0.5809
+ Recall (Weighted): 0.6247
+ F1-Score (Macro): 0.5840
+ F1-Score (Weighted): 0.6205
+
+ ========== Confusion Matrix ==========
+ [[720  28  69  11  93  37  33]
+  [ 34 205  13  39  10  14  12]
+  [ 92  22 166   7  31  19  19]
+  [ 13  62  13 224  17  34   6]
+  [ 56  15  29   6 459  10  25]
+  [ 34  21  22  27  36 161  31]
+  [ 83  18  23   5  35  35 127]]
cnn_lstm_emotion_model/cnn_lstm_model.keras ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c45256b322b2360c9ba9e0c5da5fd42705f7d4395f6c1d4c6a94035e43bf05d0
+ size 13535600
cnn_lstm_model.keras ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:78c966f03f234f409270b699f84a635d98128de271d8492ee25776026312cd24
+ size 13535600
flagged/log.csv ADDED
@@ -0,0 +1,2 @@
+ Nhập câu cần phân loại cảm xúc,Kết quả dự đoán,flag,username,timestamp
+ "Hôm nay là ngày đẹp trời, tôi muốn có người yêu 😊",Disgust,,,2025-01-14 13:57:25.419643
logs/events.out.tfevents.1736834439.ai1gpu-virtual-machine.52042.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aeb26f251abccb92c7342c443b6b7c7faa2b0d0c41976053706f1c002754680a
+ size 23650
logs/events.out.tfevents.1736835355.ai1gpu-virtual-machine.52042.1 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:72bc950b1e422eb9db07cba8ad85db543521c38025579fcc2cce1dd799313233
+ size 411
logs/events.out.tfevents.1736835689.ai1gpu-virtual-machine.52955.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:744768bef1c4f7e54446c6a7925c8b770d2d5af70f6f76016fab9805a3802b6f
+ size 346
logs/events.out.tfevents.1736835769.ai1gpu-virtual-machine.53242.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0843cbd924008b8a37ef65480d32b8e16241e9e059a3784b0b8ce6d097a0d0c5
+ size 346
logs/events.out.tfevents.1736835850.ai1gpu-virtual-machine.53528.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9c3fc1113ddc32236fc69e785dfa73481178e728dd02e131bad5add13004729f
+ size 346
logs/events.out.tfevents.1736835995.ai1gpu-virtual-machine.53982.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d3de874ab406b8d42e3f02443b3ae8fce7228cffb61c6845aab400981d1263b0
+ size 5228
logs/events.out.tfevents.1736836066.ai1gpu-virtual-machine.54029.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2f927f800053a89cf20a14bf5a48c6343b31d9a49d5e670a4fc48ad7fb676874
+ size 8712
logs/events.out.tfevents.1736836768.ai1gpu-virtual-machine.55099.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f2816f60b911788c30bc43168dbbe689eee10a119e1e450767e54f521cb5f03c
+ size 81906
logs/events.out.tfevents.1736841979.ai1gpu-virtual-machine.55099.1 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:426ac92bb076d56fd8130e04ac0064542681f9ddd70fbeb64779f10b8521bb1d
+ size 417
logs/events.out.tfevents.1736844609.ai1gpu-virtual-machine.66743.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7ff2d9a713d3ea47e04c6361df3c62d551e983cd170de4a163798e58eed51111
+ size 346
logs/events.out.tfevents.1736852947.ai1gpu-virtual-machine.76812.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c2cea1a1f21eb664b3b5ae8f09ae76a38a3c7a37560a4432c805772a8afb171b
+ size 83399
logs/events.out.tfevents.1736858105.ai1gpu-virtual-machine.76812.1 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4e9817a200d06938057f30fdac643b1480e734857bb5337aa4f494b29d199245
+ size 569
logs/events.out.tfevents.1736858545.ai1gpu-virtual-machine.87908.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:48134c412b09adeae17bc7aac0295e48dce80cf72ce2a1f4109c159ee99819b1
+ size 486
logs/events.out.tfevents.1736858698.ai1gpu-virtual-machine.88011.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f0165be0e6c2731ce32b3e3cbe11b5a6997120211c06d0d04c264b5c69c8f9f2
+ size 83399
logs/events.out.tfevents.1736864229.ai1gpu-virtual-machine.88011.1 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e468b0b65d952e3df6c9eb4f53bb8a8f867532828522b13b8229b53ea2787f9a
+ size 569
logs/events.out.tfevents.1736907563.ai1gpu-virtual-machine.145430.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a67cb94b4913d02142ea7fb0bbad62005700059dc0bc6670464999d33dce0daf
+ size 7756
logs/events.out.tfevents.1736908155.ai1gpu-virtual-machine.146675.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2a882fae8ea63fa2ecf17da9e9c44bcd33568c5a998b11da0ceb6c537857223c
+ size 7367
logs/events.out.tfevents.1736911863.ai1gpu-virtual-machine.152249.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1dcadbf84e08ca0d1c9cf9f877233b857eb144b8aa92bd28291827220a0f7ea6
+ size 85351
logs/events.out.tfevents.1736916063.ai1gpu-virtual-machine.152249.1 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6eee809e23d4dd927f9c3dffb75d8184a24ae246cd0380fc93894bccc415d632
+ size 766
main_BILSTM.py ADDED
@@ -0,0 +1,573 @@
+ # thesis.py
+ # -*- coding: utf-8 -*-
+
+ import pandas as pd
+ import emoji
+ import json
+ import re
+ import numpy as np
+ from underthesea import word_tokenize
+ from tqdm import tqdm
+ import torch
+ from torchtext.vocab import Vectors
+ from sklearn.model_selection import train_test_split
+ from sklearn.utils import resample
+ from sklearn.metrics import (
+     accuracy_score,
+     classification_report,
+     precision_score,
+     recall_score,
+     f1_score,
+     confusion_matrix
+ )
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
+ from torch.utils.data import DataLoader, TensorDataset
+ import torch.nn as nn
+ import torch.optim as optim
+ import tensorflow as tf
+ import os
+
+ # ========== PREPROCESSING FUNCTIONS ==========
+
+ def preprocess_sentence(sentence, abbreviations, emoji_mapping):
+     """
+     Preprocess one sentence: lowercase, replace emojis, remove profanity
+     and special characters, normalize whitespace, expand abbreviations, etc.
+     """
+     sentence = sentence.lower()
+     sentence = replace_emojis(sentence, emoji_mapping)
+     sentence = remove_profanity(sentence)
+     sentence = remove_special_characters(sentence)
+     sentence = normalize_whitespace(sentence)
+     sentence = replace_abbreviations(sentence, abbreviations)
+     sentence = remove_repeated_characters(sentence)
+     sentence = replace_numbers(sentence)
+     sentence = tokenize_sentence(sentence)
+     return sentence
+
+ def replace_emojis(sentence, emoji_mapping):
+     processed_sentence = []
+     for char in sentence:
+         if char in emoji_mapping:
+             processed_sentence.append(emoji_mapping[char])
+         elif not emoji.is_emoji(char):
+             processed_sentence.append(char)
+     return ''.join(processed_sentence)
+
+ def remove_profanity(sentence):
+     profane_words = ["loz", "vloz", "vl", "dm", "đm", "clgt", "dmm", "cc", "vc", "đù mé", "vãi"]
+     words = sentence.split()
+     filtered_words = [word for word in words if word.lower() not in profane_words]
+     return ' '.join(filtered_words)
+
+ def remove_special_characters(sentence):
+     return re.sub(r"[\^\*@#&$%<>~{}|\\]", "", sentence)
+
+ def normalize_whitespace(sentence):
+     return ' '.join(sentence.split())
+
+ def replace_abbreviations(sentence, abbreviations):
+     words = sentence.split()
+     replaced_words = [
+         " ".join(abbreviations[word]) if word in abbreviations else word
+         for word in words
+     ]
+     return ' '.join(replaced_words)
+
+ def remove_repeated_characters(sentence):
+     # e.g. "đẹp quáaaaaaa" -> "đẹp quá"
+     return re.sub(r"(.)\1{2,}", r"\1", sentence)
+
+ def replace_numbers(sentence):
+     # Replace every number with the token [number]
+     return re.sub(r"\d+", "[number]", sentence)
+
+ def tokenize_sentence(sentence):
+     # Word segmentation with underthesea
+     return ' '.join(word_tokenize(sentence))
+
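+ # Illustrative only (not part of the uploaded file): a quick sanity check of the pipeline.
+ # Assuming abbreviations = {"ko": ["không"]}, the call
+ #     preprocess_sentence("Phim nàyyy ko hay 😊", abbreviations, emoji_mapping)
+ # lowercases, maps 😊 -> "[love]", expands "ko" -> "không", collapses "nàyyy" -> "này",
+ # and returns roughly "phim này không hay [love]" after word segmentation.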
+ # ========== VOCABULARY CLASS ==========
+
+ class Vocabulary:
+     def __init__(self):
+         self.word2id = {}
+         self.word2id['<pad>'] = 0
+         self.word2id['<unk>'] = 1
+         self.unk_id = 1
+         self.id2word = {0: '<pad>', 1: '<unk>'}
+
+     def __getitem__(self, word):
+         return self.word2id.get(word, self.unk_id)
+
+     def __contains__(self, word):
+         return word in self.word2id
+
+     def __len__(self):
+         return len(self.word2id)
+
+     def lookup_tokens(self, indices):
+         return [self.id2word[idx] for idx in indices]
+
+     def add(self, word):
+         if word not in self.word2id:
+             idx = len(self.word2id)
+             self.word2id[word] = idx
+             self.id2word[idx] = word
+
+     @staticmethod
+     def tokenize_corpus(corpus):
+         tokenized_corpus = []
+         for doc in tqdm(corpus, desc="Tokenizing Corpus"):
+             tokens = [w.replace(" ", "_") for w in word_tokenize(doc)]
+             tokenized_corpus.append(tokens)
+         return tokenized_corpus
+
+     def corpus_to_tensor(self, corpus, is_tokenized=False):
+         """
+         corpus: list of sentences (strings) or list of token lists (if is_tokenized=True)
+         return: list[list[int]], one list of token indices per sentence
+         """
+         tokenized_corpus = (
+             self.tokenize_corpus(corpus) if not is_tokenized else corpus
+         )
+         return [
+             [self[token] for token in doc]
+             for doc in tokenized_corpus
+         ]
+
+ # ========== EMOJI MAPPING ==========
+
+ emoji_mapping = {
+     "😀": "[joy]", "😃": "[joy]", "😄": "[joy]", "😁": "[joy]", "😆": "[joy]", "😅": "[joy]", "😂": "[joy]", "🤣": "[joy]",
+     "🙂": "[love]", "🙃": "[love]", "😉": "[love]", "😊": "[love]", "😇": "[love]", "🥰": "[love]", "😍": "[love]",
+     "🤩": "[love]", "😘": "[love]", "😗": "[love]", "☺": "[love]", "😚": "[love]", "😙": "[love]",
+     "😋": "[satisfaction]", "😛": "[satisfaction]", "😜": "[satisfaction]", "🤪": "[satisfaction]", "😝": "[satisfaction]",
+     "🤑": "[satisfaction]",
+     "🤐": "[neutral]", "🤨": "[neutral]", "😐": "[neutral]", "😑": "[neutral]", "😶": "[neutral]",
+     "😏": "[sarcasm]",
+     "😒": "[disappointment]", "🙄": "[disappointment]", "😬": "[disappointment]",
+     "😔": "[sadness]", "😪": "[sadness]", "😢": "[sadness]", "😭": "[sadness]", "😥": "[sadness]", "😓": "[sadness]",
+     "😩": "[tiredness]", "😫": "[tiredness]", "🥱": "[tiredness]",
+     "🤤": "[discomfort]", "🤢": "[discomfort]", "🤮": "[discomfort]", "🤧": "[discomfort]", "🥵": "[discomfort]",
+     "🥶": "[discomfort]", "🥴": "[discomfort]", "😵": "[discomfort]", "🤯": "[discomfort]",
+     "😕": "[confused]", "😟": "[confused]", "🙁": "[confused]", "☹": "[confused]",
+     "😮": "[surprise]", "😯": "[surprise]", "😲": "[surprise]", "😳": "[surprise]", "🥺": "[pleading]",
+     "😦": "[fear]", "😧": "[fear]", "😨": "[fear]", "😰": "[fear]", "😱": "[fear]",
+     "😖": "[confusion]", "😣": "[confusion]", "😞": "[confusion]",
+     "😤": "[anger]", "😡": "[anger]", "😠": "[anger]", "🤬": "[anger]", "😈": "[mischievous]", "👿": "[mischievous]"
+ }
+
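+ # Illustrative only (not part of the uploaded file): unknown tokens fall back to <unk> (id 1).
+ #     v = Vocabulary(); v.add("phim")
+ #     v.corpus_to_tensor([["phim", "hay"]], is_tokenized=True)  # -> [[2, 1]]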
+ # ========== DATA MANAGER ==========
+
+ class DataManager:
+     def __init__(self, file_path, abbreviations_path, word2vec_path):
+         self.file_path = file_path
+         self.abbreviations_path = abbreviations_path
+         self.word2vec_path = word2vec_path
+         self.vocabulary = None
+         self.word_embeddings = None
+         self.abbreviations = None
+         self.load_abbreviations()
+
+     def load_abbreviations(self):
+         with open(self.abbreviations_path, "r", encoding="utf-8") as f:
+             self.abbreviations = json.load(f)
+
+     def load_word2vec(self):
+         """
+         Load vectors from the word2vec file,
+         using torchtext.Vectors to load the pretrained embeddings.
+         """
+         self.word_embeddings = Vectors(
+             name=self.word2vec_path,
+             unk_init=torch.Tensor.normal_
+         )
+
+     def create_vocab_from_corpus(self, corpus, max_vocab_size=30000):
+         """
+         Build a vocabulary from the corpus, keeping only the top max_vocab_size words.
+         """
+         vocab = Vocabulary()
+         from collections import Counter
+         counter = Counter()
+
+         for sent in corpus:
+             for token in sent.split():
+                 counter[token] += 1
+
+         most_common = counter.most_common(max_vocab_size)
+         for word, _freq in most_common:
+             vocab.add(word)
+
+         return vocab
+
+     def preprocess_data(self):
+         df = pd.read_excel(self.file_path)
+         if "Sentence" not in df.columns:
+             raise ValueError("Cột 'Sentence' không tồn tại trong dataset!")
+
+         # Preprocess each sentence
+         df["processed_sentence"] = df["Sentence"].apply(
+             lambda x: preprocess_sentence(str(x), self.abbreviations, emoji_mapping)
+         )
+
+         # Drop empty rows
+         df = df[df["processed_sentence"].str.strip().astype(bool)]
+
+         # Build the vocabulary from the data itself
+         all_sentences = df["processed_sentence"].tolist()
+         self.vocabulary = self.create_vocab_from_corpus(all_sentences, max_vocab_size=30000)
+
+         # Load word2vec
+         self.load_word2vec()
+
+         return df
+
+     def build_pretrained_embedding_matrix(self, embedding_dim=100):
+         """
+         Build a (vocab_size x embedding_dim) numpy weight matrix
+         initialized with the pretrained weights.
+         """
+         vocab_size = len(self.vocabulary)
+         weight_matrix = np.random.normal(
+             scale=0.1, size=(vocab_size, embedding_dim)
+         ).astype(np.float32)
+
+         # Copy over the pretrained vectors
+         for word, idx in self.vocabulary.word2id.items():
+             if word in self.word_embeddings.stoi:
+                 weight_matrix[idx] = self.word_embeddings.vectors[
+                     self.word_embeddings.stoi[word]
+                 ]
+
+         return weight_matrix
+
+     def split_and_convert(
+         self, df, label_column="Emotion", maxlen=400, test_size=0.2,
+         for_keras=False, batch_size=32
+     ):
+         """
+         Split the data into train/test or train/val/test.
+         - for_keras=False → return train_loader, val_loader, test_loader, label_mapping (PyTorch)
+         - for_keras=True  → return X_train, X_test, y_train_onehot, y_test_onehot, label_mapping (Keras)
+         """
+         if label_column not in df.columns:
+             raise ValueError(
+                 f"Cột '{label_column}' không tồn tại. Hiện có: {df.columns.tolist()}"
+             )
+
+         # Map labels -> integers
+         label_mapping = {label: idx for idx, label in enumerate(df[label_column].unique())}
+         df[label_column] = df[label_column].map(label_mapping)
+         if df[label_column].isnull().any():
+             missing = df[df[label_column].isnull()][label_column].unique()
+             raise ValueError(f"Những nhãn cảm xúc sau không có trong label_mapping: {missing}")
+
+         X = df["processed_sentence"].tolist()
+         y = df[label_column].tolist()
+
+         # Stratify to preserve the class distribution
+         X_train, X_test, y_train, y_test = train_test_split(
+             X, y, test_size=test_size, random_state=42, stratify=y
+         )
+
+         if not for_keras:
+             # Split train into train and validation
+             X_train, X_val, y_train, y_val = train_test_split(
+                 X_train, y_train, test_size=0.1, random_state=42, stratify=y_train
+             )
+
+         # Convert text -> indices
+         X_train_ids = self.vocabulary.corpus_to_tensor(X_train, is_tokenized=False)
+         X_test_ids = self.vocabulary.corpus_to_tensor(X_test, is_tokenized=False)
+
+         if not for_keras:
+             X_val_ids = self.vocabulary.corpus_to_tensor(X_val, is_tokenized=False)
+
+         # Pad
+         X_train_padded = pad_sequences(X_train_ids, maxlen=maxlen, padding='post', truncating='post')
+         X_test_padded = pad_sequences(X_test_ids, maxlen=maxlen, padding='post', truncating='post')
+
+         if not for_keras:
+             X_val_padded = pad_sequences(X_val_ids, maxlen=maxlen, padding='post', truncating='post')
+
+         print(">>> Debug Split and Convert:")
+         print("X_train_padded.shape:", X_train_padded.shape)
+         print("X_val_padded.shape:  ", X_val_padded.shape if not for_keras else "N/A")
+         print("X_test_padded.shape: ", X_test_padded.shape)
+         print("y_train length:", len(y_train))
+         print("y_val length:  ", len(y_val) if not for_keras else "N/A")
+         print("y_test length: ", len(y_test))
+         print("vocab_size:", len(self.vocabulary))
+
+         if for_keras:
+             num_classes = len(label_mapping)
+             y_train_onehot = tf.keras.utils.to_categorical(
+                 y_train,
+                 num_classes=num_classes
+             )
+             y_test_onehot = tf.keras.utils.to_categorical(
+                 y_test,
+                 num_classes=num_classes
+             )
+
+             print("y_train_onehot.shape:", y_train_onehot.shape)
+             print("y_test_onehot.shape: ", y_test_onehot.shape)
+
+             return X_train_padded, X_test_padded, y_train_onehot, y_test_onehot, label_mapping
+         else:
+             # Build PyTorch DataLoaders (the validation tensors were prepared above)
+             X_train_t = torch.tensor(X_train_padded, dtype=torch.long)
+             X_val_t = torch.tensor(X_val_padded, dtype=torch.long)
+             X_test_t = torch.tensor(X_test_padded, dtype=torch.long)
+             y_train_t = torch.tensor(y_train, dtype=torch.long)
+             y_val_t = torch.tensor(y_val, dtype=torch.long)
+             y_test_t = torch.tensor(y_test, dtype=torch.long)
+
+             train_ds = TensorDataset(X_train_t, y_train_t)
+             val_ds = TensorDataset(X_val_t, y_val_t)
+             test_ds = TensorDataset(X_test_t, y_test_t)
+
+             train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
+             val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)
+             test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)
+
+             return train_loader, val_loader, test_loader, label_mapping
+
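+ # Illustrative only (not part of the uploaded file): with the defaults used below,
+ #     X_tr, X_te, y_tr, y_te, mapping = data_manager.split_and_convert(df, maxlen=400, for_keras=True)
+ # gives X_tr of shape (n_train, 400) and y_tr of shape (n_train, len(mapping)).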
+ # ========== KERAS BI-LSTM MODEL ==========
+
+ def predict_emotion_bilstm(model, text, data_manager, label_mapping):
+     processed_text = preprocess_sentence(text, data_manager.abbreviations, emoji_mapping)
+     tokenized_text = data_manager.vocabulary.tokenize_corpus([processed_text])
+     text_ids = data_manager.vocabulary.corpus_to_tensor(tokenized_text, is_tokenized=True)
+     text_padded = pad_sequences(text_ids, maxlen=400, padding='post', truncating='post')
+     output = model.predict(text_padded)
+     pred = output.argmax(axis=1)[0]
+     rev_map = {v: k for k, v in label_mapping.items()}
+     return rev_map[pred]
+
+ # ========== MAIN ==========
+
+ if __name__ == "__main__":
+     from keras.models import Model
+     from keras.layers import (
+         Input, Embedding, Dense, Dropout, Bidirectional, LSTM
+     )
+     from keras.optimizers import Adam
+     from keras.callbacks import ModelCheckpoint, EarlyStopping
+
+     # -------- PATHS ----------
+     file_path = "train.xlsx"
+     abbreviations_path = "abbreviations.json"
+     word2vec_path = "word2vec_vi_syllables_100dims.txt"
+     output_path = "processed.xlsx"
+
+     # Initialize the DataManager
+     data_manager = DataManager(
+         file_path=file_path,
+         abbreviations_path=abbreviations_path,
+         word2vec_path=word2vec_path
+     )
+
+     # 1) Preprocess, build the vocabulary, load word2vec
+     df = data_manager.preprocess_data()
+     print("Trước khi cân bằng lớp (undersampling/oversampling):")
+     print(df["Emotion"].value_counts())
+
+     # 2) Balance the classes (example: oversample 'Other' to 3000)
+     #    Adjust this to your own needs
+     df_enjoyment = df[df["Emotion"] == "Enjoyment"]
+     df_other = df[df["Emotion"] == "Other"]
+     df_anger = df[df["Emotion"] == "Anger"]
+     df_sadness = df[df["Emotion"] == "Sadness"]
+     df_disgust = df[df["Emotion"] == "Disgust"]
+     df_fear = df[df["Emotion"] == "Fear"]
+     df_surprise = df[df["Emotion"] == "Surprise"]
+
+     # Oversample the 'Other' class to 3000 (for illustration only)
+     if len(df_other) < 3000:
+         df_other_oversampled = resample(
+             df_other,
+             replace=True,
+             n_samples=3000,
+             random_state=42
+         )
+     else:
+         df_other_oversampled = df_other
+
+     # Keep the remaining classes unchanged (or oversample as desired)
+     df_balanced = pd.concat([
+         df_enjoyment,
+         df_other_oversampled,
+         df_anger,
+         df_sadness,
+         df_disgust,
+         df_fear,
+         df_surprise
+     ], axis=0)
+
+     df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
+     df = df_balanced
+
+     print("\nSau khi cân bằng lớp (demo oversample):")
+     print(df["Emotion"].value_counts())
+
+     # Export to file (optional)
+     df.to_excel(output_path, index=False)
+
+     # ========== TRAIN BI-LSTM (KERAS) ==========
+
+     print("\n========== Training Keras BiLSTM ==========")
+
+     # Build the pretrained embedding matrix for Keras
+     pretrained_matrix = data_manager.build_pretrained_embedding_matrix(embedding_dim=100)
+     pretrained_matrix_keras = pretrained_matrix.astype(np.float32)
+
+     # Split data for Keras
+     X_train, X_test, y_train, y_test, label_mapping = data_manager.split_and_convert(
+         df, label_column="Emotion", maxlen=400,
+         test_size=0.2, for_keras=True
+     )
+
+     num_classes = len(label_mapping)
+     input_dim = len(data_manager.vocabulary)
+     embedding_dim = pretrained_matrix.shape[1]
+     maxlen = 400
+
+     # Define the BiLSTM model
+     def create_bilstm_model():
+         input_layer = Input(shape=(maxlen,), dtype='int32', name='main_input')
+         emb_layer = Embedding(
+             input_dim=input_dim,
+             output_dim=embedding_dim,
+             weights=[pretrained_matrix_keras],
+             input_length=maxlen,
+             trainable=True  # Set to False if you do not want to fine-tune the embeddings
+         )(input_layer)
+
+         bilstm = Bidirectional(LSTM(128, dropout=0.5, recurrent_dropout=0.5))(emb_layer)
+         dense1 = Dense(64, activation='relu')(bilstm)
+         dropout1 = Dropout(0.5)(dense1)
+         dense2 = Dense(32, activation='relu')(dropout1)
+         dropout2 = Dropout(0.5)(dense2)
+         output_layer = Dense(num_classes, activation='softmax')(dropout2)
+
+         model = Model(inputs=input_layer, outputs=output_layer)
+         model.compile(
+             loss='categorical_crossentropy',
+             optimizer=Adam(learning_rate=1e-3),
+             metrics=['accuracy']
+         )
+         return model
+
+     # Create the model
+     model_bilstm = create_bilstm_model()
+     model_bilstm.summary()
+
+     # Define callbacks
+     checkpoint = ModelCheckpoint(
+         'bilstm_best.keras',
+         save_best_only=True,
+         monitor='val_accuracy',
+         mode='max'
+     )
+     early_stopping = EarlyStopping(
+         monitor='val_accuracy',
+         patience=5,
+         restore_best_weights=True
+     )
+
+     # Train the model
+     history = model_bilstm.fit(
+         X_train, y_train,
+         validation_data=(X_test, y_test),
+         epochs=100,
+         batch_size=32,
+         callbacks=[checkpoint, early_stopping]
+     )
+
+     # Evaluate on the test set with detailed metrics
+     loss, acc = model_bilstm.evaluate(X_test, y_test)
+     print(f"BiLSTM Test Loss: {loss:.4f}, Test Accuracy: {acc:.4f}")
+
+     # Collect predictions and compute the metrics
+     y_pred_bilstm = model_bilstm.predict(X_test)
+     y_pred_bilstm = np.argmax(y_pred_bilstm, axis=1)
+     y_true_bilstm = np.argmax(y_test, axis=1)
+
+     test_accuracy_bilstm = accuracy_score(y_true_bilstm, y_pred_bilstm)
+     precision_macro_bilstm = precision_score(y_true_bilstm, y_pred_bilstm, average='macro', zero_division=0)
+     precision_weighted_bilstm = precision_score(y_true_bilstm, y_pred_bilstm, average='weighted', zero_division=0)
+     recall_macro_bilstm = recall_score(y_true_bilstm, y_pred_bilstm, average='macro', zero_division=0)
+     recall_weighted_bilstm = recall_score(y_true_bilstm, y_pred_bilstm, average='weighted', zero_division=0)
+     f1_macro_bilstm = f1_score(y_true_bilstm, y_pred_bilstm, average='macro', zero_division=0)
+     f1_weighted_bilstm = f1_score(y_true_bilstm, y_pred_bilstm, average='weighted', zero_division=0)
+     report_bilstm = classification_report(y_true_bilstm, y_pred_bilstm, target_names=list(label_mapping.keys()), digits=4)
+     conf_matrix_bilstm = confusion_matrix(y_true_bilstm, y_pred_bilstm)
+
+     # Print the metrics
+     print(f"\nBiLSTM Test Accuracy: {test_accuracy_bilstm:.4f}")
+     print(f"Precision (Macro): {precision_macro_bilstm:.4f}")
+     print(f"Precision (Weighted): {precision_weighted_bilstm:.4f}")
+     print(f"Recall (Macro): {recall_macro_bilstm:.4f}")
+     print(f"Recall (Weighted): {recall_weighted_bilstm:.4f}")
+     print(f"F1-Score (Macro): {f1_macro_bilstm:.4f}")
+     print(f"F1-Score (Weighted): {f1_weighted_bilstm:.4f}")
+
+     print("\n========== BiLSTM Classification Report ==========")
+     print(report_bilstm)
+
+     print("\n========== BiLSTM Confusion Matrix ==========")
+     print(conf_matrix_bilstm)
+
+     # Save the report to a file
+     bilstm_report_dir = "bilstm_emotion_model"
+     os.makedirs(bilstm_report_dir, exist_ok=True)
+     with open(os.path.join(bilstm_report_dir, "classification_report.txt"), "w", encoding="utf-8") as f:
+         f.write("========== BiLSTM Classification Report ==========\n")
+         f.write(report_bilstm)
+         f.write("\n========== Additional Metrics ==========\n")
+         f.write(f"Test Loss: {loss:.4f}\n")
+         f.write(f"Test Accuracy: {test_accuracy_bilstm:.4f}\n")
+         f.write(f"Precision (Macro): {precision_macro_bilstm:.4f}\n")
+         f.write(f"Precision (Weighted): {precision_weighted_bilstm:.4f}\n")
+         f.write(f"Recall (Macro): {recall_macro_bilstm:.4f}\n")
+         f.write(f"Recall (Weighted): {recall_weighted_bilstm:.4f}\n")
+         f.write(f"F1-Score (Macro): {f1_macro_bilstm:.4f}\n")
+         f.write(f"F1-Score (Weighted): {f1_weighted_bilstm:.4f}\n")
+         f.write("\n========== Confusion Matrix ==========\n")
+         f.write(np.array2string(conf_matrix_bilstm))
+
+     print("\n========== BiLSTM Classification Report saved to 'bilstm_emotion_model/classification_report.txt' ==========")
+
+     # Save the BiLSTM model
+     model_bilstm.save(os.path.join(bilstm_report_dir, 'bilstm_model.keras'))
+     print(f"========== BiLSTM Model saved to '{bilstm_report_dir}/bilstm_model.keras' ==========")
+
+     # ========== DEMO: PREDICT ONE NEW SENTENCE ==========
+
+     custom_text = "Tôi rất vui khi sử dụng dịch vụ này!"
+
+     # BiLSTM (Keras)
+     emotion_bilstm = predict_emotion_bilstm(
+         model_bilstm, custom_text, data_manager, label_mapping
+     )
+     print(f"Predicted Emotion (BiLSTM): {emotion_bilstm}")
+
+     # Check TF version and GPU availability
+     print("TF version:", tf.__version__)
+     print("GPU devices:", tf.config.list_physical_devices("GPU"))
+     # os.system("nvidia-smi")  # uncomment to inspect GPU info
+
+     # ========== SAVE LABEL MAPPING AND VOCABULARY ==========
+     # Save the label_mapping and vocabulary for the BiLSTM
+     with open(os.path.join(bilstm_report_dir, "label_mapping.json"), "w", encoding="utf-8") as f:
+         json.dump(label_mapping, f, ensure_ascii=False, indent=4)
+
+     with open(os.path.join(bilstm_report_dir, "vocabulary.json"), "w", encoding="utf-8") as f:
+         json.dump(data_manager.vocabulary.word2id, f, ensure_ascii=False, indent=4)
+
+     print("========== Label Mapping and Vocabulary saved ==========")
main_RNN_CNN-LSTM.py ADDED
@@ -0,0 +1,738 @@
+ # thesis.py
+ # -*- coding: utf-8 -*-
+
+ import pandas as pd
+ import emoji
+ import json
+ import re
+ import numpy as np
+ from underthesea import word_tokenize
+ from tqdm import tqdm
+ import torch
+ from torchtext.vocab import Vectors
+ from sklearn.model_selection import train_test_split
+ from sklearn.utils import resample
+ from sklearn.metrics import (
+     accuracy_score,
+     classification_report,
+     precision_score,
+     recall_score,
+     f1_score,
+     confusion_matrix
+ )
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
+ from torch.utils.data import DataLoader, TensorDataset
+ import torch.nn as nn
+ import torch.optim as optim
+ import tensorflow as tf
+ import os
+ import joblib
+
+ # ========== PREPROCESSING FUNCTIONS ==========
+
+ def preprocess_sentence(sentence, abbreviations, emoji_mapping):
+     """
+     Preprocess one sentence: lowercase, replace emojis, remove profanity
+     and special characters, normalize whitespace, expand abbreviations, etc.
+     """
+     sentence = sentence.lower()
+     sentence = replace_emojis(sentence, emoji_mapping)
+     sentence = remove_profanity(sentence)
+     sentence = remove_special_characters(sentence)
+     sentence = normalize_whitespace(sentence)
+     sentence = replace_abbreviations(sentence, abbreviations)
+     sentence = remove_repeated_characters(sentence)
+     sentence = replace_numbers(sentence)
+     sentence = tokenize_sentence(sentence)
+     return sentence
+
+ def replace_emojis(sentence, emoji_mapping):
+     processed_sentence = []
+     for char in sentence:
+         if char in emoji_mapping:
+             processed_sentence.append(emoji_mapping[char])
+         elif not emoji.is_emoji(char):
+             processed_sentence.append(char)
+     return ''.join(processed_sentence)
+
+ def remove_profanity(sentence):
+     profane_words = ["loz", "vloz", "vl", "dm", "đm", "clgt", "dmm", "cc", "vc", "đù mé", "vãi"]
+     words = sentence.split()
+     filtered_words = [word for word in words if word.lower() not in profane_words]
+     return ' '.join(filtered_words)
+
+ def remove_special_characters(sentence):
+     return re.sub(r"[\^\*@#&$%<>~{}|\\]", "", sentence)
+
+ def normalize_whitespace(sentence):
+     return ' '.join(sentence.split())
+
+ def replace_abbreviations(sentence, abbreviations):
+     words = sentence.split()
+     replaced_words = [
+         " ".join(abbreviations[word]) if word in abbreviations else word
+         for word in words
+     ]
+     return ' '.join(replaced_words)
+
+ def remove_repeated_characters(sentence):
+     # e.g. "đẹp quáaaaaaa" -> "đẹp quá"
+     return re.sub(r"(.)\1{2,}", r"\1", sentence)
+
+ def replace_numbers(sentence):
+     # Replace every number with the token [number]
+     return re.sub(r"\d+", "[number]", sentence)
+
+ def tokenize_sentence(sentence):
+     # Word segmentation with underthesea
+     return ' '.join(word_tokenize(sentence))
+
+
+ # ========== VOCABULARY CLASS ==========
+
+ class Vocabulary:
+     def __init__(self):
+         self.word2id = {}
+         self.word2id['<pad>'] = 0
+         self.word2id['<unk>'] = 1
+         self.unk_id = 1
+         self.id2word = {0: '<pad>', 1: '<unk>'}
+
+     def __getitem__(self, word):
+         return self.word2id.get(word, self.unk_id)
+
+     def __contains__(self, word):
+         return word in self.word2id
+
+     def __len__(self):
+         return len(self.word2id)
+
+     def lookup_tokens(self, indices):
+         return [self.id2word[idx] for idx in indices]
+
+     def add(self, word):
+         if word not in self.word2id:
+             idx = len(self.word2id)
+             self.word2id[word] = idx
+             self.id2word[idx] = word
+
+     @staticmethod
+     def tokenize_corpus(corpus):
+         tokenized_corpus = []
+         for doc in tqdm(corpus, desc="Tokenizing Corpus"):
+             tokens = [w.replace(" ", "_") for w in word_tokenize(doc)]
+             tokenized_corpus.append(tokens)
+         return tokenized_corpus
+
+     def corpus_to_tensor(self, corpus, is_tokenized=False):
+         """
+         corpus: list of sentences (strings) or list of token lists (if is_tokenized=True)
+         return: list[list[int]], one list of token indices per sentence
+         """
+         tokenized_corpus = (
+             self.tokenize_corpus(corpus) if not is_tokenized else corpus
+         )
+         return [
+             [self[token] for token in doc]
+             for doc in tokenized_corpus
+         ]
+
+
+ # ========== EMOJI MAPPING ==========
+
+ emoji_mapping = {
+     "😀": "[joy]", "😃": "[joy]", "😄": "[joy]", "😁": "[joy]", "😆": "[joy]", "😅": "[joy]", "😂": "[joy]", "🤣": "[joy]",
+     "🙂": "[love]", "🙃": "[love]", "😉": "[love]", "😊": "[love]", "😇": "[love]", "🥰": "[love]", "😍": "[love]",
+     "🤩": "[love]", "😘": "[love]", "😗": "[love]", "☺": "[love]", "😚": "[love]", "😙": "[love]",
+     "😋": "[satisfaction]", "😛": "[satisfaction]", "😜": "[satisfaction]", "🤪": "[satisfaction]", "😝": "[satisfaction]",
+     "🤑": "[satisfaction]",
+     "🤐": "[neutral]", "🤨": "[neutral]", "😐": "[neutral]", "😑": "[neutral]", "😶": "[neutral]",
+     "😏": "[sarcasm]",
+     "😒": "[disappointment]", "🙄": "[disappointment]", "😬": "[disappointment]",
+     "😔": "[sadness]", "😪": "[sadness]", "😢": "[sadness]", "😭": "[sadness]", "😥": "[sadness]", "😓": "[sadness]",
+     "😩": "[tiredness]", "😫": "[tiredness]", "🥱": "[tiredness]",
+     "🤤": "[discomfort]", "🤢": "[discomfort]", "🤮": "[discomfort]", "🤧": "[discomfort]", "🥵": "[discomfort]",
+     "🥶": "[discomfort]", "🥴": "[discomfort]", "😵": "[discomfort]", "🤯": "[discomfort]",
+     "😕": "[confused]", "😟": "[confused]", "🙁": "[confused]", "☹": "[confused]",
+     "😮": "[surprise]", "😯": "[surprise]", "😲": "[surprise]", "😳": "[surprise]", "🥺": "[pleading]",
+     "😦": "[fear]", "😧": "[fear]", "😨": "[fear]", "😰": "[fear]", "😱": "[fear]",
+     "😖": "[confusion]", "😣": "[confusion]", "😞": "[confusion]",
+     "😤": "[anger]", "😡": "[anger]", "😠": "[anger]", "🤬": "[anger]", "😈": "[mischievous]", "👿": "[mischievous]"
+ }
+
+ def load_abbreviations(path):
+     with open(path, "r", encoding="utf-8") as f:
+         return json.load(f)
+
+
+ # ========== DATA MANAGER ==========
+
+ class DataManager:
+     def __init__(self, file_path, abbreviations_path, word2vec_path):
+         self.file_path = file_path
+         self.abbreviations_path = abbreviations_path
+         self.word2vec_path = word2vec_path
+         self.vocabulary = None
+         self.word_embeddings = None
+         self.abbreviations = None
+         self.load_abbreviations()
+
+     def load_abbreviations(self):
+         with open(self.abbreviations_path, "r", encoding="utf-8") as f:
+             self.abbreviations = json.load(f)
+
+     def load_word2vec(self):
+         """
+         Load vectors from the word2vec file,
+         using torchtext.Vectors to load the pretrained embeddings.
+         """
+         self.word_embeddings = Vectors(
+             name=self.word2vec_path,
+             unk_init=torch.Tensor.normal_
+         )
+
+     def create_vocab_from_corpus(self, corpus, max_vocab_size=30000):
+         """
+         Build a vocabulary from the corpus, keeping only the top max_vocab_size words.
+         """
+         vocab = Vocabulary()
+         from collections import Counter
+         counter = Counter()
+
+         for sent in corpus:
+             for token in sent.split():
+                 counter[token] += 1
+
+         most_common = counter.most_common(max_vocab_size)
+         for word, _freq in most_common:
+             vocab.add(word)
+
+         return vocab
+
+     def preprocess_data(self):
+         df = pd.read_excel(self.file_path)
+         if "Sentence" not in df.columns:
+             raise ValueError("Cột 'Sentence' không tồn tại trong dataset!")
+
+         # Preprocess each sentence
+         df["processed_sentence"] = df["Sentence"].apply(
+             lambda x: preprocess_sentence(str(x), self.abbreviations, emoji_mapping)
+         )
+
+         # Drop empty rows
+         df = df[df["processed_sentence"].str.strip().astype(bool)]
+
+         # Build the vocabulary from the data itself
+         all_sentences = df["processed_sentence"].tolist()
+         self.vocabulary = self.create_vocab_from_corpus(all_sentences, max_vocab_size=30000)
+
+         # Load word2vec
+         self.load_word2vec()
+
+         return df
+
+     def build_pretrained_embedding_matrix(self, embedding_dim=100):
+         """
+         Build a (vocab_size x embedding_dim) numpy weight matrix
+         initialized with the pretrained weights.
+         """
+         vocab_size = len(self.vocabulary)
+         weight_matrix = np.random.normal(
+             scale=0.1, size=(vocab_size, embedding_dim)
+         ).astype(np.float32)
+
+         # Copy over the pretrained vectors
+         for word, idx in self.vocabulary.word2id.items():
+             if word in self.word_embeddings.stoi:
+                 weight_matrix[idx] = self.word_embeddings.vectors[
+                     self.word_embeddings.stoi[word]
+                 ]
+
+         return weight_matrix
+
+     def split_and_convert(
+         self, df, label_column="Emotion", maxlen=400, test_size=0.2,
+         for_keras=False, batch_size=32
+     ):
+         """
+         Split the data into train/test.
+         - for_keras=False → return train_loader, test_loader, label_mapping (PyTorch)
+         - for_keras=True  → return X_train, X_test, y_train_onehot, y_test_onehot, label_mapping (Keras)
+         """
+         if label_column not in df.columns:
+             raise ValueError(
+                 f"Cột '{label_column}' không tồn tại. Hiện có: {df.columns.tolist()}"
+             )
+
+         # Map labels -> integers
+         label_mapping = {label: idx for idx, label in enumerate(df[label_column].unique())}
+         df[label_column] = df[label_column].map(label_mapping)
+         if df[label_column].isnull().any():
+             missing = df[df[label_column].isnull()][label_column].unique()
+             raise ValueError(f"Những nhãn cảm xúc sau không có trong label_mapping: {missing}")
+
+         X = df["processed_sentence"].tolist()
+         y = df[label_column].tolist()
+
+         # Stratify to preserve the class distribution
+         X_train, X_test, y_train, y_test = train_test_split(
+             X, y, test_size=test_size, random_state=42, stratify=y
+         )
+
+         # Convert text -> indices
+         X_train_ids = self.vocabulary.corpus_to_tensor(X_train, is_tokenized=False)
+         X_test_ids = self.vocabulary.corpus_to_tensor(X_test, is_tokenized=False)
+
+         # Pad
+         X_train_padded = pad_sequences(X_train_ids, maxlen=maxlen, padding='post', truncating='post')
+         X_test_padded = pad_sequences(X_test_ids, maxlen=maxlen, padding='post', truncating='post')
+
+         print(">>> Debug Split and Convert:")
+         print("X_train_padded.shape:", X_train_padded.shape)
+         print("X_test_padded.shape: ", X_test_padded.shape)
+         print("y_train length:", len(y_train))
+         print("y_test length: ", len(y_test))
+         print("vocab_size:", len(self.vocabulary))
+
+         if for_keras:
+             num_classes = len(label_mapping)
+             y_train_onehot = torch.nn.functional.one_hot(
+                 torch.tensor(y_train),
+                 num_classes=num_classes
+             ).numpy()
+             y_test_onehot = torch.nn.functional.one_hot(
+                 torch.tensor(y_test),
+                 num_classes=num_classes
+             ).numpy()
+
+             print("y_train_onehot.shape:", y_train_onehot.shape)
+             print("y_test_onehot.shape: ", y_test_onehot.shape)
+
+             return X_train_padded, X_test_padded, y_train_onehot, y_test_onehot, label_mapping
+         else:
+             # Return DataLoaders
+             X_train_t = torch.tensor(X_train_padded, dtype=torch.long)
+             X_test_t = torch.tensor(X_test_padded, dtype=torch.long)
+             y_train_t = torch.tensor(y_train, dtype=torch.long)
+             y_test_t = torch.tensor(y_test, dtype=torch.long)
+
+             train_ds = TensorDataset(X_train_t, y_train_t)
+             test_ds = TensorDataset(X_test_t, y_test_t)
+
+             train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
+             test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)
+
+             return train_loader, test_loader, label_mapping
+
+
+ # ========== PYTORCH RNN MODEL ==========
+
+ class SimpleRNN(nn.Module):
+     def __init__(self, pretrained_weight, hidden_dim, output_dim, dropout=0.3):
+         super(SimpleRNN, self).__init__()
+         vocab_size, embedding_dim = pretrained_weight.shape
+         # Build nn.Embedding from pretrained_weight
+         self.embedding = nn.Embedding.from_pretrained(
+             torch.from_numpy(pretrained_weight),
+             freeze=False  # True to keep the embeddings fixed
+         )
+         # Note: despite the class name, the recurrent layer is an LSTM
+         self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
+         self.dropout = nn.Dropout(dropout)
+         self.fc = nn.Linear(hidden_dim, output_dim)
+
+     def forward(self, x):
+         embedded = self.dropout(self.embedding(x))
+         _, (hidden, _) = self.rnn(embedded)
+         hidden = self.dropout(hidden.squeeze(0))
+         output = self.fc(hidden)
+         return output
+
+
+ def predict_emotion_rnn(model, text, data_manager, label_mapping, device):
+     model.eval()
+     with torch.no_grad():
+         processed_text = preprocess_sentence(text, data_manager.abbreviations, emoji_mapping)
+         tokenized_text = data_manager.vocabulary.tokenize_corpus([processed_text])
+         text_ids = data_manager.vocabulary.corpus_to_tensor(tokenized_text, is_tokenized=True)
+         text_padded = pad_sequences(text_ids, maxlen=400, padding='post', truncating='post')
+         text_tensor = torch.tensor(
+             text_padded,
+             dtype=torch.long
+         ).to(device)
+
+         output = model(text_tensor)
+         _, predicted = torch.max(output, 1)
+         rev_map = {v: k for k, v in label_mapping.items()}
+         return rev_map[predicted.item()]
+
+
369
+ # ========== KERAS CNN-LSTM MODEL ==========
370
+
371
+ def predict_emotion_cnn_lstm(model, text, data_manager, label_mapping):
372
+ processed_text = preprocess_sentence(text, data_manager.abbreviations, emoji_mapping)
373
+ tokenized_text = data_manager.vocabulary.tokenize_corpus([processed_text])
374
+ text_ids = data_manager.vocabulary.corpus_to_tensor(tokenized_text, is_tokenized=True)
375
+ text_padded = pad_sequences(text_ids, maxlen=400, padding='post', truncating='post')
376
+ output = model.predict(text_padded)
377
+ pred = output.argmax(axis=1)[0]
378
+ rev_map = {v: k for k, v in label_mapping.items()}
379
+ return rev_map[pred]
380
+
381
+
382
+ # ========== MAIN ==========
383
+
384
+ if __name__ == "__main__":
385
+ from keras.models import Model
386
+ from keras.layers import (
387
+ Input, Embedding, Convolution1D, LSTM, Dense, Dropout, Lambda, concatenate
388
+ )
389
+ from keras.optimizers import Adam
390
+ from keras.callbacks import ModelCheckpoint, EarlyStopping
391
+
392
+ # -------- PATHS ----------
393
+ file_path = "train.xlsx"
394
+ abbreviations_path = "abbreviations.json"
395
+ word2vec_path = "word2vec_vi_syllables_100dims.txt"
396
+ output_path = "processed.xlsx"
397
+
398
+ # Initialize the DataManager
399
+ data_manager = DataManager(
400
+ file_path=file_path,
401
+ abbreviations_path=abbreviations_path,
402
+ word2vec_path=word2vec_path
403
+ )
404
+
405
+ # 1) Preprocess, build the vocabulary, load word2vec
406
+ df = data_manager.preprocess_data()
407
+ print("Before class balancing (undersampling/oversampling):")
408
+ print(df["Emotion"].value_counts())
409
+
410
+ # 2) Balance the classes (example: oversample 'Other' up to 3000)
411
+ # Adjust these numbers to your own needs
412
+ df_enjoyment = df[df["Emotion"] == "Enjoyment"]
413
+ df_other = df[df["Emotion"] == "Other"]
414
+ df_anger = df[df["Emotion"] == "Anger"]
415
+ df_sadness = df[df["Emotion"] == "Sadness"]
416
+ df_disgust = df[df["Emotion"] == "Disgust"]
417
+ df_fear = df[df["Emotion"] == "Fear"]
418
+ df_surprise = df[df["Emotion"] == "Surprise"]
419
+
420
+ # Oversample the 'Other' class up to 3000 (for illustration only)
421
+ if len(df_other) < 3000:
422
+ df_other_oversampled = resample(
423
+ df_other,
424
+ replace=True,
425
+ n_samples=3000,
426
+ random_state=42
427
+ )
428
+ else:
429
+ df_other_oversampled = df_other
430
+
431
+ # Keep the remaining classes as-is (or oversample them as needed)
432
+ df_balanced = pd.concat([
433
+ df_enjoyment,
434
+ df_other_oversampled,
435
+ df_anger,
436
+ df_sadness,
437
+ df_disgust,
438
+ df_fear,
439
+ df_surprise
440
+ ], axis=0)
441
+
442
+ df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
443
+ df = df_balanced
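+ # NOTE: oversampling before the train/test split lets duplicated rows land in both splits,
+ # which can inflate test scores; oversampling only the training split avoids this.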
444
+
445
+ print("\nAfter class balancing (demo oversampling):")
446
+ print(df["Emotion"].value_counts())
447
+
448
+ # Export to file (optional)
449
+ df.to_excel(output_path, index=False)
450
+
451
+ # ========== TRAIN RNN PYTORCH ==========
452
+
453
+ print("\n========== Training PyTorch SimpleRNN ==========")
454
+
455
+ # Build the pretrained embedding matrix
456
+ pretrained_matrix = data_manager.build_pretrained_embedding_matrix(embedding_dim=100)
457
+
458
+ # Split the data and convert it into DataLoaders
459
+ train_loader, test_loader, label_mapping = data_manager.split_and_convert(
460
+ df, label_column="Emotion", maxlen=400, test_size=0.2,
461
+ for_keras=False, batch_size=32
462
+ )
463
+
464
+ hidden_dim = 128
465
+ output_dim = len(label_mapping)
466
+
467
+ model_rnn = SimpleRNN(pretrained_weight=pretrained_matrix,
468
+ hidden_dim=hidden_dim,
469
+ output_dim=output_dim,
470
+ dropout=0.3)
471
+ criterion = nn.CrossEntropyLoss()
472
+ optimizer = optim.Adam(model_rnn.parameters(), lr=1e-3)
473
+
474
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
475
+ model_rnn.to(device)
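+ # nn.Module.to() moves the parameters in place, so the optimizer created above still tracks them.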
476
+
477
+ num_epochs = 20
478
+ for epoch in range(num_epochs):
479
+ model_rnn.train()
480
+ epoch_loss = 0
481
+ correct = 0
482
+ total = 0
483
+
484
+ for X_batch, y_batch in train_loader:
485
+ X_batch = X_batch.to(device)
486
+ y_batch = y_batch.to(device)
487
+
488
+ optimizer.zero_grad()
489
+ preds = model_rnn(X_batch)
490
+ loss = criterion(preds, y_batch)
491
+ loss.backward()
492
+ optimizer.step()
493
+
494
+ epoch_loss += loss.item()
495
+ _, pred_label = torch.max(preds, 1)
496
+ correct += (pred_label == y_batch).sum().item()
497
+ total += y_batch.size(0)
498
+
499
+ epoch_accuracy = correct / total
500
+ epoch_loss_avg = epoch_loss / len(train_loader)
501
+ print(f"Epoch {epoch+1}/{num_epochs}, "
502
+ f"Loss: {epoch_loss_avg:.4f}, "
503
+ f"Accuracy: {epoch_accuracy:.4f}")
504
+
505
+ # Evaluate on the test set with detailed metrics
506
+ model_rnn.eval()
507
+ test_loss = 0
508
+ correct = 0
509
+ total = 0
510
+ y_true = []
511
+ y_pred = []
512
+ with torch.no_grad():
513
+ for X_batch, y_batch in test_loader:
514
+ X_batch = X_batch.to(device)
515
+ y_batch = y_batch.to(device)
516
+ preds = model_rnn(X_batch)
517
+ loss = criterion(preds, y_batch)
518
+ test_loss += loss.item()
519
+
520
+ _, predicted = torch.max(preds, 1)
521
+ correct += (predicted == y_batch).sum().item()
522
+ total += y_batch.size(0)
523
+
524
+ y_true.extend(y_batch.cpu().numpy())
525
+ y_pred.extend(predicted.cpu().numpy())
526
+
527
+ test_accuracy = accuracy_score(y_true, y_pred)
528
+ test_loss_avg = test_loss / len(test_loader)
529
+ precision_macro = precision_score(y_true, y_pred, average='macro', zero_division=0)
530
+ precision_weighted = precision_score(y_true, y_pred, average='weighted', zero_division=0)
531
+ recall_macro = recall_score(y_true, y_pred, average='macro', zero_division=0)
532
+ recall_weighted = recall_score(y_true, y_pred, average='weighted', zero_division=0)
533
+ f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=0)
534
+ f1_weighted = f1_score(y_true, y_pred, average='weighted', zero_division=0)
535
+ report = classification_report(y_true, y_pred, target_names=label_mapping.keys(), digits=4)
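+ # label_mapping preserves insertion order, so its keys line up with the class indices.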
536
+ conf_matrix = confusion_matrix(y_true, y_pred)
537
+
538
+ # Print the metrics
539
+ print(f"\nTest Loss: {test_loss_avg:.4f}, Test Accuracy: {test_accuracy:.4f}")
540
+ print(f"Precision (Macro): {precision_macro:.4f}")
541
+ print(f"Precision (Weighted): {precision_weighted:.4f}")
542
+ print(f"Recall (Macro): {recall_macro:.4f}")
543
+ print(f"Recall (Weighted): {recall_weighted:.4f}")
544
+ print(f"F1-Score (Macro): {f1_macro:.4f}")
545
+ print(f"F1-Score (Weighted): {f1_weighted:.4f}")
546
+
547
+ print("\n========== Classification Report ==========")
548
+ print(report)
549
+
550
+ print("\n========== Confusion Matrix ==========")
551
+ print(conf_matrix)
552
+
553
+ # Save the report to file
554
+ rnn_report_dir = "rnn_emotion_model"
555
+ os.makedirs(rnn_report_dir, exist_ok=True)
556
+ with open(os.path.join(rnn_report_dir, "classification_report.txt"), "w", encoding="utf-8") as f:
557
+ f.write("========== Classification Report ==========\n")
558
+ f.write(report)
559
+ f.write("\n========== Additional Metrics ==========\n")
560
+ f.write(f"Test Loss: {test_loss_avg:.4f}\n")
561
+ f.write(f"Test Accuracy: {test_accuracy:.4f}\n")
562
+ f.write(f"Precision (Macro): {precision_macro:.4f}\n")
563
+ f.write(f"Precision (Weighted): {precision_weighted:.4f}\n")
564
+ f.write(f"Recall (Macro): {recall_macro:.4f}\n")
565
+ f.write(f"Recall (Weighted): {recall_weighted:.4f}\n")
566
+ f.write(f"F1-Score (Macro): {f1_macro:.4f}\n")
567
+ f.write(f"F1-Score (Weighted): {f1_weighted:.4f}\n")
568
+ f.write("\n========== Confusion Matrix ==========\n")
569
+ f.write(np.array2string(conf_matrix))
570
+
571
+ print("\n========== Classification Report saved to 'rnn_emotion_model/classification_report.txt' ==========")
572
+
573
+ # Save the RNN model
574
+ torch.save(model_rnn.state_dict(), os.path.join(rnn_report_dir, "simple_rnn.pth"))
575
+ print("========== RNN Model saved to 'rnn_emotion_model/simple_rnn.pth' ==========")
576
+
577
+ # ========== TRAIN CNN-LSTM KERAS ==========
578
+
579
+ print("\n========== Training CNN-LSTM (Keras) ==========")
580
+
581
+ # Build the pretrained embedding for Keras:
582
+ # pretrained_matrix has shape (num_vocab x 100)
583
+ # and is passed into the Embedding layer via weights=[...]
584
+ X_train_keras, X_test_keras, y_train_keras, y_test_keras, label_mapping_keras = data_manager.split_and_convert(
585
+ df, label_column="Emotion", maxlen=400, test_size=0.2,
586
+ for_keras=True
587
+ )
588
+
589
+ maxlen = 400
590
+ vocab_size, embedding_dim = pretrained_matrix.shape
591
+
592
+ # Cast pretrained_matrix to float32 so Keras accepts it
593
+ pretrained_matrix_keras = pretrained_matrix.astype(np.float32)
594
+
595
+ input_layer = Input(shape=(maxlen,), dtype='int32', name='main_input')
596
+ emb_layer = Embedding(
597
+ input_dim=vocab_size,
598
+ output_dim=embedding_dim,
599
+ weights=[pretrained_matrix_keras],
600
+ trainable=True # set False to freeze the embeddings instead of fine-tuning them
601
+ )(input_layer)
602
+
603
+ def max_1d(X):
604
+ return tf.reduce_max(X, axis=1)
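+ # max_1d is global max pooling over the time axis (equivalent to GlobalMaxPooling1D).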
605
+
606
+ con3 = Convolution1D(150, kernel_size=3, activation='relu')(emb_layer)
607
+ pool_con3 = Lambda(max_1d, output_shape=(150,))(con3)
608
+
609
+ con5 = Convolution1D(150, kernel_size=5, activation='relu')(emb_layer)
610
+ pool_con5 = Lambda(max_1d, output_shape=(150,))(con5)
611
+
612
+ lstm_out = LSTM(128, dropout=0.3)(emb_layer)
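+ # The two convolutional branches and the LSTM branch read the same embeddings and are merged below.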
613
+
614
+ merged = concatenate([pool_con3, pool_con5, lstm_out])
615
+ dense = Dense(100, activation='relu')(merged)
616
+ drop = Dropout(0.3)(dense)
617
+ output = Dense(output_dim, activation='softmax')(drop)
618
+
619
+ model_cnn_lstm = Model(inputs=input_layer, outputs=output)
620
+ model_cnn_lstm.compile(
621
+ loss='categorical_crossentropy',
622
+ optimizer=Adam(learning_rate=1e-3), # 'lr' was renamed to 'learning_rate' in recent Keras releases
623
+ metrics=['accuracy']
624
+ )
625
+
626
+ checkpoint = ModelCheckpoint(
627
+ 'cnn_lstm_best.keras',
628
+ save_best_only=True,
629
+ monitor='val_accuracy',
630
+ mode='max'
631
+ )
632
+ early_stopping = EarlyStopping(
633
+ monitor='val_accuracy',
634
+ patience=5,
635
+ restore_best_weights=True
636
+ )
637
+
638
+ history = model_cnn_lstm.fit(
639
+ X_train_keras, y_train_keras,
640
+ validation_data=(X_test_keras, y_test_keras),
641
+ epochs=30,
642
+ batch_size=32,
643
+ callbacks=[checkpoint, early_stopping]
644
+ )
645
+
646
+ # Evaluate on the test set with detailed metrics
647
+ loss, acc = model_cnn_lstm.evaluate(X_test_keras, y_test_keras)
648
+ print(f"CNN-LSTM Test Loss: {loss:.4f}, Test Accuracy: {acc:.4f}")
649
+
650
+ # Collect predictions and compute the metrics
651
+ y_pred_cnn_lstm = model_cnn_lstm.predict(X_test_keras)
652
+ y_pred_cnn_lstm = np.argmax(y_pred_cnn_lstm, axis=1)
653
+ y_true_cnn_lstm = np.argmax(y_test_keras, axis=1)
654
+
655
+ test_accuracy_cnn_lstm = accuracy_score(y_true_cnn_lstm, y_pred_cnn_lstm)
656
+ precision_macro_cnn_lstm = precision_score(y_true_cnn_lstm, y_pred_cnn_lstm, average='macro', zero_division=0)
657
+ precision_weighted_cnn_lstm = precision_score(y_true_cnn_lstm, y_pred_cnn_lstm, average='weighted', zero_division=0)
658
+ recall_macro_cnn_lstm = recall_score(y_true_cnn_lstm, y_pred_cnn_lstm, average='macro', zero_division=0)
659
+ recall_weighted_cnn_lstm = recall_score(y_true_cnn_lstm, y_pred_cnn_lstm, average='weighted', zero_division=0)
660
+ f1_macro_cnn_lstm = f1_score(y_true_cnn_lstm, y_pred_cnn_lstm, average='macro', zero_division=0)
661
+ f1_weighted_cnn_lstm = f1_score(y_true_cnn_lstm, y_pred_cnn_lstm, average='weighted', zero_division=0)
662
+ report_cnn_lstm = classification_report(y_true_cnn_lstm, y_pred_cnn_lstm, target_names=label_mapping.keys(), digits=4)
663
+ conf_matrix_cnn_lstm = confusion_matrix(y_true_cnn_lstm, y_pred_cnn_lstm)
664
+
665
+ # Print the metrics
666
+ print(f"\nCNN-LSTM Test Accuracy: {test_accuracy_cnn_lstm:.4f}")
667
+ print(f"Precision (Macro): {precision_macro_cnn_lstm:.4f}")
668
+ print(f"Precision (Weighted): {precision_weighted_cnn_lstm:.4f}")
669
+ print(f"Recall (Macro): {recall_macro_cnn_lstm:.4f}")
670
+ print(f"Recall (Weighted): {recall_weighted_cnn_lstm:.4f}")
671
+ print(f"F1-Score (Macro): {f1_macro_cnn_lstm:.4f}")
672
+ print(f"F1-Score (Weighted): {f1_weighted_cnn_lstm:.4f}")
673
+
674
+ print("\n========== CNN-LSTM Classification Report ==========")
675
+ print(report_cnn_lstm)
676
+
677
+ print("\n========== CNN-LSTM Confusion Matrix ==========")
678
+ print(conf_matrix_cnn_lstm)
679
+
680
+ # Save the report to file
681
+ cnn_lstm_report_dir = "cnn_lstm_emotion_model"
682
+ os.makedirs(cnn_lstm_report_dir, exist_ok=True)
683
+ with open(os.path.join(cnn_lstm_report_dir, "classification_report.txt"), "w", encoding="utf-8") as f:
684
+ f.write("========== CNN-LSTM Classification Report ==========\n")
685
+ f.write(report_cnn_lstm)
686
+ f.write("\n========== Additional Metrics ==========\n")
687
+ f.write(f"Test Loss: {loss:.4f}\n")
688
+ f.write(f"Test Accuracy: {test_accuracy_cnn_lstm:.4f}\n")
689
+ f.write(f"Precision (Macro): {precision_macro_cnn_lstm:.4f}\n")
690
+ f.write(f"Precision (Weighted): {precision_weighted_cnn_lstm:.4f}\n")
691
+ f.write(f"Recall (Macro): {recall_macro_cnn_lstm:.4f}\n")
692
+ f.write(f"Recall (Weighted): {recall_weighted_cnn_lstm:.4f}\n")
693
+ f.write(f"F1-Score (Macro): {f1_macro_cnn_lstm:.4f}\n")
694
+ f.write(f"F1-Score (Weighted): {f1_weighted_cnn_lstm:.4f}\n")
695
+ f.write("\n========== Confusion Matrix ==========\n")
696
+ f.write(np.array2string(conf_matrix_cnn_lstm))
697
+
698
+ print("\n========== CNN-LSTM Classification Report saved to 'cnn_lstm_emotion_model/classification_report.txt' ==========")
699
+
700
+ # Save the CNN-LSTM model
701
+ model_cnn_lstm.save(os.path.join(cnn_lstm_report_dir, 'cnn_lstm_model.keras'))
702
+ print(f"========== CNN-LSTM Model saved to '{cnn_lstm_report_dir}/cnn_lstm_model.keras' ==========")
703
+
704
+ # ========== SAVE LABEL MAPPING AND VOCABULARY ==========
705
+ # Save label_mapping and vocabulary for the RNN
706
+ with open(os.path.join(rnn_report_dir, "label_mapping.json"), "w", encoding="utf-8") as f:
707
+ json.dump(label_mapping, f, ensure_ascii=False, indent=4)
708
+
709
+ with open(os.path.join(rnn_report_dir, "vocabulary.json"), "w", encoding="utf-8") as f:
710
+ json.dump(data_manager.vocabulary.word2id, f, ensure_ascii=False, indent=4)
711
+
712
+ # Save label_mapping and vocabulary for the CNN-LSTM.
713
+ # Assuming the mapping and vocabulary are identical, saving them once is enough;
714
+ # if they differ, adjust accordingly.
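+ # A minimal sketch (assuming the Keras run shares the same mapping) could mirror the files:
+ # with open(os.path.join(cnn_lstm_report_dir, "label_mapping.json"), "w", encoding="utf-8") as f:
+ #     json.dump(label_mapping_keras, f, ensure_ascii=False, indent=4)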
715
+
716
+ print("========== Label Mapping and Vocabulary saved ==========")
717
+
718
+ # ========== DEMO: PREDICT ONE NEW SENTENCE ==========
719
+
720
+ custom_text = "Tôi rất vui khi sử dụng dịch vụ này!"
721
+
722
+ # RNN (PyTorch)
723
+ emotion_rnn = predict_emotion_rnn(
724
+ model_rnn, custom_text, data_manager, label_mapping, device
725
+ )
726
+ print(f"Predicted Emotion (RNN): {emotion_rnn}")
727
+
728
+ # CNN-LSTM (Keras)
729
+ cnn_lstm_loaded = tf.keras.models.load_model(os.path.join(cnn_lstm_report_dir, 'cnn_lstm_model.keras'))
730
+ emotion_cnn_lstm = predict_emotion_cnn_lstm(
731
+ cnn_lstm_loaded, custom_text, data_manager, label_mapping
732
+ )
733
+ print(f"Predicted Emotion (CNN-LSTM): {emotion_cnn_lstm}")
734
+
735
+ # Check the TF version and GPU availability
736
+ print("TF version:", tf.__version__)
737
+ print("GPU devices:", tf.config.list_physical_devices("GPU"))
738
+ # os.system("nvidia-smi") # uncomment to inspect the GPU
main_lstm.py ADDED
@@ -0,0 +1,289 @@
1
+ # lstm_emotion_classifier.py
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import re
5
+ import emoji
6
+ import json
7
+ import pandas as pd
8
+ import numpy as np
9
+ import tensorflow as tf
10
+ from underthesea import word_tokenize
11
+ from sklearn.model_selection import train_test_split
12
+ from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
13
+ from sklearn.utils import resample
14
+ from tensorflow.keras.preprocessing.text import Tokenizer
15
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
16
+ from tensorflow.keras.models import Sequential
17
+ from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
18
+ from tensorflow.keras.callbacks import EarlyStopping
19
+ import joblib
20
+ import os
21
+ import matplotlib.pyplot as plt
22
+ import seaborn as sns
23
+
24
+ ########################
25
+ # PREPROCESSING
26
+ ########################
27
+
28
+ def replace_emojis(sentence, emoji_mapping):
29
+ processed_sentence = []
30
+ for char in sentence:
31
+ if char in emoji_mapping:
32
+ processed_sentence.append(emoji_mapping[char])
33
+ elif not emoji.is_emoji(char):
34
+ processed_sentence.append(char)
35
+ return ''.join(processed_sentence)
36
+
37
+ def remove_profanity(sentence):
38
+ profane_words = ["loz", "vloz", "vl", "dm", "đm", "clgt", "dmm", "cc", "vc", "đù mé", "vãi"]
39
+ words = sentence.split()
40
+ filtered = [w for w in words if w.lower() not in profane_words]
41
+ return ' '.join(filtered)
42
+
43
+ def remove_special_characters(sentence):
44
+ return re.sub(r"[\^\*@#&$%<>~{}|\\]", "", sentence)
45
+
46
+ def normalize_whitespace(sentence):
47
+ return ' '.join(sentence.split())
48
+
49
+ def remove_repeated_characters(sentence):
50
+ return re.sub(r"(.)\1{2,}", r"\1", sentence)
51
+
52
+ def replace_numbers(sentence):
53
+ return re.sub(r"\d+", "[number]", sentence)
54
+
55
+ def tokenize_underthesea(sentence):
56
+ tokens = word_tokenize(sentence)
57
+ return " ".join(tokens)
58
+
59
+ def preprocess_sentence(sentence, abbreviations, emoji_mapping):
60
+ sentence = sentence.lower()
61
+ sentence = replace_emojis(sentence, emoji_mapping)
62
+ sentence = remove_profanity(sentence)
63
+ sentence = remove_special_characters(sentence)
64
+ sentence = normalize_whitespace(sentence)
65
+ # Expand abbreviations
66
+ words = sentence.split()
67
+ replaced = []
68
+ for w in words:
69
+ if w in abbreviations:
70
+ replaced.append(" ".join(abbreviations[w]))
71
+ else:
72
+ replaced.append(w)
73
+ sentence = " ".join(replaced)
74
+ sentence = remove_repeated_characters(sentence)
75
+ sentence = replace_numbers(sentence)
76
+ # Tokenize Vietnamese text
77
+ sentence = tokenize_underthesea(sentence)
78
+ return sentence
79
+
80
+ emoji_mapping = {
81
+ "😀": "[joy]", "😃": "[joy]", "😄": "[joy]", "😁": "[joy]", "😆": "[joy]", "😅": "[joy]", "😂": "[joy]", "🤣": "[joy]",
82
+ "🙂": "[love]", "🙃": "[love]", "😉": "[love]", "😊": "[love]", "😇": "[love]", "🥰": "[love]", "😍": "[love]",
83
+ "🤩": "[love]", "😘": "[love]", "😗": "[love]", "☺": "[love]", "😚": "[love]", "😙": "[love]",
84
+ "😋": "[satisfaction]", "😛": "[satisfaction]", "😜": "[satisfaction]", "🤪": "[satisfaction]", "😝": "[satisfaction]",
85
+ "🤑": "[satisfaction]",
86
+ "🤐": "[neutral]", "🤨": "[neutral]", "😐": "[neutral]", "😑": "[neutral]", "😶": "[neutral]",
87
+ "😏": "[sarcasm]",
88
+ "😒": "[disappointment]", "🙄": "[disappointment]", "😬": "[disappointment]",
89
+ "😔": "[sadness]", "😪": "[sadness]", "😢": "[sadness]", "😭": "[sadness]", "😥": "[sadness]", "😓": "[sadness]",
90
+ "😩": "[tiredness]", "😫": "[tiredness]", "🥱": "[tiredness]",
91
+ "🤤": "[discomfort]", "🤢": "[discomfort]", "🤮": "[discomfort]", "🤧": "[discomfort]", "🥵": "[discomfort]",
92
+ "🥶": "[discomfort]", "🥴": "[discomfort]", "😵": "[discomfort]", "🤯": "[discomfort]",
93
+ "😕": "[confused]", "😟": "[confused]", "🙁": "[confused]", "☹": "[confused]",
94
+ "😮": "[surprise]", "😯": "[surprise]", "😲": "[surprise]", "😳": "[surprise]", "🥺": "[pleading]",
95
+ "😦": "[fear]", "😧": "[fear]", "😨": "[fear]", "😰": "[fear]", "😱": "[fear]",
96
+ "😖": "[confusion]", "😣": "[confusion]", "😞": "[confusion]",
97
+ "😤": "[anger]", "😡": "[anger]", "😠": "[anger]", "🤬": "[anger]", "😈": "[mischievous]", "👿": "[mischievous]"
98
+ }
99
+
100
+ def load_abbreviations(path):
101
+ with open(path, "r", encoding="utf-8") as f:
102
+ return json.load(f)
103
+
104
+ ###################################
105
+ # MAIN
106
+ ###################################
107
+ if __name__ == "__main__":
108
+ file_path = "train.xlsx"
109
+ abbreviations_path = "abbreviations.json"
110
+ output_path = "processed_lstm.xlsx" # renamed from "processed_phobert.xlsx" to match this script
111
+
112
+ abbreviations = load_abbreviations(abbreviations_path)
113
+
114
+ df = pd.read_excel(file_path)
115
+ if "Sentence" not in df.columns or "Emotion" not in df.columns:
116
+ raise ValueError("Dataset must contain the 'Sentence' and 'Emotion' columns!")
117
+
118
+ # Preprocess
119
+ df["processed_sentence"] = df["Sentence"].apply(
120
+ lambda x: preprocess_sentence(str(x), abbreviations, emoji_mapping)
121
+ )
122
+ # Drop empty rows
123
+ df = df[df["processed_sentence"].str.strip().astype(bool)]
124
+
125
+ print("Before balancing:")
126
+ print(df["Emotion"].value_counts())
127
+
128
+ # =========== BALANCE ALL CLASSES =============
129
+ # Use the largest class count as the oversampling target
130
+ max_count = df["Emotion"].value_counts().max()
131
+
132
+ df_balanced_list = []
133
+ for emo in df["Emotion"].unique():
134
+ df_emo = df[df["Emotion"] == emo]
135
+ if len(df_emo) < max_count:
136
+ # Oversample up to max_count
137
+ df_emo_oversampled = resample(
138
+ df_emo,
139
+ replace=True,
140
+ n_samples=max_count,
141
+ random_state=42
142
+ )
143
+ df_balanced_list.append(df_emo_oversampled)
144
+ else:
145
+ # Already at max_count; keep this class as-is
146
+ df_balanced_list.append(df_emo)
147
+
148
+ df = pd.concat(df_balanced_list, axis=0)
149
+ df = df.sample(frac=1, random_state=42).reset_index(drop=True)
150
+
151
+ print("\nAfter balancing all classes:")
152
+ print(df["Emotion"].value_counts())
153
+
154
+ df.to_excel(output_path, index=False)
155
+
156
+ # Build label2id and id2label in the fixed order below
157
+ custom_id2label = {
158
+ 0: 'Anger',
159
+ 1: 'Disgust',
160
+ 2: 'Enjoyment',
161
+ 3: 'Fear',
162
+ 4: 'Other',
163
+ 5: 'Sadness',
164
+ 6: 'Surprise'
165
+ }
166
+ label2id = {label: idx for idx, label in enumerate(custom_id2label.values())}
167
+ id2label = {v: k for k, v in label2id.items()}
168
+
169
+ df["label_id"] = df["Emotion"].map(label2id)
170
+
171
+ # Train/test split
172
+ train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label_id"])
173
+ print(f"Train size = {len(train_df)}, Test size = {len(test_df)}")
174
+
175
+ # Feature extraction with Tokenizer and padding
176
+ tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
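+ # The vocabulary is capped at 5,000 tokens; rarer words map to the <OOV> token.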
177
+ tokenizer.fit_on_texts(train_df["processed_sentence"])
178
+
179
+ X_train_seq = tokenizer.texts_to_sequences(train_df["processed_sentence"])
180
+ X_test_seq = tokenizer.texts_to_sequences(test_df["processed_sentence"])
181
+
182
+ max_length = 256
183
+ X_train = pad_sequences(X_train_seq, maxlen=max_length, padding='post', truncating='post')
184
+ X_test = pad_sequences(X_test_seq, maxlen=max_length, padding='post', truncating='post')
185
+
186
+ y_train = train_df["label_id"].values
187
+ y_test = test_df["label_id"].values
188
+
189
+ # One-hot encode the labels
190
+ num_classes = len(custom_id2label)
191
+ y_train = tf.keras.utils.to_categorical(y_train, num_classes=num_classes)
192
+ y_test = tf.keras.utils.to_categorical(y_test, num_classes=num_classes)
193
+
194
+ # Build the LSTM model
195
+ model = Sequential([
196
+ Embedding(input_dim=5000, output_dim=128, input_length=max_length),
197
+ LSTM(128, dropout=0.2, recurrent_dropout=0.2),
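+ # Note: recurrent_dropout disables the fused cuDNN kernel, so GPU training runs slower.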
198
+ Dense(64, activation='relu'),
199
+ Dropout(0.5),
200
+ Dense(num_classes, activation='softmax')
201
+ ])
202
+
203
+ model.compile(loss='categorical_crossentropy',
204
+ optimizer='adam',
205
+ metrics=['accuracy'])
206
+
207
+ model.summary()
208
+
209
+ # Train the model
210
+ early_stop = EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)
211
+
212
+ history = model.fit(
213
+ X_train, y_train,
214
+ epochs=10,
215
+ batch_size=32,
216
+ validation_data=(X_test, y_test),
217
+ callbacks=[early_stop],
218
+ verbose=1
219
+ )
220
+
221
+ # Evaluate the model
222
+ print("\n========== Evaluate on Test set ==========")
223
+ loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
224
+ print(f"Test Accuracy: {accuracy:.4f}")
225
+
226
+ # Predict and print the classification report
227
+ y_pred_probs = model.predict(X_test)
228
+ y_pred = np.argmax(y_pred_probs, axis=1)
229
+ y_true = np.argmax(y_test, axis=1)
230
+
231
+ # Print the classification report
232
+ print("\nClassification Report:")
233
+ report = classification_report(y_true, y_pred, target_names=custom_id2label.values())
234
+ print(report)
235
+
236
+ # Compute and print the confusion matrix
237
+ conf_matrix = confusion_matrix(y_true, y_pred)
238
+ print("\nConfusion Matrix:")
239
+ print(conf_matrix)
240
+
241
+ # Plot the confusion matrix
242
+ plt.figure(figsize=(10, 8))
243
+ sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
244
+ xticklabels=custom_id2label.values(),
245
+ yticklabels=custom_id2label.values())
246
+ plt.ylabel('Actual')
247
+ plt.xlabel('Predicted')
248
+ plt.title('Confusion Matrix')
249
+ plt.tight_layout()
250
+ os.makedirs("lstm_emotion_model", exist_ok=True)  # the directory must exist before saving into it
+ plt.savefig(os.path.join("lstm_emotion_model", "confusion_matrix.png"))
251
+ plt.close()
252
+ print("\nConfusion Matrix plot saved to 'lstm_emotion_model/confusion_matrix.png'")
253
+
254
+ # Save the classification report to file
255
+ report_path = os.path.join("lstm_emotion_model", "classification_report.txt")
256
+ with open(report_path, "w", encoding="utf-8") as f:
257
+ f.write("========== Classification Report ==========\n")
258
+ f.write(report)
259
+ f.write("\n========== Confusion Matrix ==========\n")
260
+ f.write(np.array2string(conf_matrix))
261
+
262
+ print(f"\nClassification Report saved to '{report_path}'")
263
+
264
+ # Save the model and tokenizer
265
+ model_output_dir = "./lstm_emotion_model"
266
+ os.makedirs(model_output_dir, exist_ok=True)
267
+ model.save(os.path.join(model_output_dir, "lstm_emotion_model.h5"))
268
+ joblib.dump(tokenizer, os.path.join(model_output_dir, "tokenizer.joblib"))
269
+ with open(os.path.join(model_output_dir, "id2label.json"), "w", encoding="utf-8") as f:
270
+ json.dump(id2label, f, ensure_ascii=False, indent=4)
271
+
272
+ print("\n========== Model and Tokenizer saved ==========")
273
+
274
+ # Predict one sentence (example)
275
+ def predict_text(text):
276
+ text_proc = preprocess_sentence(text, abbreviations, emoji_mapping)
277
+ seq = tokenizer.texts_to_sequences([text_proc])
278
+ padded = pad_sequences(seq, maxlen=max_length, padding='post', truncating='post')
279
+ pred_prob = model.predict(padded)
280
+ pred_id = np.argmax(pred_prob, axis=1)[0]
281
+ label = custom_id2label[pred_id]
282
+ return label
283
+
284
+ custom_text = "Tôi rất vui khi sử dụng dịch vụ này!"
285
+ emotion_pred = predict_text(custom_text)
286
+ print("\nExample sentence:", custom_text)
287
+ print("Predicted emotion:", emotion_pred)
288
+
289
+ print("\nFinished the LSTM demo with balanced data & more epochs!")
main_phobert.py ADDED
@@ -0,0 +1,349 @@
1
+ # phobert_emotion_balanced.py
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import re
5
+ import emoji
6
+ import json
7
+ import pandas as pd
8
+ import torch
9
+ import numpy as np
10
+ import os
11
+ import matplotlib.pyplot as plt
12
+ import seaborn as sns
13
+
14
+ from transformers import (
15
+ AutoTokenizer,
16
+ AutoConfig,
17
+ AutoModelForSequenceClassification,
18
+ Trainer,
19
+ TrainingArguments
20
+ )
21
+
22
+ from sklearn.model_selection import train_test_split
23
+ from sklearn.utils import resample
24
+ from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report, confusion_matrix
25
+
26
+ ########################
27
+ # PREPROCESSING
28
+ ########################
29
+
30
+ def replace_emojis(sentence, emoji_mapping):
31
+ processed_sentence = []
32
+ for char in sentence:
33
+ if char in emoji_mapping:
34
+ processed_sentence.append(emoji_mapping[char])
35
+ elif not emoji.is_emoji(char):
36
+ processed_sentence.append(char)
37
+ return ''.join(processed_sentence)
38
+
39
+ def remove_profanity(sentence):
40
+ profane_words = ["loz", "vloz", "vl", "dm", "đm", "clgt", "dmm", "cc", "vc", "đù mé", "vãi"]
41
+ words = sentence.split()
42
+ filtered = [w for w in words if w.lower() not in profane_words]
43
+ return ' '.join(filtered)
44
+
45
+ def remove_special_characters(sentence):
46
+ return re.sub(r"[\^\*@#&$%<>~{}|\\]", "", sentence)
47
+
48
+ def normalize_whitespace(sentence):
49
+ return ' '.join(sentence.split())
50
+
51
+ def remove_repeated_characters(sentence):
52
+ return re.sub(r"(.)\1{2,}", r"\1", sentence)
53
+
54
+ def replace_numbers(sentence):
55
+ return re.sub(r"\d+", "[number]", sentence)
56
+
57
+ def tokenize_underthesea(sentence):
58
+ from underthesea import word_tokenize
59
+ tokens = word_tokenize(sentence)
60
+ return " ".join(tokens)
61
+
62
+ def preprocess_sentence(sentence, abbreviations, emoji_mapping):
63
+ sentence = sentence.lower()
64
+ sentence = replace_emojis(sentence, emoji_mapping)
65
+ sentence = remove_profanity(sentence)
66
+ sentence = remove_special_characters(sentence)
67
+ sentence = normalize_whitespace(sentence)
68
+ # Expand abbreviations
69
+ words = sentence.split()
70
+ replaced = []
71
+ for w in words:
72
+ if w in abbreviations:
73
+ replaced.append(" ".join(abbreviations[w]))
74
+ else:
75
+ replaced.append(w)
76
+ sentence = " ".join(replaced)
77
+ sentence = remove_repeated_characters(sentence)
78
+ sentence = replace_numbers(sentence)
79
+ # Tokenize
80
+ sentence = tokenize_underthesea(sentence)
81
+ return sentence
82
+
83
+ emoji_mapping = {
84
+ "😀": "[joy]", "😃": "[joy]", "😄": "[joy]", "😁": "[joy]", "😆": "[joy]", "😅": "[joy]", "😂": "[joy]", "🤣": "[joy]",
85
+ "🙂": "[love]", "🙃": "[love]", "😉": "[love]", "😊": "[love]", "😇": "[love]", "🥰": "[love]", "😍": "[love]",
86
+ "🤩": "[love]", "😘": "[love]", "😗": "[love]", "☺": "[love]", "😚": "[love]", "😙": "[love]",
87
+ "😋": "[satisfaction]", "😛": "[satisfaction]", "😜": "[satisfaction]", "🤪": "[satisfaction]", "😝": "[satisfaction]",
88
+ "🤑": "[satisfaction]",
89
+ "🤐": "[neutral]", "🤨": "[neutral]", "😐": "[neutral]", "😑": "[neutral]", "😶": "[neutral]",
90
+ "😏": "[sarcasm]",
91
+ "😒": "[disappointment]", "🙄": "[disappointment]", "😬": "[disappointment]",
92
+ "😔": "[sadness]", "😪": "[sadness]", "😢": "[sadness]", "😭": "[sadness]", "😥": "[sadness]", "😓": "[sadness]",
93
+ "😩": "[tiredness]", "😫": "[tiredness]", "🥱": "[tiredness]",
94
+ "🤤": "[discomfort]", "🤢": "[discomfort]", "🤮": "[discomfort]", "🤧": "[discomfort]", "🥵": "[discomfort]",
95
+ "🥶": "[discomfort]", "🥴": "[discomfort]", "😵": "[discomfort]", "🤯": "[discomfort]",
96
+ "😕": "[confused]", "😟": "[confused]", "🙁": "[confused]", "☹": "[confused]",
97
+ "😮": "[surprise]", "😯": "[surprise]", "😲": "[surprise]", "😳": "[surprise]", "🥺": "[pleading]",
98
+ "😦": "[fear]", "😧": "[fear]", "😨": "[fear]", "😰": "[fear]", "😱": "[fear]",
99
+ "😖": "[confusion]", "😣": "[confusion]", "😞": "[confusion]",
100
+ "😤": "[anger]", "😡": "[anger]", "😠": "[anger]", "🤬": "[anger]", "😈": "[mischievous]", "👿": "[mischievous]"
101
+ }
102
+
103
+ def load_abbreviations(path):
104
+ with open(path, "r", encoding="utf-8") as f:
105
+ return json.load(f)
106
+
107
+ # Dataset wrapper for the Hugging Face Trainer
108
+ class PhoBertEmotionDataset(torch.utils.data.Dataset):
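+ # Each item returns the tokenizer fields (input_ids, attention_mask) plus a "labels" tensor, as the Trainer expects.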
109
+ def __init__(self, encodings, labels):
110
+ self.encodings = encodings
111
+ self.labels = labels
112
+
113
+ def __len__(self):
114
+ return len(self.labels)
115
+
116
+ def __getitem__(self, idx):
117
+ item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
118
+ item["labels"] = torch.tensor(self.labels[idx])
119
+ return item
120
+
121
+ ###################################
122
+ # MAIN
123
+ ###################################
124
+ if __name__ == "__main__":
125
+ file_path = "train.xlsx"
126
+ abbreviations_path = "abbreviations.json"
127
+ output_path = "processed_phobert.xlsx"
128
+
129
+ abbreviations = load_abbreviations(abbreviations_path)
130
+
131
+ df = pd.read_excel(file_path)
132
+ if "Sentence" not in df.columns or "Emotion" not in df.columns:
133
+ raise ValueError("Dataset must contain the 'Sentence' and 'Emotion' columns!")
134
+
135
+ # Preprocess
136
+ df["processed_sentence"] = df["Sentence"].apply(
137
+ lambda x: preprocess_sentence(str(x), abbreviations, emoji_mapping)
138
+ )
139
+ # Drop empty rows
140
+ df = df[df["processed_sentence"].str.strip().astype(bool)]
141
+
142
+ print("Before balancing:")
143
+ print(df["Emotion"].value_counts())
144
+
145
+ # =========== BALANCE ALL CLASSES =============
146
+ # Use the largest class count as the oversampling target
147
+ max_count = df["Emotion"].value_counts().max()
148
+
149
+ df_balanced_list = []
150
+ for emo in df["Emotion"].unique():
151
+ df_emo = df[df["Emotion"] == emo]
152
+ if len(df_emo) < max_count:
153
+ # Oversample up to max_count
154
+ df_emo_oversampled = resample(
155
+ df_emo,
156
+ replace=True,
157
+ n_samples=max_count,
158
+ random_state=42
159
+ )
160
+ df_balanced_list.append(df_emo_oversampled)
161
+ else:
162
+ # Already at max_count; keep this class as-is
163
+ df_balanced_list.append(df_emo)
164
+
165
+ df = pd.concat(df_balanced_list, axis=0)
166
+ df = df.sample(frac=1, random_state=42).reset_index(drop=True)
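+ # Caveat: balancing before the split duplicates rows across train and test, so test metrics may be optimistic.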
167
+
168
+ print("\nAfter balancing all classes:")
169
+ print(df["Emotion"].value_counts())
170
+
171
+ df.to_excel(output_path, index=False)
172
+
173
+ # Build label2id
174
+ unique_labels = sorted(df["Emotion"].unique()) # sort for a stable label order
175
+ label2id = {label: i for i, label in enumerate(unique_labels)}
176
+ id2label = {v: k for k, v in label2id.items()}
177
+
178
+ df["label_id"] = df["Emotion"].map(label2id)
179
+
180
+ # Train/test split
181
+ train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label_id"])
182
+
183
+ print(f"Train size = {len(train_df)}, Test size = {len(test_df)}")
184
+
185
+ # Load tokenizer
186
+ checkpoint = "vinai/phobert-base"
187
+ tokenizer = AutoTokenizer.from_pretrained(checkpoint)
188
+
189
+ def tokenize_texts(texts):
190
+ return tokenizer(
191
+ texts,
192
+ padding=True,
193
+ truncation=True,
194
+ max_length=256
195
+ )
196
+
197
+ train_texts = train_df["processed_sentence"].tolist()
198
+ train_labels = train_df["label_id"].tolist()
199
+ test_texts = test_df["processed_sentence"].tolist()
200
+ test_labels = test_df["label_id"].tolist()
201
+
202
+ train_encodings = tokenize_texts(train_texts)
203
+ test_encodings = tokenize_texts(test_texts)
204
+
205
+ train_dataset = PhoBertEmotionDataset(train_encodings, train_labels)
206
+ test_dataset = PhoBertEmotionDataset(test_encodings, test_labels)
207
+
208
+ # Load model
209
+ config = AutoConfig.from_pretrained(checkpoint)
210
+ config.num_labels = len(label2id)
211
+ model = AutoModelForSequenceClassification.from_pretrained(
212
+ checkpoint,
213
+ config=config
214
+ )
215
+
216
+ # Train for 10 epochs at LR = 2e-5
217
+ training_args = TrainingArguments(
218
+ output_dir="./phobert_results_v2",
219
+ overwrite_output_dir=True,
220
+ do_train=True,
221
+ do_eval=True,
222
+ evaluation_strategy="epoch",
223
+ save_strategy="epoch",
224
+ num_train_epochs=10, # train longer
225
+ per_device_train_batch_size=16,
226
+ per_device_eval_batch_size=16,
227
+ learning_rate=2e-5,
228
+ logging_dir="./logs",
229
+ logging_steps=50,
230
+ load_best_model_at_end=True,
231
+ metric_for_best_model="f1_weighted", # metric used to pick the best checkpoint
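+ # The Trainer prefixes logged metrics with "eval_", so this resolves to eval_f1_weighted.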
232
+ greater_is_better=True,
233
+ seed=42
234
+ )
235
+
236
+ # Define compute_metrics with additional metrics
237
+ def compute_metrics(eval_pred):
238
+ logits, labels = eval_pred
239
+ preds = np.argmax(logits, axis=-1)
240
+ precision_weighted = precision_score(labels, preds, average='weighted', zero_division=0)
241
+ recall_weighted = recall_score(labels, preds, average='weighted', zero_division=0)
242
+ f1_weighted = f1_score(labels, preds, average='weighted', zero_division=0)
243
+ precision_macro = precision_score(labels, preds, average='macro', zero_division=0)
244
+ recall_macro = recall_score(labels, preds, average='macro', zero_division=0)
245
+ f1_macro = f1_score(labels, preds, average='macro', zero_division=0)
246
+ accuracy = accuracy_score(labels, preds)
247
+ return {
248
+ "accuracy": accuracy,
249
+ "precision_weighted": precision_weighted,
250
+ "recall_weighted": recall_weighted,
251
+ "f1_weighted": f1_weighted,
252
+ "precision_macro": precision_macro,
253
+ "recall_macro": recall_macro,
254
+ "f1_macro": f1_macro
255
+ }
256
+
257
+ trainer = Trainer(
258
+ model=model,
259
+ args=training_args,
260
+ train_dataset=train_dataset,
261
+ eval_dataset=test_dataset,
262
+ tokenizer=tokenizer,
263
+ compute_metrics=compute_metrics
264
+ )
265
+
266
+ print("\n========== Training PhoBERT (balanced, more epochs) ==========")
267
+ trainer.train()
268
+
269
+ print("\n========== Evaluate on Test set ==========")
270
+ results = trainer.evaluate(test_dataset)
271
+ print("Test results:", results)
272
+
273
+ # Extract additional metrics
274
+ print("\n========== Additional Metrics ==========")
275
+ print(f"Test Loss: {results.get('eval_loss'):.4f}")
276
+ print(f"Test Accuracy: {results.get('eval_accuracy'):.4f}")
277
+ print(f"Precision (Macro): {results.get('eval_precision_macro'):.4f}")
278
+ print(f"Precision (Weighted): {results.get('eval_precision_weighted'):.4f}")
279
+ print(f"Recall (Macro): {results.get('eval_recall_macro'):.4f}")
280
+ print(f"Recall (Weighted): {results.get('eval_recall_weighted'):.4f}")
281
+ print(f"F1-Score (Macro): {results.get('eval_f1_macro'):.4f}")
282
+ print(f"F1-Score (Weighted): {results.get('eval_f1_weighted'):.4f}")
283
+
284
+ # Generate detailed classification report
285
+ print("\n========== Detailed Classification Report ==========")
286
+ predictions, labels, _ = trainer.predict(test_dataset)
287
+ preds = np.argmax(predictions, axis=1)
288
+ report = classification_report(labels, preds, target_names=unique_labels, digits=4)
289
+ print(report)
290
+
291
+ # Compute the confusion matrix
292
+ conf_matrix = confusion_matrix(labels, preds)
293
+ print("\nConfusion Matrix:")
294
+ print(conf_matrix)
295
+
296
+ # Plot the confusion matrix
297
+ plt.figure(figsize=(10, 8))
298
+ sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
299
+ xticklabels=unique_labels,
300
+ yticklabels=unique_labels)
301
+ plt.ylabel('Actual')
302
+ plt.xlabel('Predicted')
303
+ plt.title('Confusion Matrix')
304
+ plt.tight_layout()
305
+ confusion_matrix_path = os.path.join("phobert_emotion_model", "confusion_matrix.png")
306
+ os.makedirs("phobert_emotion_model", exist_ok=True)
307
+ plt.savefig(confusion_matrix_path)
308
+ plt.close()
309
+ print(f"\nConfusion Matrix plot saved to '{confusion_matrix_path}'")
310
+
311
+ # Save the classification report to file
312
+ report_path = os.path.join("phobert_emotion_model", "classification_report.txt")
313
+ with open(report_path, "w", encoding="utf-8") as f:
314
+ f.write("========== Classification Report ==========\n")
315
+ f.write(report)
316
+ f.write("\n========== Confusion Matrix ==========\n")
317
+ f.write(np.array2string(conf_matrix))
318
+
319
+ print(f"\nClassification Report saved to '{report_path}'")
320
+
321
+ # Save the model and tokenizer
322
+ model_output_dir = "./phobert_emotion_model"
323
+ os.makedirs(model_output_dir, exist_ok=True)
324
+ model.save_pretrained(os.path.join(model_output_dir, "phobert_emotion_model"))
325
+ tokenizer.save_pretrained(os.path.join(model_output_dir, "phobert_emotion_model"))
326
+ with open(os.path.join(model_output_dir, "id2label.json"), "w", encoding="utf-8") as f:
327
+ json.dump(id2label, f, ensure_ascii=False, indent=4)
328
+
329
+ print("\n========== Model and Tokenizer saved ==========")
330
+
331
+ # Predict one sentence (example)
332
+ device = "cuda" if torch.cuda.is_available() else "cpu"
333
+ model.to(device)
334
+
335
+ def predict_text(text):
336
+ text_proc = preprocess_sentence(text, abbreviations, emoji_mapping)
337
+ enc = tokenizer(text_proc, padding=True, truncation=True, max_length=256, return_tensors="pt")
338
+ enc = {k: v.to(device) for k, v in enc.items()}
339
+ with torch.no_grad():
340
+ out = model(**enc)
341
+ pred_id = out.logits.argmax(dim=-1).item()
342
+ return id2label[pred_id]
343
+
344
+ custom_text = "Tôi rất vui khi sử dụng dịch vụ này!"
345
+ emotion_pred = predict_text(custom_text)
346
+ print("\nExample sentence:", custom_text)
347
+ print("Predicted emotion:", emotion_pred)
348
+
349
+ print("\nFinished the PhoBERT demo with balanced data & more epochs!")
main_svm.py ADDED
@@ -0,0 +1,261 @@
1
+ # svm_emotion_classifier.py
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import re
5
+ import emoji
6
+ import json
7
+ import pandas as pd
8
+ import numpy as np
9
+ import torch # not strictly needed for SVM, but kept in case it is useful
10
+ from underthesea import word_tokenize
11
+ from sklearn.feature_extraction.text import TfidfVectorizer
12
+ from sklearn.svm import SVC
13
+ from sklearn.model_selection import train_test_split
14
+ from sklearn.metrics import (
15
+ accuracy_score,
16
+ classification_report,
17
+ precision_score,
18
+ recall_score,
19
+ f1_score,
20
+ confusion_matrix
21
+ )
22
+ from sklearn.utils import resample
23
+ import joblib
24
+ import os
25
+
26
+ ########################
27
+ # PREPROCESSING
28
+ ########################
29
+
30
+ def replace_emojis(sentence, emoji_mapping):
31
+ processed_sentence = []
32
+ for char in sentence:
33
+ if char in emoji_mapping:
34
+ processed_sentence.append(emoji_mapping[char])
35
+ elif not emoji.is_emoji(char):
36
+ processed_sentence.append(char)
37
+ return ''.join(processed_sentence)
38
+
39
+ def remove_profanity(sentence):
40
+ profane_words = ["loz", "vloz", "vl", "dm", "đm", "clgt", "dmm", "cc", "vc", "đù mé", "vãi"]
41
+ words = sentence.split()
42
+ filtered = [w for w in words if w.lower() not in profane_words]
43
+ return ' '.join(filtered)
44
+
45
+ def remove_special_characters(sentence):
46
+ return re.sub(r"[\^\*@#&$%<>~{}|\\]", "", sentence)
47
+
48
+ def normalize_whitespace(sentence):
49
+ return ' '.join(sentence.split())
50
+
51
+ def remove_repeated_characters(sentence):
52
+ return re.sub(r"(.)\1{2,}", r"\1", sentence)
53
+
54
+ def replace_numbers(sentence):
55
+ return re.sub(r"\d+", "[number]", sentence)
56
+
57
+ def tokenize_underthesea(sentence):
58
+ tokens = word_tokenize(sentence)
59
+ return " ".join(tokens)
60
+
61
+ def preprocess_sentence(sentence, abbreviations, emoji_mapping):
62
+ sentence = sentence.lower()
63
+ sentence = replace_emojis(sentence, emoji_mapping)
64
+ sentence = remove_profanity(sentence)
65
+ sentence = remove_special_characters(sentence)
66
+ sentence = normalize_whitespace(sentence)
67
+ # Expand abbreviations
68
+ words = sentence.split()
69
+ replaced = []
70
+ for w in words:
71
+ if w in abbreviations:
72
+ replaced.append(" ".join(abbreviations[w]))
73
+ else:
74
+ replaced.append(w)
75
+ sentence = " ".join(replaced)
76
+ sentence = remove_repeated_characters(sentence)
77
+ sentence = replace_numbers(sentence)
78
+ # Tokenize Vietnamese text
79
+ sentence = tokenize_underthesea(sentence)
80
+ return sentence
81
+
82
+ emoji_mapping = {
83
+ "😀": "[joy]", "😃": "[joy]", "😄": "[joy]", "😁": "[joy]", "😆": "[joy]", "😅": "[joy]", "😂": "[joy]", "🤣": "[joy]",
84
+ "🙂": "[love]", "🙃": "[love]", "😉": "[love]", "😊": "[love]", "😇": "[love]", "🥰": "[love]", "😍": "[love]",
85
+ "🤩": "[love]", "😘": "[love]", "😗": "[love]", "☺": "[love]", "😚": "[love]", "😙": "[love]",
86
+ "😋": "[satisfaction]", "😛": "[satisfaction]", "😜": "[satisfaction]", "🤪": "[satisfaction]", "😝": "[satisfaction]",
87
+ "🤑": "[satisfaction]",
88
+ "🤐": "[neutral]", "🤨": "[neutral]", "😐": "[neutral]", "😑": "[neutral]", "😶": "[neutral]",
89
+ "😏": "[sarcasm]",
90
+ "😒": "[disappointment]", "🙄": "[disappointment]", "😬": "[disappointment]",
91
+ "😔": "[sadness]", "😪": "[sadness]", "😢": "[sadness]", "😭": "[sadness]", "😥": "[sadness]", "😓": "[sadness]",
92
+ "😩": "[tiredness]", "😫": "[tiredness]", "🥱": "[tiredness]",
93
+ "🤤": "[discomfort]", "🤢": "[discomfort]", "🤮": "[discomfort]", "🤧": "[discomfort]", "🥵": "[discomfort]",
94
+ "🥶": "[discomfort]", "🥴": "[discomfort]", "😵": "[discomfort]", "🤯": "[discomfort]",
95
+ "😕": "[confused]", "😟": "[confused]", "🙁": "[confused]", "☹": "[confused]",
96
+ "😮": "[surprise]", "😯": "[surprise]", "😲": "[surprise]", "😳": "[surprise]", "🥺": "[pleading]",
97
+ "😦": "[fear]", "😧": "[fear]", "😨": "[fear]", "😰": "[fear]", "😱": "[fear]",
98
+ "😖": "[confusion]", "😣": "[confusion]", "😞": "[confusion]",
99
+ "😤": "[anger]", "😡": "[anger]", "😠": "[anger]", "🤬": "[anger]", "😈": "[mischievous]", "👿": "[mischievous]"
100
+ }
101
+
102
+ def load_abbreviations(path):
103
+ with open(path, "r", encoding="utf-8") as f:
104
+ return json.load(f)
105
+
106
+ ###################################
107
+ # MAIN
108
+ ###################################
109
+ if __name__ == "__main__":
110
+ file_path = "train.xlsx"
111
+ abbreviations_path = "abbreviations.json"
112
+ output_path = "processed_svm.xlsx" # Changed output filename to reflect SVM
113
+
114
+ abbreviations = load_abbreviations(abbreviations_path)
115
+
116
+ df = pd.read_excel(file_path)
117
+ if "Sentence" not in df.columns or "Emotion" not in df.columns:
118
+ raise ValueError("Dataset must contain the 'Sentence' and 'Emotion' columns!")
119
+
120
+ # Preprocess
121
+ df["processed_sentence"] = df["Sentence"].apply(
122
+ lambda x: preprocess_sentence(str(x), abbreviations, emoji_mapping)
123
+ )
124
+ # Drop empty rows
125
+ df = df[df["processed_sentence"].str.strip().astype(bool)]
126
+
127
+ print("Before balancing:")
128
+ print(df["Emotion"].value_counts())
129
+
130
+ # =========== BALANCE ALL CLASSES =============
131
+ # Use the largest class count as the oversampling target
132
+ max_count = df["Emotion"].value_counts().max()
133
+
134
+ df_balanced_list = []
135
+ for emo in df["Emotion"].unique():
136
+ df_emo = df[df["Emotion"] == emo]
137
+ if len(df_emo) < max_count:
138
+ # Oversample up to max_count
139
+ df_emo_oversampled = resample(
140
+ df_emo,
141
+ replace=True,
142
+ n_samples=max_count,
143
+ random_state=42
144
+ )
145
+ df_balanced_list.append(df_emo_oversampled)
146
+ else:
147
+ # Already at max_count; keep this class as-is
148
+ df_balanced_list.append(df_emo)
149
+
150
+ df = pd.concat(df_balanced_list, axis=0)
151
+ df = df.sample(frac=1, random_state=42).reset_index(drop=True)
152
+
153
+ print("\nAfter balancing all classes:")
154
+ print(df["Emotion"].value_counts())
155
+
156
+ df.to_excel(output_path, index=False)
157
+
158
+ # Build label2id and id2label in the fixed order below
159
+ custom_id2label = {
160
+ 0: 'Anger',
161
+ 1: 'Disgust',
162
+ 2: 'Enjoyment',
163
+ 3: 'Fear',
164
+ 4: 'Other',
165
+ 5: 'Sadness',
166
+ 6: 'Surprise'
167
+ }
168
+ label2id = {label: idx for idx, label in custom_id2label.items()}
169
+ id2label = {v: k for k, v in label2id.items()}
170
+
171
+ df["label_id"] = df["Emotion"].map(label2id)
172
+ if df["label_id"].isnull().any():
173
+ missing = df[df["label_id"].isnull()]["Emotion"].unique()
174
+ raise ValueError(f"The following emotion labels are missing from label2id: {missing}")
175
+
176
+ # Train/test split
177
+ train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label_id"])
178
+
179
+ print(f"Train size = {len(train_df)}, Test size = {len(test_df)}")
180
+
181
+ # Feature extraction with TF-IDF
182
+ vectorizer = TfidfVectorizer(max_features=5000)
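+ # Keep the 5,000 most frequent terms across the corpus as TF-IDF features.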
183
+ X_train = vectorizer.fit_transform(train_df["processed_sentence"])
184
+ X_test = vectorizer.transform(test_df["processed_sentence"])
185
+ y_train = train_df["label_id"].values
186
+ y_test = test_df["label_id"].values
187
+
188
+ # Train the SVM model
189
+ svm_classifier = SVC(kernel='linear', probability=True, random_state=42)
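+ # probability=True enables predict_proba via Platt scaling, at extra training cost.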
190
+ print("\n========== Training SVM ==========")
191
+ svm_classifier.fit(X_train, y_train)
192
+
193
+ # Evaluate the model
194
+ print("\n========== Evaluate on Test set ==========")
195
+ y_pred = svm_classifier.predict(X_test)
196
+
197
+ # Compute the metrics
198
+ accuracy = accuracy_score(y_test, y_pred)
199
+ precision_macro = precision_score(y_test, y_pred, average='macro', zero_division=0)
200
+ precision_weighted = precision_score(y_test, y_pred, average='weighted', zero_division=0)
201
+ recall_macro = recall_score(y_test, y_pred, average='macro', zero_division=0)
202
+ recall_weighted = recall_score(y_test, y_pred, average='weighted', zero_division=0)
203
+ f1_macro = f1_score(y_test, y_pred, average='macro', zero_division=0)
204
+ f1_weighted = f1_score(y_test, y_pred, average='weighted', zero_division=0)
205
+ conf_matrix = confusion_matrix(y_test, y_pred)
206
+
207
+ # Print the metrics
208
+ print(f"Test Accuracy: {accuracy:.4f}")
209
+ print(f"Precision (Macro): {precision_macro:.4f}")
210
+ print(f"Precision (Weighted): {precision_weighted:.4f}")
211
+ print(f"Recall (Macro): {recall_macro:.4f}")
212
+ print(f"Recall (Weighted): {recall_weighted:.4f}")
213
+ print(f"F1-Score (Macro): {f1_macro:.4f}")
214
+ print(f"F1-Score (Weighted): {f1_weighted:.4f}")
215
+
216
+ print("\n========== Classification Report ==========")
217
+ report = classification_report(y_test, y_pred, target_names=custom_id2label.values(), digits=4)
218
+ print(report)
219
+
220
+ # Save the report to file
221
+ report_path = os.path.join("svm_emotion_model", "classification_report.txt")
222
+ os.makedirs(os.path.dirname(report_path), exist_ok=True)
223
+ with open(report_path, "w", encoding="utf-8") as f:
224
+ f.write("========== Classification Report ==========\n")
225
+ f.write(report)
226
+ f.write("\n========== Additional Metrics ==========\n")
227
+ f.write(f"Accuracy: {accuracy:.4f}\n")
228
+ f.write(f"Precision (Macro): {precision_macro:.4f}\n")
229
+ f.write(f"Precision (Weighted): {precision_weighted:.4f}\n")
230
+ f.write(f"Recall (Macro): {recall_macro:.4f}\n")
231
+ f.write(f"Recall (Weighted): {recall_weighted:.4f}\n")
232
+ f.write(f"F1-Score (Macro): {f1_macro:.4f}\n")
233
+ f.write(f"F1-Score (Weighted): {f1_weighted:.4f}\n")
234
+ f.write("\n========== Confusion Matrix ==========\n")
235
+ f.write(np.array2string(conf_matrix))
236
+
237
+ print("\n========== Classification Report saved to 'svm_emotion_model/classification_report.txt' ==========")
238
+
239
+ # Save the model and supporting artifacts
240
+ model_output_dir = "./svm_emotion_model"
241
+ os.makedirs(model_output_dir, exist_ok=True)
242
+ joblib.dump(svm_classifier, os.path.join(model_output_dir, "svm_classifier.joblib"))
243
+ joblib.dump(vectorizer, os.path.join(model_output_dir, "tfidf_vectorizer.joblib"))
244
+ with open(os.path.join(model_output_dir, "id2label.json"), "w", encoding="utf-8") as f:
+ json.dump(id2label, f, ensure_ascii=False, indent=4)  # joblib.dump would write a pickle despite the .json name
245
+
246
+ print("\n========== Model and Vectorizer saved ==========")
247
+
248
+ # Predict one sentence (example)
249
+ def predict_text(text):
250
+ text_proc = preprocess_sentence(text, abbreviations, emoji_mapping)
251
+ X = vectorizer.transform([text_proc])
252
+ pred_id = svm_classifier.predict(X)[0]
253
+ label = custom_id2label[pred_id]
254
+ return label
255
+
256
+ custom_text = "Tôi rất vui khi sử dụng dịch vụ này!"
257
+ emotion_pred = predict_text(custom_text)
258
+ print("\nExample sentence:", custom_text)
259
+ print("Predicted emotion:", emotion_pred)
260
+
261
+ print("\nFinished the SVM demo with balanced data & a fuller set of metrics!")
main_v1.py ADDED
@@ -0,0 +1,494 @@
1
+ # thesis.py
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import pandas as pd
5
+ import emoji
6
+ import json
7
+ import re
8
+ from underthesea import word_tokenize
9
+ from tqdm import tqdm
10
+ import torch
11
+ from torchtext.vocab import Vectors
12
+ from sklearn.model_selection import train_test_split
13
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
14
+ from torch.utils.data import DataLoader, TensorDataset
15
+ import torch.nn as nn
16
+ import torch.optim as optim
17
+ import numpy as np
18
+ import tensorflow as tf
19
+
20
+ # ========== PREPROCESSING FUNCTIONS ==========
21
+
22
+ def preprocess_sentence(sentence, abbreviations, emoji_mapping):
23
+ """
24
+ Preprocess one sentence: lowercase, replace emojis, strip profanity and
25
+ special characters, normalize whitespace, etc.
26
+ """
27
+ sentence = sentence.lower()
28
+ sentence = replace_emojis(sentence, emoji_mapping)
29
+ sentence = remove_profanity(sentence)
30
+ sentence = remove_special_characters(sentence)
31
+ sentence = normalize_whitespace(sentence)
32
+ sentence = replace_abbreviations(sentence, abbreviations)
33
+ sentence = remove_repeated_characters(sentence)
34
+ sentence = replace_numbers(sentence)
35
+ sentence = tokenize_sentence(sentence)
36
+ return sentence
37
+
38
+ def replace_emojis(sentence, emoji_mapping):
39
+ processed_sentence = []
40
+ for char in sentence:
41
+ if char in emoji_mapping:
42
+ processed_sentence.append(emoji_mapping[char])
43
+ elif not emoji.is_emoji(char):
44
+ processed_sentence.append(char)
45
+ return ''.join(processed_sentence)
46
+
47
+ def remove_profanity(sentence):
48
+ profane_words = ["loz", "vloz", "vl", "dm", "đm", "clgt", "dmm", "cc", "vc", "đù mé", "vãi"]
49
+ words = sentence.split()
50
+ filtered_words = [word for word in words if word.lower() not in profane_words]
51
+ return ' '.join(filtered_words)
52
+
53
+ def remove_special_characters(sentence):
54
+ return re.sub(r"[\^\*@#&$%<>~{}|\\]", "", sentence)
55
+
56
+ def normalize_whitespace(sentence):
57
+ return ' '.join(sentence.split())
58
+
59
+ def replace_abbreviations(sentence, abbreviations):
60
+ words = sentence.split()
61
+ replaced_words = [
62
+ " ".join(abbreviations[word]) if word in abbreviations else word
63
+ for word in words
64
+ ]
65
+ return ' '.join(replaced_words)
66
+
67
+ def remove_repeated_characters(sentence):
68
+ return re.sub(r"(.)\1{2,}", r"\1", sentence)
69
+
70
+ def replace_numbers(sentence):
71
+ return re.sub(r"\d+", "[number]", sentence)
72
+
73
+ def tokenize_sentence(sentence):
74
+ return ' '.join(word_tokenize(sentence))
75
+
76
+
77
+ # ========== DATA MANAGER CLASS ==========
78
+
79
+ class DataManager:
80
+ def __init__(self, file_path, abbreviations_path, word2vec_path):
81
+ self.file_path = file_path
82
+ self.abbreviations_path = abbreviations_path
83
+ self.word2vec_path = word2vec_path
84
+ self.load_abbreviations()
85
+ self.load_word2vec()
86
+
87
+ def load_abbreviations(self):
88
+ with open(self.abbreviations_path, "r", encoding="utf-8") as file:
89
+ self.abbreviations = json.load(file)
90
+
91
+ def load_word2vec(self):
92
+ # Tải vector từ file word2vec, unk_init để từ vựng ngoài tập sẽ random normal
93
+ self.word_embeddings = Vectors(name=self.word2vec_path, unk_init=torch.Tensor.normal_)
94
+ self.vocabulary = self.create_vocab_from_word2vec()
95
+
96
+ def create_vocab_from_word2vec(self):
97
+ vocab = Vocabulary()
98
+ words_list = list(self.word_embeddings.stoi.keys())
99
+ for word in words_list:
100
+ vocab.add(word)
101
+ return vocab
102
+
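+     # Note: torchtext's Vectors caches a preprocessed copy of the embedding
+     # file (a .pt file under .vector_cache/) on first load. Also, the
+     # pretrained vectors are only used here to build the vocabulary; the
+     # models below train their nn.Embedding / Keras Embedding layers from
+     # scratch rather than initializing them with these vectors.
+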
+     def preprocess_data(self):
+         df = pd.read_excel(self.file_path)
+         if "Sentence" not in df.columns:
+             raise ValueError("Column 'Sentence' does not exist in the dataset!")
+
+         # Preprocess each sentence
+         df["processed_sentence"] = df["Sentence"].apply(
+             lambda x: preprocess_sentence(str(x), self.abbreviations, emoji_mapping)
+         )
+
+         # Drop rows that are empty after preprocessing
+         df = df[df["processed_sentence"].str.strip().astype(bool)]
+         return df
+
+     def split_and_convert(
+         self, df, label_column="Emotion", maxlen=400, test_size=0.2,
+         for_keras=False, batch_size=32
+     ):
+         """
+         Split the data into train/test sets. Returns:
+         - If for_keras=False: train_loader, test_loader, label_mapping (PyTorch)
+         - If for_keras=True: X_train, X_test, y_train_onehot, y_test_onehot, label_mapping (Keras)
+         """
+
+         if label_column not in df.columns:
+             raise ValueError(
+                 f"Column '{label_column}' does not exist in the DataFrame. "
+                 f"Available columns: {df.columns.tolist()}"
+             )
+
+         # Build the label -> index mapping
+         label_mapping = {label: idx for idx, label in enumerate(df[label_column].unique())}
+         df[label_column] = df[label_column].map(label_mapping)
+
+         X = df["processed_sentence"].tolist()
+         y = df[label_column].tolist()
+         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
+
+         # Convert the texts to index sequences
+         X_train_tensors = self.vocabulary.corpus_to_tensor(X_train, is_tokenized=False)
+         X_test_tensors = self.vocabulary.corpus_to_tensor(X_test, is_tokenized=False)
+
+         # Pad sequences
+         X_train_padded = pad_sequences(X_train_tensors, maxlen=maxlen)
+         X_test_padded = pad_sequences(X_test_tensors, maxlen=maxlen)
+
+         # Debug info
+         print(">>> Debug Split and Convert:")
+         print("X_train_padded.shape:", X_train_padded.shape)
+         print("X_test_padded.shape: ", X_test_padded.shape)
+         print("y_train length:", len(y_train))
+         print("y_test length: ", len(y_test))
+
+         # Check min/max token ids
+         max_token_train = np.max(X_train_padded) if X_train_padded.size > 0 else None
+         min_token_train = np.min(X_train_padded) if X_train_padded.size > 0 else None
+         max_token_test = np.max(X_test_padded) if X_test_padded.size > 0 else None
+         min_token_test = np.min(X_test_padded) if X_test_padded.size > 0 else None
+
+         vocab_size = len(self.vocabulary)
+         print(f"vocab_size: {vocab_size}")
+         print(f"max_token_train: {max_token_train}, min_token_train: {min_token_train}")
+         print(f"max_token_test: {max_token_test}, min_token_test: {min_token_test}")
+
+         if for_keras:
+             num_classes = len(label_mapping)
+             # One-hot encode the labels
+             y_train_onehot = torch.nn.functional.one_hot(torch.tensor(y_train), num_classes=num_classes).numpy()
+             y_test_onehot = torch.nn.functional.one_hot(torch.tensor(y_test), num_classes=num_classes).numpy()
+
+             # Debug
+             print("y_train_onehot.shape:", y_train_onehot.shape)
+             print("y_test_onehot.shape: ", y_test_onehot.shape)
+
+             return X_train_padded, X_test_padded, y_train_onehot, y_test_onehot, label_mapping
+         else:
+             # Return PyTorch DataLoaders
+             X_train_tensor = torch.tensor(X_train_padded, dtype=torch.long)
+             X_test_tensor = torch.tensor(X_test_padded, dtype=torch.long)
+             y_train_tensor = torch.tensor(y_train, dtype=torch.long)
+             y_test_tensor = torch.tensor(y_test, dtype=torch.long)
+
+             train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
+             test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
+
+             train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
+             test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
+             return train_loader, test_loader, label_mapping
+
+
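+ # label_mapping depends on the row order of the DataFrame (pandas unique()
+ # preserves first-appearance order), so it should be persisted next to any
+ # trained model for consistent inference; a minimal sketch:
+ #   with open("label_mapping.json", "w", encoding="utf-8") as f:
+ #       json.dump(label_mapping, f, ensure_ascii=False)
+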
+ # ========== VOCABULARY CLASS ==========
+
+ class Vocabulary:
+     def __init__(self):
+         self.word2id = {}
+         self.word2id['<pad>'] = 0
+         self.word2id['<unk>'] = 1
+         self.unk_id = self.word2id['<unk>']
+         self.id2word = {0: '<pad>', 1: '<unk>'}
+
+     def __getitem__(self, word):
+         return self.word2id.get(word, self.unk_id)
+
+     def __contains__(self, word):
+         return word in self.word2id
+
+     def __len__(self):
+         return len(self.word2id)
+
+     def lookup_tokens(self, word_indexes: list):
+         return [self.id2word[word_index] for word_index in word_indexes]
+
+     def add(self, word):
+         if word not in self:
+             word_index = len(self.word2id)
+             self.word2id[word] = word_index
+             self.id2word[word_index] = word
+             return word_index
+         else:
+             return self[word]
+
+     @staticmethod
+     def tokenize_corpus(corpus):
+         tokenized_corpus = []
+         for document in tqdm(corpus):
+             tokenized_document = [word.replace(" ", "_") for word in word_tokenize(document)]
+             tokenized_corpus.append(tokenized_document)
+         return tokenized_corpus
+
+     def corpus_to_tensor(self, corpus, is_tokenized=False):
+         tokenized_corpus = self.tokenize_corpus(corpus) if not is_tokenized else corpus
+         return [
+             [self[word] for word in document]
+             for document in tokenized_corpus
+         ]
+
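+ # Usage sketch for Vocabulary (indices 0 and 1 are reserved):
+ #   vocab = Vocabulary()
+ #   vocab.add("xin_chào")    # -> 2
+ #   vocab["xin_chào"]        # -> 2
+ #   vocab["unseen_word"]     # -> 1 (the <unk> id)
+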
+ # ========== EMOJI => LABEL MAPPING ==========
+
+ emoji_mapping = {
+     "😀": "[joy]", "😃": "[joy]", "😄": "[joy]", "😁": "[joy]", "😆": "[joy]", "😅": "[joy]", "😂": "[joy]", "🤣": "[joy]",
+     "🙂": "[love]", "🙃": "[love]", "😉": "[love]", "😊": "[love]", "😇": "[love]", "🥰": "[love]", "😍": "[love]",
+     "🤩": "[love]", "😘": "[love]", "😗": "[love]", "☺": "[love]", "😚": "[love]", "😙": "[love]",
+     "😋": "[satisfaction]", "😛": "[satisfaction]", "😜": "[satisfaction]", "🤪": "[satisfaction]", "😝": "[satisfaction]",
+     "🤑": "[satisfaction]",
+     "🤐": "[neutral]", "🤨": "[neutral]", "😐": "[neutral]", "😑": "[neutral]", "😶": "[neutral]",
+     "😏": "[sarcasm]",
+     "😒": "[disappointment]", "🙄": "[disappointment]", "😬": "[disappointment]",
+     "😔": "[sadness]", "😪": "[sadness]", "😢": "[sadness]", "😭": "[sadness]", "😥": "[sadness]", "😓": "[sadness]",
+     "😩": "[tiredness]", "😫": "[tiredness]", "🥱": "[tiredness]",
+     "🤤": "[discomfort]", "🤢": "[discomfort]", "🤮": "[discomfort]", "🤧": "[discomfort]", "🥵": "[discomfort]",
+     "🥶": "[discomfort]", "🥴": "[discomfort]", "😵": "[discomfort]", "🤯": "[discomfort]",
+     "😕": "[confused]", "😟": "[confused]", "🙁": "[confused]", "☹": "[confused]",
+     "😮": "[surprise]", "😯": "[surprise]", "😲": "[surprise]", "😳": "[surprise]", "🥺": "[pleading]",
+     "😦": "[fear]", "😧": "[fear]", "😨": "[fear]", "😰": "[fear]", "😱": "[fear]",
+     "😖": "[confusion]", "😣": "[confusion]", "😞": "[confusion]",
+     "😤": "[anger]", "😡": "[anger]", "😠": "[anger]", "🤬": "[anger]", "😈": "[mischievous]", "👿": "[mischievous]"
+ }
+
+
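+ # replace_emojis maps each character found in this dict to its bracketed
+ # tag; any other emoji character is dropped (via the emoji.is_emoji check).
+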
+ # ========== PYTORCH RNN MODEL DEFINITION ==========
+
+ class SimpleRNN(nn.Module):
+     def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
+         super(SimpleRNN, self).__init__()
+         self.embedding = nn.Embedding(vocab_size, embedding_dim)
+         self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
+         self.fc = nn.Linear(hidden_dim, output_dim)
+
+     def forward(self, x):
+         embedded = self.embedding(x)
+         _, (hidden, _) = self.rnn(embedded)
+         return self.fc(hidden.squeeze(0))
+
+
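+ # Note: despite its name, SimpleRNN wraps a single-layer nn.LSTM. The final
+ # hidden state has shape (1, batch, hidden_dim), so squeeze(0) yields
+ # (batch, hidden_dim) before the linear classifier; this squeeze only works
+ # for a single-layer, unidirectional LSTM.
+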
+ # ========== PREDICTION FUNCTION FOR THE PYTORCH RNN MODEL ==========
+
+ def predict_emotion_rnn(model, text, data_manager, label_mapping, device):
+     model.eval()
+     with torch.no_grad():
+         processed_text = preprocess_sentence(text, data_manager.abbreviations, emoji_mapping)
+         tokenized_text = data_manager.vocabulary.tokenize_corpus([processed_text])
+         text_tensor = torch.tensor(
+             pad_sequences(data_manager.vocabulary.corpus_to_tensor(tokenized_text, is_tokenized=True), maxlen=400),
+             dtype=torch.long
+         ).to(device)
+
+         output = model(text_tensor)
+         _, predicted = torch.max(output, 1)
+         reverse_label_mapping = {v: k for k, v in label_mapping.items()}
+         return reverse_label_mapping[predicted.item()]
+
+
+ # ========== PREDICTION FUNCTION FOR THE KERAS CNN-LSTM MODEL ==========
+
+ def predict_emotion_cnn_lstm(model, text, data_manager, label_mapping):
+     processed_text = preprocess_sentence(text, data_manager.abbreviations, emoji_mapping)
+     tokenized_text = data_manager.vocabulary.tokenize_corpus([processed_text])
+     text_tensor = pad_sequences(data_manager.vocabulary.corpus_to_tensor(tokenized_text, is_tokenized=True), maxlen=400)
+     output = model.predict(text_tensor)
+     predicted = output.argmax(axis=1)[0]
+     reverse_label_mapping = {v: k for k, v in label_mapping.items()}
+     return reverse_label_mapping[predicted]
+
+
+ # ========== MAIN (DEMO RUN) ==========
+
+ if __name__ == "__main__":
+     # --------------------------
+     # Set your paths here:
+     # --------------------------
+     file_path = "train.xlsx"  # source Excel file (with "Sentence", "Emotion", ... columns)
+     abbreviations_path = "abbreviations.json"
+     word2vec_path = "/home/datpham/datpham/thesis-ngtram/word2vec_vi_syllables_100dims.txt"
+     output_path = "processed.xlsx"
+
+     data_manager = DataManager(
+         file_path=file_path,
+         abbreviations_path=abbreviations_path,
+         word2vec_path=word2vec_path
+     )
+
+     # 1) Load and preprocess
+     df = data_manager.preprocess_data()
+     print("Before undersampling:")
+     print(df["Emotion"].value_counts())
+
+     # 2) UNDERSAMPLING (example)
+     # Adjust the specific emotion names to match your dataset
+     df_enjoyment = df[df["Emotion"] == "Enjoyment"]
+     df_other = df[df["Emotion"] == "Other"]
+     df_anger = df[df["Emotion"] == "Anger"]
+     df_sadness = df[df["Emotion"] == "Sadness"]
+     df_disgust = df[df["Emotion"] == "Disgust"]
+     df_fear = df[df["Emotion"] == "Fear"]
+     df_surprise = df[df["Emotion"] == "Surprise"]
+
+     # Example: cap 'Enjoyment' at 2000 samples
+     if len(df_enjoyment) > 2000:
+         df_enjoyment_undersampled = df_enjoyment.sample(n=2000, random_state=42)
+     else:
+         df_enjoyment_undersampled = df_enjoyment
+
+     df_balanced = pd.concat([
+         df_enjoyment_undersampled,
+         df_other,
+         df_anger,
+         df_sadness,
+         df_disgust,
+         df_fear,
+         df_surprise
+     ], axis=0)
+
+     df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
+     df = df_balanced
+
+     print("\nAfter undersampling:")
+     print(df["Emotion"].value_counts())
+
+     df.to_excel(output_path, index=False)
+
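+     # Only 'Enjoyment' is capped above; the other six classes keep all rows.
+     # A more generic per-class cap, as a sketch:
+     #   df = df.groupby("Emotion", group_keys=False).apply(
+     #       lambda g: g.sample(n=min(len(g), 2000), random_state=42))
+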
+     # 3) Build the PyTorch data loaders
+     train_loader, test_loader, label_mapping = data_manager.split_and_convert(
+         df, label_column="Emotion", for_keras=False
+     )
+
+     vocab_size = len(data_manager.vocabulary)
+     embedding_dim = 100
+     hidden_dim = 128
+     output_dim = len(label_mapping)
+
+     model_rnn = SimpleRNN(vocab_size, embedding_dim, hidden_dim, output_dim)
+     criterion = nn.CrossEntropyLoss()
+     optimizer = optim.Adam(model_rnn.parameters())
+
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     model_rnn.to(device)
+
+     num_epochs = 20
+     for epoch in range(num_epochs):
+         model_rnn.train()
+         epoch_loss = 0
+         correct = 0
+         total = 0
+         for X_batch, y_batch in train_loader:
+             X_batch, y_batch = X_batch.to(device), y_batch.to(device)
+
+             optimizer.zero_grad()
+             predictions = model_rnn(X_batch)
+             loss = criterion(predictions, y_batch)
+             loss.backward()
+             optimizer.step()
+
+             epoch_loss += loss.item()
+             _, predicted = torch.max(predictions, 1)
+             correct += (predicted == y_batch).sum().item()
+             total += y_batch.size(0)
+
+         print(f"Epoch {epoch+1}/{num_epochs}, "
+               f"Loss: {epoch_loss/len(train_loader):.4f}, "
+               f"Accuracy: {correct/total:.4f}")
+
+     # Evaluate the RNN on the test set
+     model_rnn.eval()
+     test_loss = 0
+     correct = 0
+     total = 0
+     with torch.no_grad():
+         for X_batch, y_batch in test_loader:
+             X_batch, y_batch = X_batch.to(device), y_batch.to(device)
+             predictions = model_rnn(X_batch)
+             loss = criterion(predictions, y_batch)
+             test_loss += loss.item()
+
+             _, predicted = torch.max(predictions, 1)
+             correct += (predicted == y_batch).sum().item()
+             total += y_batch.size(0)
+
+     print(f"Test Loss: {test_loss/len(test_loader):.4f}, "
+           f"Test Accuracy: {correct/total:.4f}")
+
+
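+     # The RNN weights are not persisted above; to keep them, a sketch:
+     #   torch.save(model_rnn.state_dict(), "rnn_emotion.pt")
+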
+     # ========== CNN-LSTM (Keras) ==========
+
+     from keras.models import Model
+     from keras.layers import Input, Embedding, Convolution1D, LSTM, Dense, Dropout, Lambda, concatenate
+     from keras.optimizers import Adam
+     from keras.callbacks import ModelCheckpoint
+
+     print("Training CNN-LSTM...")
+
+     X_train, X_test, y_train, y_test, label_mapping = data_manager.split_and_convert(
+         df, label_column="Emotion", for_keras=True
+     )
+
+     maxlen = 400
+
+     input_layer = Input(shape=(maxlen,), dtype='int32', name='main_input')
+     emb_layer = Embedding(len(data_manager.vocabulary), embedding_dim)(input_layer)
+
+     def max_1d(X):
+         return tf.reduce_max(X, axis=1)
+
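+     # max_1d is global max pooling over the time axis; Keras'
+     # GlobalMaxPooling1D() would be an equivalent drop-in layer.
+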
+     con3_layer = Convolution1D(150, kernel_size=3, activation='relu')(emb_layer)
+     pool_con3_layer = Lambda(max_1d, output_shape=(150,))(con3_layer)
+
+     con5_layer = Convolution1D(150, kernel_size=5, activation='relu')(emb_layer)
+     pool_con5_layer = Lambda(max_1d, output_shape=(150,))(con5_layer)
+
+     lstm_layer = LSTM(128)(emb_layer)
+
+     cnn_lstm_layer = concatenate([pool_con3_layer, pool_con5_layer, lstm_layer])
+
+     dense_layer = Dense(100, activation='relu')(cnn_lstm_layer)
+     dropout_layer = Dropout(0.2)(dense_layer)
+     output_layer = Dense(len(label_mapping), activation='softmax')(dropout_layer)
+
+     model_cnn_lstm = Model(inputs=input_layer, outputs=output_layer)
+     model_cnn_lstm.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
+
+     checkpoint = ModelCheckpoint('cnn_lstm_best.keras', save_best_only=True, monitor='val_accuracy', mode='max')
+     model_cnn_lstm.fit(
+         X_train, y_train,
+         validation_data=(X_test, y_test),
+         batch_size=32,
+         epochs=20,
+         callbacks=[checkpoint]
+     )
+
+     model_cnn_lstm.save('cnn_lstm_model.keras')
+
+     loss, accuracy = model_cnn_lstm.evaluate(X_test, y_test)
+     print(f"CNN-LSTM Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")
+
+     # Demo: predict a new sentence
+     custom_text = "Tôi rất vui khi sử dụng dịch vụ này!"  # "I am very happy using this service!"
+
+     # RNN (PyTorch)
+     emotion_rnn = predict_emotion_rnn(model_rnn, custom_text, data_manager, label_mapping, device)
+     print(f"Predicted Emotion (RNN): {emotion_rnn}")
+
+     # CNN-LSTM (Keras)
+     cnn_lstm_model = tf.keras.models.load_model('cnn_lstm_model.keras')
+     emotion_cnn_lstm = predict_emotion_cnn_lstm(cnn_lstm_model, custom_text, data_manager, label_mapping)
+     print(f"Predicted Emotion (CNN-LSTM): {emotion_cnn_lstm}")
+
+     # Check the TF version and available GPUs
+     print("TF version:", tf.__version__)
+     print("GPU devices:", tf.config.list_physical_devices("GPU"))
+     # You can optionally check CUDA/GPU via a system command:
+     # import os
+     # os.system("nvidia-smi")
phobert_emotion_model/classification_report.txt ADDED
@@ -0,0 +1,23 @@
+ ========== Classification Report ==========
+               precision    recall  f1-score   support
+
+        Anger     0.9768    0.9788    0.9778       991
+      Disgust     0.9457    0.9657    0.9556       991
+    Enjoyment     0.9166    0.8204    0.8658       991
+         Fear     0.9771    0.9879    0.9825       992
+        Other     0.9026    0.9253    0.9138       991
+      Sadness     0.9302    0.9677    0.9486       991
+     Surprise     0.9448    0.9496    0.9472       992
+
+     accuracy                         0.9422      6939
+    macro avg     0.9420    0.9422    0.9416      6939
+ weighted avg     0.9420    0.9422    0.9416      6939
+
+ ========== Confusion Matrix ==========
+ [[970   9   3   4   2   2   1]
+  [ 12 957   2   3   7   5   5]
+  [  5  16 813   9  67  42  39]
+  [  2   2   6 980   1   1   0]
+  [  3  13  33   2 917  13  10]
+  [  1   7  17   3   4 959   0]
+  [  0   8  13   2  18   9 942]]
phobert_emotion_model/confusion_matrix.png ADDED
phobert_emotion_model/id2label.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "0": "Anger",
+   "1": "Disgust",
+   "2": "Enjoyment",
+   "3": "Fear",
+   "4": "Other",
+   "5": "Sadness",
+   "6": "Surprise"
+ }
phobert_emotion_model/phobert_emotion_model/added_tokens.json ADDED
@@ -0,0 +1,3 @@
+ {
+   "<mask>": 64000
+ }
phobert_emotion_model/phobert_emotion_model/bpe.codes ADDED
The diff for this file is too large to render. See raw diff
 
phobert_emotion_model/phobert_emotion_model/config.json ADDED
@@ -0,0 +1,48 @@
+ {
+   "_name_or_path": "vinai/phobert-base",
+   "architectures": [
+     "RobertaForSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "classifier_dropout": null,
+   "eos_token_id": 2,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "id2label": {
+     "0": "LABEL_0",
+     "1": "LABEL_1",
+     "2": "LABEL_2",
+     "3": "LABEL_3",
+     "4": "LABEL_4",
+     "5": "LABEL_5",
+     "6": "LABEL_6"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "label2id": {
+     "LABEL_0": 0,
+     "LABEL_1": 1,
+     "LABEL_2": 2,
+     "LABEL_3": 3,
+     "LABEL_4": 4,
+     "LABEL_5": 5,
+     "LABEL_6": 6
+   },
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 258,
+   "model_type": "roberta",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "problem_type": "single_label_classification",
+   "tokenizer_class": "PhobertTokenizer",
+   "torch_dtype": "float32",
+   "transformers_version": "4.40.0",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 64001
+ }
phobert_emotion_model/phobert_emotion_model/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:23cc285ab489e07145436eebb67247d71cd67c817155cc65eb5a7e52e78ed4f0
+ size 540038764
phobert_emotion_model/phobert_emotion_model/special_tokens_map.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "bos_token": "<s>",
+   "cls_token": "<s>",
+   "eos_token": "</s>",
+   "mask_token": "<mask>",
+   "pad_token": "<pad>",
+   "sep_token": "</s>",
+   "unk_token": "<unk>"
+ }
phobert_emotion_model/phobert_emotion_model/tokenizer_config.json ADDED
@@ -0,0 +1,54 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "64000": {
+       "content": "<mask>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "<s>",
+   "eos_token": "</s>",
+   "mask_token": "<mask>",
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "<pad>",
+   "sep_token": "</s>",
+   "tokenizer_class": "PhobertTokenizer",
+   "unk_token": "<unk>"
+ }
phobert_emotion_model/phobert_emotion_model/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
phobert_results/checkpoint-10410/added_tokens.json ADDED
@@ -0,0 +1,3 @@
+ {
+   "<mask>": 64000
+ }
phobert_results/checkpoint-10410/bpe.codes ADDED
The diff for this file is too large to render. See raw diff