Upload folder using huggingface_hub
This view is limited to 50 files because it contains too many changes; see the raw diff for the full change set.
- .gitattributes +10 -0
- .vector_cache/word2vec_vi_syllables_100dims.txt.pt +3 -0
- abbreviations.json +363 -0
- bilstm_best.keras +3 -0
- bilstm_emotion_model/bilstm_model.keras +3 -0
- bilstm_emotion_model/classification_report.txt +33 -0
- bilstm_emotion_model/label_mapping.json +9 -0
- bilstm_emotion_model/vocabulary.json +0 -0
- cnn_lstm_best.keras +3 -0
- cnn_lstm_emotion_model/classification_report.txt +33 -0
- cnn_lstm_emotion_model/cnn_lstm_model.keras +3 -0
- cnn_lstm_model.keras +3 -0
- flagged/log.csv +2 -0
- logs/events.out.tfevents.1736834439.ai1gpu-virtual-machine.52042.0 +3 -0
- logs/events.out.tfevents.1736835355.ai1gpu-virtual-machine.52042.1 +3 -0
- logs/events.out.tfevents.1736835689.ai1gpu-virtual-machine.52955.0 +3 -0
- logs/events.out.tfevents.1736835769.ai1gpu-virtual-machine.53242.0 +3 -0
- logs/events.out.tfevents.1736835850.ai1gpu-virtual-machine.53528.0 +3 -0
- logs/events.out.tfevents.1736835995.ai1gpu-virtual-machine.53982.0 +3 -0
- logs/events.out.tfevents.1736836066.ai1gpu-virtual-machine.54029.0 +3 -0
- logs/events.out.tfevents.1736836768.ai1gpu-virtual-machine.55099.0 +3 -0
- logs/events.out.tfevents.1736841979.ai1gpu-virtual-machine.55099.1 +3 -0
- logs/events.out.tfevents.1736844609.ai1gpu-virtual-machine.66743.0 +3 -0
- logs/events.out.tfevents.1736852947.ai1gpu-virtual-machine.76812.0 +3 -0
- logs/events.out.tfevents.1736858105.ai1gpu-virtual-machine.76812.1 +3 -0
- logs/events.out.tfevents.1736858545.ai1gpu-virtual-machine.87908.0 +3 -0
- logs/events.out.tfevents.1736858698.ai1gpu-virtual-machine.88011.0 +3 -0
- logs/events.out.tfevents.1736864229.ai1gpu-virtual-machine.88011.1 +3 -0
- logs/events.out.tfevents.1736907563.ai1gpu-virtual-machine.145430.0 +3 -0
- logs/events.out.tfevents.1736908155.ai1gpu-virtual-machine.146675.0 +3 -0
- logs/events.out.tfevents.1736911863.ai1gpu-virtual-machine.152249.0 +3 -0
- logs/events.out.tfevents.1736916063.ai1gpu-virtual-machine.152249.1 +3 -0
- main_BILSTM.py +573 -0
- main_RNN_CNN-LSTM.py +738 -0
- main_lstm.py +289 -0
- main_phobert.py +349 -0
- main_svm.py +261 -0
- main_v1.py +494 -0
- phobert_emotion_model/classification_report.txt +23 -0
- phobert_emotion_model/confusion_matrix.png +0 -0
- phobert_emotion_model/id2label.json +9 -0
- phobert_emotion_model/phobert_emotion_model/added_tokens.json +3 -0
- phobert_emotion_model/phobert_emotion_model/bpe.codes +0 -0
- phobert_emotion_model/phobert_emotion_model/config.json +48 -0
- phobert_emotion_model/phobert_emotion_model/model.safetensors +3 -0
- phobert_emotion_model/phobert_emotion_model/special_tokens_map.json +9 -0
- phobert_emotion_model/phobert_emotion_model/tokenizer_config.json +54 -0
- phobert_emotion_model/phobert_emotion_model/vocab.txt +0 -0
- phobert_results/checkpoint-10410/added_tokens.json +3 -0
- phobert_results/checkpoint-10410/bpe.codes +0 -0
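The commit title above matches the standard huggingface_hub folder upload. As a minimal illustrative sketch (the local path and repo id below are placeholders, not taken from this diff), a commit like this is typically produced with:

from huggingface_hub import HfApi

api = HfApi()
# Push every file in the local project folder as a single commit.
# Large binaries (*.keras, *.xlsx, tfevents logs, word2vec) are stored via
# Git LFS according to the .gitattributes rules shown below.
api.upload_folder(
    folder_path="./emotion-classification",    # hypothetical local path
    repo_id="user/vietnamese-emotion-models",  # hypothetical repo id
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)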
.gitattributes
CHANGED
@@ -33,3 +33,13 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+bilstm_best.keras filter=lfs diff=lfs merge=lfs -text
+bilstm_emotion_model/bilstm_model.keras filter=lfs diff=lfs merge=lfs -text
+cnn_lstm_best.keras filter=lfs diff=lfs merge=lfs -text
+cnn_lstm_emotion_model/cnn_lstm_model.keras filter=lfs diff=lfs merge=lfs -text
+cnn_lstm_model.keras filter=lfs diff=lfs merge=lfs -text
+processed.xlsx filter=lfs diff=lfs merge=lfs -text
+processed_phobert.xlsx filter=lfs diff=lfs merge=lfs -text
+processed_svm.xlsx filter=lfs diff=lfs merge=lfs -text
+train.xlsx filter=lfs diff=lfs merge=lfs -text
+word2vec_vi_syllables_100dims.txt filter=lfs diff=lfs merge=lfs -text
.vector_cache/word2vec_vi_syllables_100dims.txt.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3390520329ebe14cddb38384d80bd8b6e4948e023977ba5dbe32235b4a3503e7
size 418631353
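The three lines above are a Git LFS pointer stub rather than the tensor cache itself: the repository only tracks the object id and size, and the roughly 419 MB payload lives in LFS storage. A minimal sketch of fetching the real file with huggingface_hub (the repo id is a placeholder):

from huggingface_hub import hf_hub_download

# Resolves the LFS pointer and downloads the actual object to the local cache.
local_path = hf_hub_download(
    repo_id="user/vietnamese-emotion-models",  # hypothetical repo id
    filename=".vector_cache/word2vec_vi_syllables_100dims.txt.pt",
)
print(local_path)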
abbreviations.json
ADDED
@@ -0,0 +1,363 @@
{
  "ad": ["admin", "quản trị viên"],
  "bb": ["bye bye", "tạm biệt"],
  "bl": ["bình luận"],
  "bth": ["bình thường"],
  "bmn": ["bạn muốn"],
  "cxk": ["cũng không"],
  "đm": ["đ** m**"],
  "gg": ["good game", "Google"],
  "hc": ["học"],
  "kq": ["kết quả"],
  "kb": ["kết bạn"],
  "khá": ["khá là"],
  "lq": ["liên quan"],
  "lmh": ["làm gì thế"],
  "ng": ["người"],
  "nsao": ["nói sao"],
  "nv": ["nhân vật"],
  "nvay": ["như vậy"],
  "nxk": ["nói không"],
  "ob": ["ông bà"],
  "pc": ["phải không"],
  "ph": ["phim"],
  "ql": ["quản lý"],
  "qt": ["quá trời"],
  "sdt": ["số điện thoại"],
  "sk": ["sức khỏe"],
  "tc": ["tài chính"],
  "td": ["tâm điểm", "tập đoàn"],
  "th": ["thôi"],
  "tl": ["trả lời"],
  "ty": ["tình yêu"],
  "up": ["cập nhật", "update"],
  "xđ": ["xác định"],
  "zui": ["vui"],
  "zời": ["trời"],
  "hdsd": ["hướng dẫn sử dụng"],
  "bbq": ["barbecue", "tiệc nướng"],
  "cx": ["chắc chắn", "cũng"],
  "vkc": ["vãi kinh"],
  "kt": ["kiểm tra", "không thèm"],
  "tks": ["thanks", "cảm ơn"],
  "đg": ["đang"],
  "qa": ["quá"],
  "ht": ["học tập", "hoàn tất"],
  "clgt": ["cái l** gì thế"],
  "pls": ["please", "làm ơn"],
  "qtqđ": ["quá trời quá đất"],
  "klq": ["không liên quan"],
  "mn": ["mọi người"],
  "vc": ["vãi chưởng", "vợ chồng"],
  "vch": ["vãi chưởng"],
  "cđ": ["cuộc đời"],
  "đhs": ["đ** hiểu sao"],
  "ib": ["inbox", "nhắn tin"],
  "ttyl": ["talk to you later", "nói chuyện sau"],
  "stt": ["status", "trạng thái"],
  "sr": ["sorry", "xin lỗi"],
  "bn": ["bao nhiêu", "bạn"],
  "ckmnl": ["chào cả nhà mình nha l"],
  "cr": ["crush"],
  "mng": ["mọi người"],
  "vl": ["vãi l", "rất"],
  "khbn": ["không biết nữa"],
  "qtq": ["quá trời quá"],
  "sml": ["sấp mặt luôn"],
  "ns": ["nói"],
  "ăn h": ["ăn hành"],
  "qh": ["quan hệ"],
  "ăn b": ["ăn bánh"],
  "hph": ["hạnh phúc"],
  "ngta": ["người ta"],
  "mnk": ["mọi người không"],
  "ahihi": ["cười đùa"],
  "chz": ["chuyện"],
  "vđ": ["vấn đề"],
  "pp": ["bye bye", "tạm biệt"],
  "dc": ["được"],
  "nt": ["nhắn tin"],
  "thik": ["thích"],
  "bt": ["biết", "bình thường"],
  "kp": ["không phải"],
  "mik": ["mình"],
  "lm": ["làm"],
  "nx": ["nữa"],
  "mk": ["mình", "mày"],
  "cmt": ["comment", "bình luận"],
  "rep": ["trả lời", "phản hồi"],
  "fa": ["độc thân", "forever alone"],
  "chx": ["chưa"],
  "qlq": ["quản lý quán"],
  "a": ["anh"],
  "e": ["em"],
  "ko": ["không"],
  "kh": ["không"],
  "z": ["vậy"],
  "ny": ["người yêu"],
  "l": ["là"],
  "sn": ["sinh nhật"],
  "ckk": ["chúc ngủ ngon"],
  "hpbd": ["happy birthday"],
  "tt": ["thông tin", "tương tác"],
  "ms": ["mới"],
  "k": ["không"],
  "vk": ["vợ"],
  "ck": ["chồng"],
  "j": ["gì"],
  "m": ["mày"],
  "t": ["tao"],
  "sgk": ["sách giáo khoa"],
  "cv": ["công việc"],
  "pv": ["phục vụ"],
  "dth": ["dễ thương"],
  "gato": ["ghen ăn tức ở"]
}
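This dictionary maps Vietnamese chat abbreviations to one or more expansions; the training scripts below expand a matched token by joining all of its expansions with spaces (see replace_abbreviations in main_BILSTM.py). A minimal usage sketch based on that function:

import json

with open("abbreviations.json", "r", encoding="utf-8") as f:
    abbreviations = json.load(f)

def expand_abbreviations(sentence, abbreviations):
    # Each token that appears as a key is replaced by its expansions joined
    # with spaces, mirroring replace_abbreviations() in main_BILSTM.py.
    words = sentence.split()
    return " ".join(
        " ".join(abbreviations[w]) if w in abbreviations else w for w in words
    )

print(expand_abbreviations("mik ko bt", abbreviations))  # -> "mình không biết bình thường"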
bilstm_best.keras
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:085cb3b7394a3db69287c6ede56834dfc9d6e56e2f169c5a05e49ffb5267fb6a
size 13203552
bilstm_emotion_model/bilstm_model.keras
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:40715c89bc3bc193a953c792527898450dd10979bd0bcd62ed32b8df471fa2bb
size 13203552
bilstm_emotion_model/classification_report.txt
ADDED
@@ -0,0 +1,33 @@
========== BiLSTM Classification Report ==========
              precision    recall  f1-score   support

   Enjoyment     0.6490    0.7296    0.6869       991
        Fear     0.5580    0.4709    0.5108       327
     Sadness     0.4580    0.4747    0.4662       356
       Anger     0.6587    0.6748    0.6667       369
       Other     0.6601    0.6733    0.6667       600
     Disgust     0.4967    0.4488    0.4715       332
    Surprise     0.4683    0.3620    0.4083       326

    accuracy                         0.5956      3301
   macro avg     0.5641    0.5477    0.5539      3301
weighted avg     0.5893    0.5956    0.5905      3301

========== Additional Metrics ==========
Test Loss: 2.0363
Test Accuracy: 0.5956
Precision (Macro): 0.5641
Precision (Weighted): 0.5893
Recall (Macro): 0.5477
Recall (Weighted): 0.5956
F1-Score (Macro): 0.5539
F1-Score (Weighted): 0.5905

========== Confusion Matrix ==========
[[723  23  83   3  81  29  49]
 [ 38 154  26  72  10  14  13]
 [108  14 169   2  30  23  10]
 [ 13  42  12 249  14  29  10]
 [110   9  30   9 404  18  20]
 [ 32  25  26  30  38 149  32]
 [ 90   9  23  13  35  38 118]]
bilstm_emotion_model/label_mapping.json
ADDED
@@ -0,0 +1,9 @@
{
    "Enjoyment": 0,
    "Fear": 1,
    "Sadness": 2,
    "Anger": 3,
    "Other": 4,
    "Disgust": 5,
    "Surprise": 6
}
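label_mapping.json records the emotion-to-index mapping used when the BiLSTM was trained; at inference time the scripts invert it to turn an argmax index back into an emotion name (see predict_emotion_bilstm in main_BILSTM.py). A small sketch of that inversion:

import json

with open("bilstm_emotion_model/label_mapping.json", "r", encoding="utf-8") as f:
    label_mapping = json.load(f)   # {"Enjoyment": 0, ..., "Surprise": 6}

# Reverse the mapping so a predicted class index can be decoded to its label.
id2label = {v: k for k, v in label_mapping.items()}
print(id2label[6])                 # -> "Surprise"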
bilstm_emotion_model/vocabulary.json
ADDED
The diff for this file is too large to render.
See raw diff
cnn_lstm_best.keras
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e98590341cdfcc831873ee3fddc3c17f16a350085df1e302e2e22a4eda0c03ad
size 13535600
cnn_lstm_emotion_model/classification_report.txt
ADDED
@@ -0,0 +1,33 @@
========== CNN-LSTM Classification Report ==========
              precision    recall  f1-score   support

   Enjoyment     0.6977    0.7265    0.7118       991
        Fear     0.5526    0.6269    0.5874       327
     Sadness     0.4955    0.4663    0.4805       356
       Anger     0.7022    0.6070    0.6512       369
       Other     0.6740    0.7650    0.7166       600
     Disgust     0.5194    0.4849    0.5016       332
    Surprise     0.5020    0.3896    0.4387       326

    accuracy                         0.6247      3301
   macro avg     0.5919    0.5809    0.5840      3301
weighted avg     0.6204    0.6247    0.6205      3301

========== Additional Metrics ==========
Test Loss: 1.6124
Test Accuracy: 0.6247
Precision (Macro): 0.5919
Precision (Weighted): 0.6204
Recall (Macro): 0.5809
Recall (Weighted): 0.6247
F1-Score (Macro): 0.5840
F1-Score (Weighted): 0.6205

========== Confusion Matrix ==========
[[720  28  69  11  93  37  33]
 [ 34 205  13  39  10  14  12]
 [ 92  22 166   7  31  19  19]
 [ 13  62  13 224  17  34   6]
 [ 56  15  29   6 459  10  25]
 [ 34  21  22  27  36 161  31]
 [ 83  18  23   5  35  35 127]]
cnn_lstm_emotion_model/cnn_lstm_model.keras
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c45256b322b2360c9ba9e0c5da5fd42705f7d4395f6c1d4c6a94035e43bf05d0
size 13535600
cnn_lstm_model.keras
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:78c966f03f234f409270b699f84a635d98128de271d8492ee25776026312cd24
size 13535600
flagged/log.csv
ADDED
@@ -0,0 +1,2 @@
Nhập câu cần phân loại cảm xúc,Kết quả dự đoán,flag,username,timestamp
"Hôm nay là ngày đẹp trời, tôi muốn có người yêu 😊",Disgust,,,2025-01-14 13:57:25.419643
logs/events.out.tfevents.1736834439.ai1gpu-virtual-machine.52042.0
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:aeb26f251abccb92c7342c443b6b7c7faa2b0d0c41976053706f1c002754680a
size 23650
logs/events.out.tfevents.1736835355.ai1gpu-virtual-machine.52042.1
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:72bc950b1e422eb9db07cba8ad85db543521c38025579fcc2cce1dd799313233
size 411
logs/events.out.tfevents.1736835689.ai1gpu-virtual-machine.52955.0
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:744768bef1c4f7e54446c6a7925c8b770d2d5af70f6f76016fab9805a3802b6f
size 346
logs/events.out.tfevents.1736835769.ai1gpu-virtual-machine.53242.0
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0843cbd924008b8a37ef65480d32b8e16241e9e059a3784b0b8ce6d097a0d0c5
size 346
logs/events.out.tfevents.1736835850.ai1gpu-virtual-machine.53528.0
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9c3fc1113ddc32236fc69e785dfa73481178e728dd02e131bad5add13004729f
size 346
logs/events.out.tfevents.1736835995.ai1gpu-virtual-machine.53982.0
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d3de874ab406b8d42e3f02443b3ae8fce7228cffb61c6845aab400981d1263b0
size 5228
logs/events.out.tfevents.1736836066.ai1gpu-virtual-machine.54029.0
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2f927f800053a89cf20a14bf5a48c6343b31d9a49d5e670a4fc48ad7fb676874
size 8712
logs/events.out.tfevents.1736836768.ai1gpu-virtual-machine.55099.0
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f2816f60b911788c30bc43168dbbe689eee10a119e1e450767e54f521cb5f03c
size 81906
logs/events.out.tfevents.1736841979.ai1gpu-virtual-machine.55099.1
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:426ac92bb076d56fd8130e04ac0064542681f9ddd70fbeb64779f10b8521bb1d
size 417
logs/events.out.tfevents.1736844609.ai1gpu-virtual-machine.66743.0
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7ff2d9a713d3ea47e04c6361df3c62d551e983cd170de4a163798e58eed51111
size 346
logs/events.out.tfevents.1736852947.ai1gpu-virtual-machine.76812.0
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c2cea1a1f21eb664b3b5ae8f09ae76a38a3c7a37560a4432c805772a8afb171b
size 83399
logs/events.out.tfevents.1736858105.ai1gpu-virtual-machine.76812.1
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4e9817a200d06938057f30fdac643b1480e734857bb5337aa4f494b29d199245
size 569
logs/events.out.tfevents.1736858545.ai1gpu-virtual-machine.87908.0
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:48134c412b09adeae17bc7aac0295e48dce80cf72ce2a1f4109c159ee99819b1
size 486
logs/events.out.tfevents.1736858698.ai1gpu-virtual-machine.88011.0
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f0165be0e6c2731ce32b3e3cbe11b5a6997120211c06d0d04c264b5c69c8f9f2
size 83399
logs/events.out.tfevents.1736864229.ai1gpu-virtual-machine.88011.1
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e468b0b65d952e3df6c9eb4f53bb8a8f867532828522b13b8229b53ea2787f9a
size 569
logs/events.out.tfevents.1736907563.ai1gpu-virtual-machine.145430.0
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a67cb94b4913d02142ea7fb0bbad62005700059dc0bc6670464999d33dce0daf
size 7756
logs/events.out.tfevents.1736908155.ai1gpu-virtual-machine.146675.0
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2a882fae8ea63fa2ecf17da9e9c44bcd33568c5a998b11da0ceb6c537857223c
size 7367
logs/events.out.tfevents.1736911863.ai1gpu-virtual-machine.152249.0
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1dcadbf84e08ca0d1c9cf9f877233b857eb144b8aa92bd28291827220a0f7ea6
size 85351
logs/events.out.tfevents.1736916063.ai1gpu-virtual-machine.152249.1
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6eee809e23d4dd927f9c3dffb75d8184a24ae246cd0380fc93894bccc415d632
size 766
main_BILSTM.py
ADDED
@@ -0,0 +1,573 @@
# thesis.py
# -*- coding: utf-8 -*-

import pandas as pd
import emoji
import json
import re
import numpy as np
from underthesea import word_tokenize
from tqdm import tqdm
import torch
from torchtext.vocab import Vectors
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix
)
from tensorflow.keras.preprocessing.sequence import pad_sequences
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
import tensorflow as tf
import os

# ========== PREPROCESSING FUNCTIONS ==========

def preprocess_sentence(sentence, abbreviations, emoji_mapping):
    """
    Preprocess one sentence: lowercase, replace emojis, remove profanity,
    strip special characters, normalize whitespace, etc.
    """
    sentence = sentence.lower()
    sentence = replace_emojis(sentence, emoji_mapping)
    sentence = remove_profanity(sentence)
    sentence = remove_special_characters(sentence)
    sentence = normalize_whitespace(sentence)
    sentence = replace_abbreviations(sentence, abbreviations)
    sentence = remove_repeated_characters(sentence)
    sentence = replace_numbers(sentence)
    sentence = tokenize_sentence(sentence)
    return sentence

def replace_emojis(sentence, emoji_mapping):
    processed_sentence = []
    for char in sentence:
        if char in emoji_mapping:
            processed_sentence.append(emoji_mapping[char])
        elif not emoji.is_emoji(char):
            processed_sentence.append(char)
    return ''.join(processed_sentence)

def remove_profanity(sentence):
    profane_words = ["loz", "vloz", "vl", "dm", "đm", "clgt", "dmm", "cc", "vc", "đù mé", "vãi"]
    words = sentence.split()
    filtered_words = [word for word in words if word.lower() not in profane_words]
    return ' '.join(filtered_words)

def remove_special_characters(sentence):
    return re.sub(r"[\^\*@#&$%<>~{}|\\]", "", sentence)

def normalize_whitespace(sentence):
    return ' '.join(sentence.split())

def replace_abbreviations(sentence, abbreviations):
    words = sentence.split()
    replaced_words = [
        " ".join(abbreviations[word]) if word in abbreviations else word
        for word in words
    ]
    return ' '.join(replaced_words)

def remove_repeated_characters(sentence):
    # e.g. "đẹp quáaaaaaa" -> "đẹp quá"
    return re.sub(r"(.)\1{2,}", r"\1", sentence)

def replace_numbers(sentence):
    # Replace every number with the token [number]
    return re.sub(r"\d+", "[number]", sentence)

def tokenize_sentence(sentence):
    # Word segmentation with underthesea
    return ' '.join(word_tokenize(sentence))

# ========== VOCABULARY CLASS ==========

class Vocabulary:
    def __init__(self):
        self.word2id = {}
        self.word2id['<pad>'] = 0
        self.word2id['<unk>'] = 1
        self.unk_id = 1
        self.id2word = {0: '<pad>', 1: '<unk>'}

    def __getitem__(self, word):
        return self.word2id.get(word, self.unk_id)

    def __contains__(self, word):
        return word in self.word2id

    def __len__(self):
        return len(self.word2id)

    def lookup_tokens(self, indices):
        return [self.id2word[idx] for idx in indices]

    def add(self, word):
        if word not in self.word2id:
            idx = len(self.word2id)
            self.word2id[word] = idx
            self.id2word[idx] = word

    @staticmethod
    def tokenize_corpus(corpus):
        tokenized_corpus = []
        for doc in tqdm(corpus, desc="Tokenizing Corpus"):
            tokens = [w.replace(" ", "_") for w in word_tokenize(doc)]
            tokenized_corpus.append(tokens)
        return tokenized_corpus

    def corpus_to_tensor(self, corpus, is_tokenized=False):
        """
        corpus: list of sentences (strings) or list of token lists (if is_tokenized=True)
        return: list[list[int]], each sentence as a list of token indices
        """
        tokenized_corpus = (
            self.tokenize_corpus(corpus) if not is_tokenized else corpus
        )
        return [
            [self[token] for token in doc]
            for doc in tokenized_corpus
        ]

# ========== EMOJI MAPPING ==========

emoji_mapping = {
    "😀": "[joy]", "😃": "[joy]", "😄": "[joy]", "😁": "[joy]", "😆": "[joy]", "😅": "[joy]", "😂": "[joy]", "🤣": "[joy]",
    "🙂": "[love]", "🙃": "[love]", "😉": "[love]", "😊": "[love]", "😇": "[love]", "🥰": "[love]", "😍": "[love]",
    "🤩": "[love]", "😘": "[love]", "😗": "[love]", "☺": "[love]", "😚": "[love]", "😙": "[love]",
    "😋": "[satisfaction]", "😛": "[satisfaction]", "😜": "[satisfaction]", "🤪": "[satisfaction]", "😝": "[satisfaction]",
    "🤑": "[satisfaction]",
    "🤐": "[neutral]", "🤨": "[neutral]", "😐": "[neutral]", "😑": "[neutral]", "😶": "[neutral]",
    "😏": "[sarcasm]",
    "😒": "[disappointment]", "🙄": "[disappointment]", "😬": "[disappointment]",
    "😔": "[sadness]", "😪": "[sadness]", "😢": "[sadness]", "😭": "[sadness]", "😥": "[sadness]", "😓": "[sadness]",
    "😩": "[tiredness]", "😫": "[tiredness]", "🥱": "[tiredness]",
    "🤤": "[discomfort]", "🤢": "[discomfort]", "🤮": "[discomfort]", "🤧": "[discomfort]", "🥵": "[discomfort]",
    "🥶": "[discomfort]", "🥴": "[discomfort]", "😵": "[discomfort]", "🤯": "[discomfort]",
    "😕": "[confused]", "😟": "[confused]", "🙁": "[confused]", "☹": "[confused]",
    "😮": "[surprise]", "😯": "[surprise]", "😲": "[surprise]", "😳": "[surprise]", "🥺": "[pleading]",
    "😦": "[fear]", "😧": "[fear]", "😨": "[fear]", "😰": "[fear]", "😱": "[fear]",
    "😖": "[confusion]", "😣": "[confusion]", "😞": "[confusion]",
    "😤": "[anger]", "😡": "[anger]", "😠": "[anger]", "🤬": "[anger]", "😈": "[mischievous]", "👿": "[mischievous]"
}

# ========== DATA MANAGER ==========

class DataManager:
    def __init__(self, file_path, abbreviations_path, word2vec_path):
        self.file_path = file_path
        self.abbreviations_path = abbreviations_path
        self.word2vec_path = word2vec_path
        self.vocabulary = None
        self.word_embeddings = None
        self.abbreviations = None
        self.load_abbreviations()

    def load_abbreviations(self):
        with open(self.abbreviations_path, "r", encoding="utf-8") as f:
            self.abbreviations = json.load(f)

    def load_word2vec(self):
        """
        Load vectors from the word2vec file,
        using torchtext.Vectors to load the pretrained embeddings.
        """
        self.word_embeddings = Vectors(
            name=self.word2vec_path,
            unk_init=torch.Tensor.normal_
        )

    def create_vocab_from_corpus(self, corpus, max_vocab_size=30000):
        """
        Build a vocabulary from the corpus, keeping only the top max_vocab_size words.
        """
        vocab = Vocabulary()
        from collections import Counter
        counter = Counter()

        for sent in corpus:
            for token in sent.split():
                counter[token] += 1

        most_common = counter.most_common(max_vocab_size)
        for word, _freq in most_common:
            vocab.add(word)

        return vocab

    def preprocess_data(self):
        df = pd.read_excel(self.file_path)
        if "Sentence" not in df.columns:
            raise ValueError("Cột 'Sentence' không tồn tại trong dataset!")

        # Preprocess each sentence
        df["processed_sentence"] = df["Sentence"].apply(
            lambda x: preprocess_sentence(str(x), self.abbreviations, emoji_mapping)
        )

        # Drop empty rows
        df = df[df["processed_sentence"].str.strip().astype(bool)]

        # Build the vocabulary from the data itself
        all_sentences = df["processed_sentence"].tolist()
        self.vocabulary = self.create_vocab_from_corpus(all_sentences, max_vocab_size=30000)

        # Load word2vec
        self.load_word2vec()

        return df

    def build_pretrained_embedding_matrix(self, embedding_dim=100):
        """
        Build a weight matrix (numpy, vocab_size x embedding_dim)
        initialized with the pretrained weights.
        """
        vocab_size = len(self.vocabulary)
        weight_matrix = np.random.normal(
            scale=0.1, size=(vocab_size, embedding_dim)
        ).astype(np.float32)

        # Copy the pretrained vectors
        for word, idx in self.vocabulary.word2id.items():
            if word in self.word_embeddings.stoi:
                weight_matrix[idx] = self.word_embeddings.vectors[
                    self.word_embeddings.stoi[word]
                ]

        return weight_matrix

    def split_and_convert(
        self, df, label_column="Emotion", maxlen=400, test_size=0.2,
        for_keras=False, batch_size=32
    ):
        """
        Split the data into train/test or train/val/test.
        - for_keras=False -> return train_loader, test_loader, label_mapping (PyTorch)
        - for_keras=True  -> return X_train, X_test, y_train_onehot, y_test_onehot, label_mapping (Keras)
        """
        if label_column not in df.columns:
            raise ValueError(
                f"Cột '{label_column}' không tồn tại. Hiện có: {df.columns.tolist()}"
            )

        # Map labels to integer ids
        label_mapping = {label: idx for idx, label in enumerate(df[label_column].unique())}
        df[label_column] = df[label_column].map(label_mapping)
        if df[label_column].isnull().any():
            missing = df[df[label_column].isnull()][label_column].unique()
            raise ValueError(f"Những nhãn cảm xúc sau không có trong label_mapping: {missing}")

        X = df["processed_sentence"].tolist()
        y = df[label_column].tolist()

        # Stratify to preserve the class distribution
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42, stratify=y
        )

        if not for_keras:
            # Split train into train and validation
            X_train, X_val, y_train, y_val = train_test_split(
                X_train, y_train, test_size=0.1, random_state=42, stratify=y_train
            )

        # Convert text -> indices
        X_train_ids = self.vocabulary.corpus_to_tensor(X_train, is_tokenized=False)
        X_test_ids = self.vocabulary.corpus_to_tensor(X_test, is_tokenized=False)

        if not for_keras:
            X_val_ids = self.vocabulary.corpus_to_tensor(X_val, is_tokenized=False)

        # Pad
        X_train_padded = pad_sequences(X_train_ids, maxlen=maxlen, padding='post', truncating='post')
        X_test_padded = pad_sequences(X_test_ids, maxlen=maxlen, padding='post', truncating='post')

        if not for_keras:
            X_val_padded = pad_sequences(X_val_ids, maxlen=maxlen, padding='post', truncating='post')

        print(">>> Debug Split and Convert:")
        print("X_train_padded.shape:", X_train_padded.shape)
        print("X_val_padded.shape: ", X_val_padded.shape if not for_keras else "N/A")
        print("X_test_padded.shape: ", X_test_padded.shape)
        print("y_train length:", len(y_train))
        print("y_val length: ", len(y_val) if not for_keras else "N/A")
        print("y_test length: ", len(y_test))
        print("vocab_size:", len(self.vocabulary))

        if for_keras:
            num_classes = len(label_mapping)
            y_train_onehot = tf.keras.utils.to_categorical(
                y_train,
                num_classes=num_classes
            )
            y_test_onehot = tf.keras.utils.to_categorical(
                y_test,
                num_classes=num_classes
            )

            print("y_train_onehot.shape:", y_train_onehot.shape)
            print("y_test_onehot.shape: ", y_test_onehot.shape)

            return X_train_padded, X_test_padded, y_train_onehot, y_test_onehot, label_mapping
        else:
            # Convert the validation set
            X_val_ids = self.vocabulary.corpus_to_tensor(X_val, is_tokenized=False)
            X_val_padded = pad_sequences(X_val_ids, maxlen=maxlen, padding='post', truncating='post')

            X_train_t = torch.tensor(X_train_padded, dtype=torch.long)
            X_val_t = torch.tensor(X_val_padded, dtype=torch.long)
            X_test_t = torch.tensor(X_test_padded, dtype=torch.long)
            y_train_t = torch.tensor(y_train, dtype=torch.long)
            y_val_t = torch.tensor(y_val, dtype=torch.long)
            y_test_t = torch.tensor(y_test, dtype=torch.long)

            train_ds = TensorDataset(X_train_t, y_train_t)
            val_ds = TensorDataset(X_val_t, y_val_t)
            test_ds = TensorDataset(X_test_t, y_test_t)

            train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
            val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)
            test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

            return train_loader, val_loader, test_loader, label_mapping

# ========== KERAS BI-LSTM MODEL ==========

def predict_emotion_bilstm(model, text, data_manager, label_mapping):
    processed_text = preprocess_sentence(text, data_manager.abbreviations, emoji_mapping)
    tokenized_text = data_manager.vocabulary.tokenize_corpus([processed_text])
    text_ids = data_manager.vocabulary.corpus_to_tensor(tokenized_text, is_tokenized=True)
    text_padded = pad_sequences(text_ids, maxlen=400, padding='post', truncating='post')
    output = model.predict(text_padded)
    pred = output.argmax(axis=1)[0]
    rev_map = {v: k for k, v in label_mapping.items()}
    return rev_map[pred]

# ========== MAIN ==========

if __name__ == "__main__":
    from keras.models import Model
    from keras.layers import (
        Input, Embedding, Dense, Dropout, Bidirectional, LSTM
    )
    from keras.optimizers import Adam
    from keras.callbacks import ModelCheckpoint, EarlyStopping

    # -------- PATHS ----------
    file_path = "train.xlsx"
    abbreviations_path = "abbreviations.json"
    word2vec_path = "word2vec_vi_syllables_100dims.txt"
    output_path = "processed.xlsx"

    # Initialize the DataManager
    data_manager = DataManager(
        file_path=file_path,
        abbreviations_path=abbreviations_path,
        word2vec_path=word2vec_path
    )

    # 1) Preprocess, build the vocab, load word2vec
    df = data_manager.preprocess_data()
    print("Trước khi cân bằng lớp (undersampling/oversampling):")
    print(df["Emotion"].value_counts())

    # 2) Balance the classes (example: oversample 'Other' up to 3000)
    # Adjust this to your own needs
    df_enjoyment = df[df["Emotion"] == "Enjoyment"]
    df_other = df[df["Emotion"] == "Other"]
    df_anger = df[df["Emotion"] == "Anger"]
    df_sadness = df[df["Emotion"] == "Sadness"]
    df_disgust = df[df["Emotion"] == "Disgust"]
    df_fear = df[df["Emotion"] == "Fear"]
    df_surprise = df[df["Emotion"] == "Surprise"]

    # Oversample the 'Other' class up to 3000 (illustration only)
    if len(df_other) < 3000:
        df_other_oversampled = resample(
            df_other,
            replace=True,
            n_samples=3000,
            random_state=42
        )
    else:
        df_other_oversampled = df_other

    # Keep the other classes unchanged (or oversample as desired)
    df_balanced = pd.concat([
        df_enjoyment,
        df_other_oversampled,
        df_anger,
        df_sadness,
        df_disgust,
        df_fear,
        df_surprise
    ], axis=0)

    df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
    df = df_balanced

    print("\nSau khi cân bằng lớp (demo oversample):")
    print(df["Emotion"].value_counts())

    # Export to file (optional)
    df.to_excel(output_path, index=False)

    # ========== TRAIN BI-LSTM KERAS ==========

    print("\n========== Training Keras BiLSTM ==========")

    # Build the pretrained embedding matrix for Keras
    pretrained_matrix = data_manager.build_pretrained_embedding_matrix(embedding_dim=100)
    pretrained_matrix_keras = pretrained_matrix.astype(np.float32)

    # Split data for Keras
    X_train, X_test, y_train, y_test, label_mapping = data_manager.split_and_convert(
        df, label_column="Emotion", maxlen=400,
        test_size=0.2, for_keras=True
    )

    num_classes = len(label_mapping)
    input_dim = len(data_manager.vocabulary)
    embedding_dim = pretrained_matrix.shape[1]
    maxlen = 400

    # Define the BiLSTM model
    def create_bilstm_model():
        input_layer = Input(shape=(maxlen,), dtype='int32', name='main_input')
        emb_layer = Embedding(
            input_dim=input_dim,
            output_dim=embedding_dim,
            weights=[pretrained_matrix_keras],
            input_length=maxlen,
            trainable=True  # Set to False if you do not want to fine-tune the embeddings
        )(input_layer)

        bilstm = Bidirectional(LSTM(128, dropout=0.5, recurrent_dropout=0.5))(emb_layer)
        dense1 = Dense(64, activation='relu')(bilstm)
        dropout1 = Dropout(0.5)(dense1)
        dense2 = Dense(32, activation='relu')(dropout1)
        dropout2 = Dropout(0.5)(dense2)
        output_layer = Dense(num_classes, activation='softmax')(dropout2)

        model = Model(inputs=input_layer, outputs=output_layer)
        model.compile(
            loss='categorical_crossentropy',
            optimizer=Adam(lr=1e-3),
            metrics=['accuracy']
        )
        return model

    # Create the model
    model_bilstm = create_bilstm_model()
    model_bilstm.summary()

    # Define callbacks
    checkpoint = ModelCheckpoint(
        'bilstm_best.keras',
        save_best_only=True,
        monitor='val_accuracy',
        mode='max'
    )
    early_stopping = EarlyStopping(
        monitor='val_accuracy',
        patience=5,
        restore_best_weights=True
    )

    # Train the model
    history = model_bilstm.fit(
        X_train, y_train,
        validation_data=(X_test, y_test),
        epochs=100,
        batch_size=32,
        callbacks=[checkpoint, early_stopping]
    )

    # Evaluate on the test set with detailed metrics
    loss, acc = model_bilstm.evaluate(X_test, y_test)
    print(f"BiLSTM Test Loss: {loss:.4f}, Test Accuracy: {acc:.4f}")

    # Collect predictions and compute the metrics
    y_pred_bilstm = model_bilstm.predict(X_test)
    y_pred_bilstm = np.argmax(y_pred_bilstm, axis=1)
    y_true_bilstm = np.argmax(y_test, axis=1)

    test_accuracy_bilstm = accuracy_score(y_true_bilstm, y_pred_bilstm)
    precision_macro_bilstm = precision_score(y_true_bilstm, y_pred_bilstm, average='macro', zero_division=0)
    precision_weighted_bilstm = precision_score(y_true_bilstm, y_pred_bilstm, average='weighted', zero_division=0)
    recall_macro_bilstm = recall_score(y_true_bilstm, y_pred_bilstm, average='macro', zero_division=0)
    recall_weighted_bilstm = recall_score(y_true_bilstm, y_pred_bilstm, average='weighted', zero_division=0)
    f1_macro_bilstm = f1_score(y_true_bilstm, y_pred_bilstm, average='macro', zero_division=0)
    f1_weighted_bilstm = f1_score(y_true_bilstm, y_pred_bilstm, average='weighted', zero_division=0)
    report_bilstm = classification_report(y_true_bilstm, y_pred_bilstm, target_names=label_mapping.keys(), digits=4)
    conf_matrix_bilstm = confusion_matrix(y_true_bilstm, y_pred_bilstm)

    # Print the metrics
    print(f"\nBiLSTM Test Accuracy: {test_accuracy_bilstm:.4f}")
    print(f"Precision (Macro): {precision_macro_bilstm:.4f}")
    print(f"Precision (Weighted): {precision_weighted_bilstm:.4f}")
    print(f"Recall (Macro): {recall_macro_bilstm:.4f}")
    print(f"Recall (Weighted): {recall_weighted_bilstm:.4f}")
    print(f"F1-Score (Macro): {f1_macro_bilstm:.4f}")
    print(f"F1-Score (Weighted): {f1_weighted_bilstm:.4f}")

    print("\n========== BiLSTM Classification Report ==========")
    print(report_bilstm)

    print("\n========== BiLSTM Confusion Matrix ==========")
    print(conf_matrix_bilstm)

    # Save the report to a file
    bilstm_report_dir = "bilstm_emotion_model"
    os.makedirs(bilstm_report_dir, exist_ok=True)
    with open(os.path.join(bilstm_report_dir, "classification_report.txt"), "w", encoding="utf-8") as f:
        f.write("========== BiLSTM Classification Report ==========\n")
        f.write(report_bilstm)
        f.write("\n========== Additional Metrics ==========\n")
        f.write(f"Test Loss: {loss:.4f}\n")
        f.write(f"Test Accuracy: {test_accuracy_bilstm:.4f}\n")
        f.write(f"Precision (Macro): {precision_macro_bilstm:.4f}\n")
        f.write(f"Precision (Weighted): {precision_weighted_bilstm:.4f}\n")
        f.write(f"Recall (Macro): {recall_macro_bilstm:.4f}\n")
        f.write(f"Recall (Weighted): {recall_weighted_bilstm:.4f}\n")
        f.write(f"F1-Score (Macro): {f1_macro_bilstm:.4f}\n")
        f.write(f"F1-Score (Weighted): {f1_weighted_bilstm:.4f}\n")
        f.write("\n========== Confusion Matrix ==========\n")
        f.write(np.array2string(conf_matrix_bilstm))

    print("\n========== BiLSTM Classification Report saved to 'bilstm_emotion_model/classification_report.txt' ==========")

    # Save the BiLSTM model
    model_bilstm.save(os.path.join(bilstm_report_dir, 'bilstm_model.keras'))
    print(f"========== BiLSTM Model saved to '{bilstm_report_dir}/bilstm_model.keras' ==========")

    # ========== DEMO: PREDICT ONE NEW SENTENCE ==========

    custom_text = "Tôi rất vui khi sử dụng dịch vụ này!"

    # BiLSTM (Keras)
    emotion_bilstm = predict_emotion_bilstm(
        model_bilstm, custom_text, data_manager, label_mapping
    )
    print(f"Predicted Emotion (BiLSTM): {emotion_bilstm}")

    # Check TF and the GPU
    print("TF version:", tf.__version__)
    print("GPU devices:", tf.config.list_physical_devices("GPU"))
    # os.system("nvidia-smi")  # if you want GPU info

    # ========== SAVE LABEL MAPPING AND VOCABULARY ==========
    # Save label_mapping and vocabulary for the BiLSTM
    with open(os.path.join(bilstm_report_dir, "label_mapping.json"), "w", encoding="utf-8") as f:
        json.dump(label_mapping, f, ensure_ascii=False, indent=4)

    with open(os.path.join(bilstm_report_dir, "vocabulary.json"), "w", encoding="utf-8") as f:
        json.dump(data_manager.vocabulary.word2id, f, ensure_ascii=False, indent=4)

    print("========== Label Mapping and Vocabulary saved ==========")
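main_BILSTM.py saves three inference artifacts (bilstm_model.keras, vocabulary.json, label_mapping.json). A minimal sketch of reloading them for prediction, assuming the same 400-token post-padding used during training; note the preprocessing here is simplified to lowercase whitespace tokenization, whereas the real pipeline also runs the full preprocess_sentence() chain (emoji mapping, abbreviation expansion, underthesea word segmentation):

import json
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

model = tf.keras.models.load_model("bilstm_emotion_model/bilstm_model.keras")

with open("bilstm_emotion_model/vocabulary.json", encoding="utf-8") as f:
    word2id = json.load(f)
with open("bilstm_emotion_model/label_mapping.json", encoding="utf-8") as f:
    label_mapping = json.load(f)
id2label = {v: k for k, v in label_mapping.items()}

def predict(sentence, maxlen=400):
    # Simplified preprocessing: map tokens to ids, unknown words to <unk>.
    ids = [word2id.get(tok, word2id["<unk>"]) for tok in sentence.lower().split()]
    padded = pad_sequences([ids], maxlen=maxlen, padding="post", truncating="post")
    probs = model.predict(padded, verbose=0)
    return id2label[int(np.argmax(probs, axis=1)[0])]

print(predict("Tôi rất vui khi sử dụng dịch vụ này!"))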
main_RNN_CNN-LSTM.py
ADDED
@@ -0,0 +1,738 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# thesis.py
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
import pandas as pd
|
5 |
+
import emoji
|
6 |
+
import json
|
7 |
+
import re
|
8 |
+
import numpy as np
|
9 |
+
from underthesea import word_tokenize
|
10 |
+
from tqdm import tqdm
|
11 |
+
import torch
|
12 |
+
from torchtext.vocab import Vectors
|
13 |
+
from sklearn.model_selection import train_test_split
|
14 |
+
from sklearn.utils import resample
|
15 |
+
from sklearn.metrics import (
|
16 |
+
accuracy_score,
|
17 |
+
classification_report,
|
18 |
+
precision_score,
|
19 |
+
recall_score,
|
20 |
+
f1_score,
|
21 |
+
confusion_matrix
|
22 |
+
)
|
23 |
+
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
24 |
+
from torch.utils.data import DataLoader, TensorDataset
|
25 |
+
import torch.nn as nn
|
26 |
+
import torch.optim as optim
|
27 |
+
import tensorflow as tf
|
28 |
+
import os
|
29 |
+
import joblib
|
30 |
+
|
31 |
+
# ========== CÁC HÀM TIỀN XỬ LÝ ==========
|
32 |
+
|
33 |
+
def preprocess_sentence(sentence, abbreviations, emoji_mapping):
|
34 |
+
"""
|
35 |
+
Tiền xử lý 1 câu: chuyển thường, thay thế emoji, xóa từ thô tục,
|
36 |
+
ký tự đặc biệt, chuẩn hóa khoảng trắng, v.v.
|
37 |
+
"""
|
38 |
+
sentence = sentence.lower()
|
39 |
+
sentence = replace_emojis(sentence, emoji_mapping)
|
40 |
+
sentence = remove_profanity(sentence)
|
41 |
+
sentence = remove_special_characters(sentence)
|
42 |
+
sentence = normalize_whitespace(sentence)
|
43 |
+
sentence = replace_abbreviations(sentence, abbreviations)
|
44 |
+
sentence = remove_repeated_characters(sentence)
|
45 |
+
sentence = replace_numbers(sentence)
|
46 |
+
sentence = tokenize_sentence(sentence)
|
47 |
+
return sentence
|
48 |
+
|
49 |
+
def replace_emojis(sentence, emoji_mapping):
|
50 |
+
processed_sentence = []
|
51 |
+
for char in sentence:
|
52 |
+
if char in emoji_mapping:
|
53 |
+
processed_sentence.append(emoji_mapping[char])
|
54 |
+
elif not emoji.is_emoji(char):
|
55 |
+
processed_sentence.append(char)
|
56 |
+
return ''.join(processed_sentence)
|
57 |
+
|
58 |
+
def remove_profanity(sentence):
|
59 |
+
profane_words = ["loz", "vloz", "vl", "dm", "đm", "clgt", "dmm", "cc", "vc", "đù mé", "vãi"]
|
60 |
+
words = sentence.split()
|
61 |
+
filtered_words = [word for word in words if word.lower() not in profane_words]
|
62 |
+
return ' '.join(filtered_words)
|
63 |
+
|
64 |
+
def remove_special_characters(sentence):
|
65 |
+
return re.sub(r"[\^\*@#&$%<>~{}|\\]", "", sentence)
|
66 |
+
|
67 |
+
def normalize_whitespace(sentence):
|
68 |
+
return ' '.join(sentence.split())
|
69 |
+
|
70 |
+
def replace_abbreviations(sentence, abbreviations):
|
71 |
+
words = sentence.split()
|
72 |
+
replaced_words = [
|
73 |
+
" ".join(abbreviations[word]) if word in abbreviations else word
|
74 |
+
for word in words
|
75 |
+
]
|
76 |
+
return ' '.join(replaced_words)
|
77 |
+
|
78 |
+
def remove_repeated_characters(sentence):
|
79 |
+
# Ví dụ: "đẹp quáaaaaaa" -> "đẹp quá"
|
80 |
+
return re.sub(r"(.)\1{2,}", r"\1", sentence)
|
81 |
+
|
82 |
+
def replace_numbers(sentence):
|
83 |
+
# Thay toàn bộ số bằng token [number]
|
84 |
+
return re.sub(r"\d+", "[number]", sentence)
|
85 |
+
|
86 |
+
def tokenize_sentence(sentence):
|
87 |
+
# Tách từ bằng underthesea
|
88 |
+
return ' '.join(word_tokenize(sentence))
|
89 |
+
|
90 |
+
|
91 |
+
# ========== VOCABULARY CLASS ==========
|
92 |
+
|
93 |
+
class Vocabulary:
|
94 |
+
def __init__(self):
|
95 |
+
self.word2id = {}
|
96 |
+
self.word2id['<pad>'] = 0
|
97 |
+
self.word2id['<unk>'] = 1
|
98 |
+
self.unk_id = 1
|
99 |
+
self.id2word = {0: '<pad>', 1: '<unk>'}
|
100 |
+
|
101 |
+
def __getitem__(self, word):
|
102 |
+
return self.word2id.get(word, self.unk_id)
|
103 |
+
|
104 |
+
def __contains__(self, word):
|
105 |
+
return word in self.word2id
|
106 |
+
|
107 |
+
def __len__(self):
|
108 |
+
return len(self.word2id)
|
109 |
+
|
110 |
+
def lookup_tokens(self, indices):
|
111 |
+
return [self.id2word[idx] for idx in indices]
|
112 |
+
|
113 |
+
def add(self, word):
|
114 |
+
if word not in self.word2id:
|
115 |
+
idx = len(self.word2id)
|
116 |
+
self.word2id[word] = idx
|
117 |
+
self.id2word[idx] = word
|
118 |
+
|
119 |
+
@staticmethod
|
120 |
+
def tokenize_corpus(corpus):
|
121 |
+
tokenized_corpus = []
|
122 |
+
for doc in tqdm(corpus, desc="Tokenizing Corpus"):
|
123 |
+
tokens = [w.replace(" ", "_") for w in word_tokenize(doc)]
|
124 |
+
tokenized_corpus.append(tokens)
|
125 |
+
return tokenized_corpus
|
126 |
+
|
127 |
+
def corpus_to_tensor(self, corpus, is_tokenized=False):
|
128 |
+
"""
|
129 |
+
corpus: list các câu (chuỗi) hoặc list các list từ (nếu is_tokenized=True)
|
130 |
+
return: list[list[int]], mỗi câu là 1 list gồm các chỉ số token
|
131 |
+
"""
|
132 |
+
tokenized_corpus = (
|
133 |
+
self.tokenize_corpus(corpus) if not is_tokenized else corpus
|
134 |
+
)
|
135 |
+
return [
|
136 |
+
[self[token] for token in doc]
|
137 |
+
for doc in tokenized_corpus
|
138 |
+
]
|
139 |
+
|
140 |
+
|
141 |
+
# ========== EMOJI MAPPING ==========
|
142 |
+
|
143 |
+
emoji_mapping = {
|
144 |
+
"😀": "[joy]", "😃": "[joy]", "😄": "[joy]", "😁": "[joy]", "😆": "[joy]", "😅": "[joy]", "😂": "[joy]", "🤣": "[joy]",
|
145 |
+
"🙂": "[love]", "🙃": "[love]", "😉": "[love]", "😊": "[love]", "😇": "[love]", "🥰": "[love]", "😍": "[love]",
|
146 |
+
"🤩": "[love]", "😘": "[love]", "😗": "[love]", "☺": "[love]", "😚": "[love]", "😙": "[love]",
|
147 |
+
"😋": "[satisfaction]", "😛": "[satisfaction]", "😜": "[satisfaction]", "🤪": "[satisfaction]", "😝": "[satisfaction]",
|
148 |
+
"🤑": "[satisfaction]",
|
149 |
+
"🤐": "[neutral]", "🤨": "[neutral]", "😐": "[neutral]", "😑": "[neutral]", "😶": "[neutral]",
|
150 |
+
"😏": "[sarcasm]",
|
151 |
+
"😒": "[disappointment]", "🙄": "[disappointment]", "😬": "[disappointment]",
|
152 |
+
"😔": "[sadness]", "😪": "[sadness]", "😢": "[sadness]", "😭": "[sadness]", "😥": "[sadness]", "😓": "[sadness]",
|
153 |
+
"😩": "[tiredness]", "😫": "[tiredness]", "🥱": "[tiredness]",
|
154 |
+
"🤤": "[discomfort]", "🤢": "[discomfort]", "🤮": "[discomfort]", "🤧": "[discomfort]", "🥵": "[discomfort]",
|
155 |
+
"🥶": "[discomfort]", "🥴": "[discomfort]", "😵": "[discomfort]", "🤯": "[discomfort]",
|
156 |
+
"😕": "[confused]", "😟": "[confused]", "🙁": "[confused]", "☹": "[confused]",
|
157 |
+
"😮": "[surprise]", "😯": "[surprise]", "😲": "[surprise]", "😳": "[surprise]", "🥺": "[pleading]",
|
158 |
+
"😦": "[fear]", "😧": "[fear]", "😨": "[fear]", "😰": "[fear]", "😱": "[fear]",
|
159 |
+
"😖": "[confusion]", "😣": "[confusion]", "😞": "[confusion]",
|
160 |
+
"😤": "[anger]", "😡": "[anger]", "😠": "[anger]", "🤬": "[anger]", "😈": "[mischievous]", "👿": "[mischievous]"
|
161 |
+
}
|
162 |
+
|
163 |
+
def load_abbreviations(path):
|
164 |
+
with open(path, "r", encoding="utf-8") as f:
|
165 |
+
return json.load(f)
|
166 |
+
|
167 |
+
|
168 |
+
# ========== DATA MANAGER ==========
|
169 |
+
|
170 |
+
class DataManager:
|
171 |
+
def __init__(self, file_path, abbreviations_path, word2vec_path):
|
172 |
+
self.file_path = file_path
|
173 |
+
self.abbreviations_path = abbreviations_path
|
174 |
+
self.word2vec_path = word2vec_path
|
175 |
+
self.vocabulary = None
|
176 |
+
self.word_embeddings = None
|
177 |
+
self.abbreviations = None
|
178 |
+
self.load_abbreviations()
|
179 |
+
|
180 |
+
def load_abbreviations(self):
|
181 |
+
with open(self.abbreviations_path, "r", encoding="utf-8") as f:
|
182 |
+
self.abbreviations = json.load(f)
|
183 |
+
|
184 |
+
def load_word2vec(self):
|
185 |
+
"""
|
186 |
+
Load vectors from the word2vec file,
|
187 |
+
using torchtext Vectors to load the pretrained embeddings.
|
188 |
+
"""
|
189 |
+
self.word_embeddings = Vectors(
|
190 |
+
name=self.word2vec_path,
|
191 |
+
unk_init=torch.Tensor.normal_
|
192 |
+
)
|
193 |
+
|
194 |
+
def create_vocab_from_corpus(self, corpus, max_vocab_size=30000):
|
195 |
+
"""
|
196 |
+
Build a vocabulary from the corpus, keeping only the top max_vocab_size words.
|
197 |
+
"""
|
198 |
+
vocab = Vocabulary()
|
199 |
+
from collections import Counter
|
200 |
+
counter = Counter()
|
201 |
+
|
202 |
+
for sent in corpus:
|
203 |
+
for token in sent.split():
|
204 |
+
counter[token] += 1
|
205 |
+
|
206 |
+
most_common = counter.most_common(max_vocab_size)
|
207 |
+
for word, _freq in most_common:
|
208 |
+
vocab.add(word)
|
209 |
+
|
210 |
+
return vocab
|
211 |
+
|
212 |
+
def preprocess_data(self):
|
213 |
+
df = pd.read_excel(self.file_path)
|
214 |
+
if "Sentence" not in df.columns:
|
215 |
+
raise ValueError("Cột 'Sentence' không tồn tại trong dataset!")
|
216 |
+
|
217 |
+
# Preprocess each sentence
|
218 |
+
df["processed_sentence"] = df["Sentence"].apply(
|
219 |
+
lambda x: preprocess_sentence(str(x), self.abbreviations, emoji_mapping)
|
220 |
+
)
|
221 |
+
|
222 |
+
# Drop empty rows
|
223 |
+
df = df[df["processed_sentence"].str.strip().astype(bool)]
|
224 |
+
|
225 |
+
# Build the vocab from the data itself
|
226 |
+
all_sentences = df["processed_sentence"].tolist()
|
227 |
+
self.vocabulary = self.create_vocab_from_corpus(all_sentences, max_vocab_size=30000)
|
228 |
+
|
229 |
+
# Load word2vec
|
230 |
+
self.load_word2vec()
|
231 |
+
|
232 |
+
return df
|
233 |
+
|
234 |
+
def build_pretrained_embedding_matrix(self, embedding_dim=100):
|
235 |
+
"""
|
236 |
+
Build a numpy weight_matrix of shape (vocab_size x embedding_dim)
|
237 |
+
initialized with the pretrained weights.
|
238 |
+
"""
|
239 |
+
vocab_size = len(self.vocabulary)
|
240 |
+
weight_matrix = np.random.normal(
|
241 |
+
scale=0.1, size=(vocab_size, embedding_dim)
|
242 |
+
).astype(np.float32)
|
243 |
+
|
244 |
+
# Copy over the pretrained vectors
|
245 |
+
for word, idx in self.vocabulary.word2id.items():
|
246 |
+
if word in self.word_embeddings.stoi:
|
247 |
+
weight_matrix[idx] = self.word_embeddings.vectors[
|
248 |
+
self.word_embeddings.stoi[word]
|
249 |
+
]
|
250 |
+
|
251 |
+
return weight_matrix
|
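    # Shape note (illustrative): with the defaults the matrix is (len(self.vocabulary), 100);
    # row 0 (<pad>) and row 1 (<unk>) keep their random initialization unless word2vec
    # happens to contain those tokens. The same matrix feeds both the PyTorch nn.Embedding
    # and the Keras Embedding layer further below.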
252 |
+
|
253 |
+
def split_and_convert(
|
254 |
+
self, df, label_column="Emotion", maxlen=400, test_size=0.2,
|
255 |
+
for_keras=False, batch_size=32
|
256 |
+
):
|
257 |
+
"""
|
258 |
+
Split the data into train/test.
|
259 |
+
- for_keras=False → return train_loader, test_loader, label_mapping (PyTorch)
|
260 |
+
- for_keras=True → return X_train, X_test, y_train_onehot, y_test_onehot, label_mapping (Keras)
|
261 |
+
"""
|
262 |
+
if label_column not in df.columns:
|
263 |
+
raise ValueError(
|
264 |
+
f"Cột '{label_column}' không tồn tại. Hiện có: {df.columns.tolist()}"
|
265 |
+
)
|
266 |
+
|
267 |
+
# Build the label -> id mapping
|
268 |
+
label_mapping = {label: idx for idx, label in enumerate(df[label_column].unique())}
|
269 |
+
df[label_column] = df[label_column].map(label_mapping)
|
270 |
+
if df[label_column].isnull().any():
|
271 |
+
missing = df[df[label_column].isnull()][label_column].unique()
|
272 |
+
raise ValueError(f"Những nhãn cảm xúc sau không có trong label_mapping: {missing}")
|
273 |
+
|
274 |
+
X = df["processed_sentence"].tolist()
|
275 |
+
y = df[label_column].tolist()
|
276 |
+
|
277 |
+
# Stratify to maintain class distribution
|
278 |
+
X_train, X_test, y_train, y_test = train_test_split(
|
279 |
+
X, y, test_size=test_size, random_state=42, stratify=y
|
280 |
+
)
|
281 |
+
|
282 |
+
# Convert text -> index
|
283 |
+
X_train_ids = self.vocabulary.corpus_to_tensor(X_train, is_tokenized=False)
|
284 |
+
X_test_ids = self.vocabulary.corpus_to_tensor(X_test, is_tokenized=False)
|
285 |
+
|
286 |
+
# Pad
|
287 |
+
X_train_padded = pad_sequences(X_train_ids, maxlen=maxlen, padding='post', truncating='post')
|
288 |
+
X_test_padded = pad_sequences(X_test_ids, maxlen=maxlen, padding='post', truncating='post')
|
289 |
+
|
290 |
+
print(">>> Debug Split and Convert:")
|
291 |
+
print("X_train_padded.shape:", X_train_padded.shape)
|
292 |
+
print("X_test_padded.shape: ", X_test_padded.shape)
|
293 |
+
print("y_train length:", len(y_train))
|
294 |
+
print("y_test length: ", len(y_test))
|
295 |
+
print("vocab_size:", len(self.vocabulary))
|
296 |
+
|
297 |
+
if for_keras:
|
298 |
+
num_classes = len(label_mapping)
|
299 |
+
y_train_onehot = torch.nn.functional.one_hot(
|
300 |
+
torch.tensor(y_train),
|
301 |
+
num_classes=num_classes
|
302 |
+
).numpy()
|
303 |
+
y_test_onehot = torch.nn.functional.one_hot(
|
304 |
+
torch.tensor(y_test),
|
305 |
+
num_classes=num_classes
|
306 |
+
).numpy()
|
307 |
+
|
308 |
+
print("y_train_onehot.shape:", y_train_onehot.shape)
|
309 |
+
print("y_test_onehot.shape: ", y_test_onehot.shape)
|
310 |
+
|
311 |
+
return X_train_padded, X_test_padded, y_train_onehot, y_test_onehot, label_mapping
|
312 |
+
else:
|
313 |
+
# Return DataLoaders
|
314 |
+
X_train_t = torch.tensor(X_train_padded, dtype=torch.long)
|
315 |
+
X_test_t = torch.tensor(X_test_padded, dtype=torch.long)
|
316 |
+
y_train_t = torch.tensor(y_train, dtype=torch.long)
|
317 |
+
y_test_t = torch.tensor(y_test, dtype=torch.long)
|
318 |
+
|
319 |
+
train_ds = TensorDataset(X_train_t, y_train_t)
|
320 |
+
test_ds = TensorDataset(X_test_t, y_test_t)
|
321 |
+
|
322 |
+
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
|
323 |
+
test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)
|
324 |
+
|
325 |
+
return train_loader, test_loader, label_mapping
|
326 |
+
|
327 |
+
|
328 |
+
# ========== PYTORCH RNN MODEL ==========
|
329 |
+
|
330 |
+
class SimpleRNN(nn.Module):
|
331 |
+
def __init__(self, pretrained_weight, hidden_dim, output_dim, dropout=0.3):
|
332 |
+
super(SimpleRNN, self).__init__()
|
333 |
+
vocab_size, embedding_dim = pretrained_weight.shape
|
334 |
+
# Build nn.Embedding from pretrained_weight
|
335 |
+
self.embedding = nn.Embedding.from_pretrained(
|
336 |
+
torch.from_numpy(pretrained_weight),
|
337 |
+
freeze=False  # set True to keep the embedding frozen
|
338 |
+
)
|
339 |
+
self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
|
340 |
+
self.dropout = nn.Dropout(dropout)
|
341 |
+
self.fc = nn.Linear(hidden_dim, output_dim)
|
342 |
+
|
343 |
+
def forward(self, x):
|
344 |
+
embedded = self.dropout(self.embedding(x))
|
345 |
+
_, (hidden, _) = self.rnn(embedded)
|
346 |
+
hidden = self.dropout(hidden.squeeze(0))
|
347 |
+
output = self.fc(hidden)
|
348 |
+
return output
|
349 |
+
|
350 |
+
|
351 |
+
def predict_emotion_rnn(model, text, data_manager, label_mapping, device):
|
352 |
+
model.eval()
|
353 |
+
with torch.no_grad():
|
354 |
+
processed_text = preprocess_sentence(text, data_manager.abbreviations, emoji_mapping)
|
355 |
+
tokenized_text = data_manager.vocabulary.tokenize_corpus([processed_text])
|
356 |
+
text_ids = data_manager.vocabulary.corpus_to_tensor(tokenized_text, is_tokenized=True)
|
357 |
+
text_padded = pad_sequences(text_ids, maxlen=400, padding='post', truncating='post')
|
358 |
+
text_tensor = torch.tensor(
|
359 |
+
text_padded,
|
360 |
+
dtype=torch.long
|
361 |
+
).to(device)
|
362 |
+
|
363 |
+
output = model(text_tensor)
|
364 |
+
_, predicted = torch.max(output, 1)
|
365 |
+
rev_map = {v: k for k, v in label_mapping.items()}
|
366 |
+
return rev_map[predicted.item()]
|
367 |
+
|
368 |
+
|
369 |
+
# ========== KERAS CNN-LSTM MODEL ==========
|
370 |
+
|
371 |
+
def predict_emotion_cnn_lstm(model, text, data_manager, label_mapping):
|
372 |
+
processed_text = preprocess_sentence(text, data_manager.abbreviations, emoji_mapping)
|
373 |
+
tokenized_text = data_manager.vocabulary.tokenize_corpus([processed_text])
|
374 |
+
text_ids = data_manager.vocabulary.corpus_to_tensor(tokenized_text, is_tokenized=True)
|
375 |
+
text_padded = pad_sequences(text_ids, maxlen=400, padding='post', truncating='post')
|
376 |
+
output = model.predict(text_padded)
|
377 |
+
pred = output.argmax(axis=1)[0]
|
378 |
+
rev_map = {v: k for k, v in label_mapping.items()}
|
379 |
+
return rev_map[pred]
|
380 |
+
|
381 |
+
|
382 |
+
# ========== MAIN ==========
|
383 |
+
|
384 |
+
if __name__ == "__main__":
|
385 |
+
from keras.models import Model
|
386 |
+
from keras.layers import (
|
387 |
+
Input, Embedding, Convolution1D, LSTM, Dense, Dropout, Lambda, concatenate
|
388 |
+
)
|
389 |
+
from keras.optimizers import Adam
|
390 |
+
from keras.callbacks import ModelCheckpoint, EarlyStopping
|
391 |
+
|
392 |
+
# -------- PATHS ----------
|
393 |
+
file_path = "train.xlsx"
|
394 |
+
abbreviations_path = "abbreviations.json"
|
395 |
+
word2vec_path = "word2vec_vi_syllables_100dims.txt"
|
396 |
+
output_path = "processed.xlsx"
|
397 |
+
|
398 |
+
# Initialize the DataManager
|
399 |
+
data_manager = DataManager(
|
400 |
+
file_path=file_path,
|
401 |
+
abbreviations_path=abbreviations_path,
|
402 |
+
word2vec_path=word2vec_path
|
403 |
+
)
|
404 |
+
|
405 |
+
# 1) Preprocess, build the vocab, load word2vec
|
406 |
+
df = data_manager.preprocess_data()
|
407 |
+
print("Trước khi cân bằng lớp (undersampling/oversampling):")
|
408 |
+
print(df["Emotion"].value_counts())
|
409 |
+
|
410 |
+
# 2) Balance the classes (example: oversample 'Other' to 3000)
|
411 |
+
# Adjust this to your own needs
|
412 |
+
df_enjoyment = df[df["Emotion"] == "Enjoyment"]
|
413 |
+
df_other = df[df["Emotion"] == "Other"]
|
414 |
+
df_anger = df[df["Emotion"] == "Anger"]
|
415 |
+
df_sadness = df[df["Emotion"] == "Sadness"]
|
416 |
+
df_disgust = df[df["Emotion"] == "Disgust"]
|
417 |
+
df_fear = df[df["Emotion"] == "Fear"]
|
418 |
+
df_surprise = df[df["Emotion"] == "Surprise"]
|
419 |
+
|
420 |
+
# Oversample the 'Other' class to 3000 (illustrative only)
|
421 |
+
if len(df_other) < 3000:
|
422 |
+
df_other_oversampled = resample(
|
423 |
+
df_other,
|
424 |
+
replace=True,
|
425 |
+
n_samples=3000,
|
426 |
+
random_state=42
|
427 |
+
)
|
428 |
+
else:
|
429 |
+
df_other_oversampled = df_other
|
430 |
+
|
431 |
+
# Keep the other classes as-is (or oversample them as needed)
|
432 |
+
df_balanced = pd.concat([
|
433 |
+
df_enjoyment,
|
434 |
+
df_other_oversampled,
|
435 |
+
df_anger,
|
436 |
+
df_sadness,
|
437 |
+
df_disgust,
|
438 |
+
df_fear,
|
439 |
+
df_surprise
|
440 |
+
], axis=0)
|
441 |
+
|
442 |
+
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
|
443 |
+
df = df_balanced
|
444 |
+
|
445 |
+
print("\nSau khi cân bằng lớp (demo oversample):")
|
446 |
+
print(df["Emotion"].value_counts())
|
447 |
+
|
448 |
+
# Export to file (optional)
|
449 |
+
df.to_excel(output_path, index=False)
|
450 |
+
|
451 |
+
# ========== TRAIN RNN PYTORCH ==========
|
452 |
+
|
453 |
+
print("\n========== Training PyTorch SimpleRNN ==========")
|
454 |
+
|
455 |
+
# Build the pretrained embedding matrix
|
456 |
+
pretrained_matrix = data_manager.build_pretrained_embedding_matrix(embedding_dim=100)
|
457 |
+
|
458 |
+
# Split and convert the data into DataLoaders
|
459 |
+
train_loader, test_loader, label_mapping = data_manager.split_and_convert(
|
460 |
+
df, label_column="Emotion", maxlen=400, test_size=0.2,
|
461 |
+
for_keras=False, batch_size=32
|
462 |
+
)
|
463 |
+
|
464 |
+
hidden_dim = 128
|
465 |
+
output_dim = len(label_mapping)
|
466 |
+
|
467 |
+
model_rnn = SimpleRNN(pretrained_weight=pretrained_matrix,
|
468 |
+
hidden_dim=hidden_dim,
|
469 |
+
output_dim=output_dim,
|
470 |
+
dropout=0.3)
|
471 |
+
criterion = nn.CrossEntropyLoss()
|
472 |
+
optimizer = optim.Adam(model_rnn.parameters(), lr=1e-3)
|
473 |
+
|
474 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
475 |
+
model_rnn.to(device)
|
476 |
+
|
477 |
+
num_epochs = 20
|
478 |
+
for epoch in range(num_epochs):
|
479 |
+
model_rnn.train()
|
480 |
+
epoch_loss = 0
|
481 |
+
correct = 0
|
482 |
+
total = 0
|
483 |
+
|
484 |
+
for X_batch, y_batch in train_loader:
|
485 |
+
X_batch = X_batch.to(device)
|
486 |
+
y_batch = y_batch.to(device)
|
487 |
+
|
488 |
+
optimizer.zero_grad()
|
489 |
+
preds = model_rnn(X_batch)
|
490 |
+
loss = criterion(preds, y_batch)
|
491 |
+
loss.backward()
|
492 |
+
optimizer.step()
|
493 |
+
|
494 |
+
epoch_loss += loss.item()
|
495 |
+
_, pred_label = torch.max(preds, 1)
|
496 |
+
correct += (pred_label == y_batch).sum().item()
|
497 |
+
total += y_batch.size(0)
|
498 |
+
|
499 |
+
epoch_accuracy = correct / total
|
500 |
+
epoch_loss_avg = epoch_loss / len(train_loader)
|
501 |
+
print(f"Epoch {epoch+1}/{num_epochs}, "
|
502 |
+
f"Loss: {epoch_loss_avg:.4f}, "
|
503 |
+
f"Accuracy: {epoch_accuracy:.4f}")
|
504 |
+
|
505 |
+
# Evaluate on the test set with detailed metrics
|
506 |
+
model_rnn.eval()
|
507 |
+
test_loss = 0
|
508 |
+
correct = 0
|
509 |
+
total = 0
|
510 |
+
y_true = []
|
511 |
+
y_pred = []
|
512 |
+
with torch.no_grad():
|
513 |
+
for X_batch, y_batch in test_loader:
|
514 |
+
X_batch = X_batch.to(device)
|
515 |
+
y_batch = y_batch.to(device)
|
516 |
+
preds = model_rnn(X_batch)
|
517 |
+
loss = criterion(preds, y_batch)
|
518 |
+
test_loss += loss.item()
|
519 |
+
|
520 |
+
_, predicted = torch.max(preds, 1)
|
521 |
+
correct += (predicted == y_batch).sum().item()
|
522 |
+
total += y_batch.size(0)
|
523 |
+
|
524 |
+
y_true.extend(y_batch.cpu().numpy())
|
525 |
+
y_pred.extend(predicted.cpu().numpy())
|
526 |
+
|
527 |
+
test_accuracy = accuracy_score(y_true, y_pred)
|
528 |
+
test_loss_avg = test_loss / len(test_loader)
|
529 |
+
precision_macro = precision_score(y_true, y_pred, average='macro', zero_division=0)
|
530 |
+
precision_weighted = precision_score(y_true, y_pred, average='weighted', zero_division=0)
|
531 |
+
recall_macro = recall_score(y_true, y_pred, average='macro', zero_division=0)
|
532 |
+
recall_weighted = recall_score(y_true, y_pred, average='weighted', zero_division=0)
|
533 |
+
f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=0)
|
534 |
+
f1_weighted = f1_score(y_true, y_pred, average='weighted', zero_division=0)
|
535 |
+
report = classification_report(y_true, y_pred, target_names=label_mapping.keys(), digits=4)
|
536 |
+
conf_matrix = confusion_matrix(y_true, y_pred)
|
537 |
+
|
538 |
+
# Print the metrics
|
539 |
+
print(f"\nTest Loss: {test_loss_avg:.4f}, Test Accuracy: {test_accuracy:.4f}")
|
540 |
+
print(f"Precision (Macro): {precision_macro:.4f}")
|
541 |
+
print(f"Precision (Weighted): {precision_weighted:.4f}")
|
542 |
+
print(f"Recall (Macro): {recall_macro:.4f}")
|
543 |
+
print(f"Recall (Weighted): {recall_weighted:.4f}")
|
544 |
+
print(f"F1-Score (Macro): {f1_macro:.4f}")
|
545 |
+
print(f"F1-Score (Weighted): {f1_weighted:.4f}")
|
546 |
+
|
547 |
+
print("\n========== Classification Report ==========")
|
548 |
+
print(report)
|
549 |
+
|
550 |
+
print("\n========== Confusion Matrix ==========")
|
551 |
+
print(conf_matrix)
|
552 |
+
|
553 |
+
# Save the report to a file
|
554 |
+
rnn_report_dir = "rnn_emotion_model"
|
555 |
+
os.makedirs(rnn_report_dir, exist_ok=True)
|
556 |
+
with open(os.path.join(rnn_report_dir, "classification_report.txt"), "w", encoding="utf-8") as f:
|
557 |
+
f.write("========== Classification Report ==========\n")
|
558 |
+
f.write(report)
|
559 |
+
f.write("\n========== Additional Metrics ==========\n")
|
560 |
+
f.write(f"Test Loss: {test_loss_avg:.4f}\n")
|
561 |
+
f.write(f"Test Accuracy: {test_accuracy:.4f}\n")
|
562 |
+
f.write(f"Precision (Macro): {precision_macro:.4f}\n")
|
563 |
+
f.write(f"Precision (Weighted): {precision_weighted:.4f}\n")
|
564 |
+
f.write(f"Recall (Macro): {recall_macro:.4f}\n")
|
565 |
+
f.write(f"Recall (Weighted): {recall_weighted:.4f}\n")
|
566 |
+
f.write(f"F1-Score (Macro): {f1_macro:.4f}\n")
|
567 |
+
f.write(f"F1-Score (Weighted): {f1_weighted:.4f}\n")
|
568 |
+
f.write("\n========== Confusion Matrix ==========\n")
|
569 |
+
f.write(np.array2string(conf_matrix))
|
570 |
+
|
571 |
+
print("\n========== Classification Report saved to 'rnn_emotion_model/classification_report.txt' ==========")
|
572 |
+
|
573 |
+
# Save the RNN model
|
574 |
+
torch.save(model_rnn.state_dict(), os.path.join(rnn_report_dir, "simple_rnn.pth"))
|
575 |
+
print("========== RNN Model saved to 'rnn_emotion_model/simple_rnn.pth' ==========")
|
576 |
+
|
577 |
+
# ========== TRAIN CNN-LSTM KERAS ==========
|
578 |
+
|
579 |
+
print("\n========== Training CNN-LSTM (Keras) ==========")
|
580 |
+
|
581 |
+
# Build the pretrained embedding for Keras
|
582 |
+
# We already have pretrained_matrix (num_vocab x 100)
|
583 |
+
# It is passed to the Embedding layer via Embedding(..., weights=[...])
|
584 |
+
X_train_keras, X_test_keras, y_train_keras, y_test_keras, label_mapping_keras = data_manager.split_and_convert(
|
585 |
+
df, label_column="Emotion", maxlen=400, test_size=0.2,
|
586 |
+
for_keras=True
|
587 |
+
)
|
588 |
+
|
589 |
+
maxlen = 400
|
590 |
+
vocab_size, embedding_dim = pretrained_matrix.shape
|
591 |
+
|
592 |
+
# Cast pretrained_matrix to float32 (so Keras accepts it)
|
593 |
+
pretrained_matrix_keras = pretrained_matrix.astype(np.float32)
|
594 |
+
|
595 |
+
input_layer = Input(shape=(maxlen,), dtype='int32', name='main_input')
|
596 |
+
emb_layer = Embedding(
|
597 |
+
input_dim=vocab_size,
|
598 |
+
output_dim=embedding_dim,
|
599 |
+
weights=[pretrained_matrix_keras],
|
600 |
+
trainable=True  # True or False depending on whether to fine-tune the embedding
|
601 |
+
)(input_layer)
|
602 |
+
|
603 |
+
def max_1d(X):
|
604 |
+
return tf.reduce_max(X, axis=1)
|
605 |
+
|
606 |
+
con3 = Convolution1D(150, kernel_size=3, activation='relu')(emb_layer)
|
607 |
+
pool_con3 = Lambda(max_1d, output_shape=(150,))(con3)
|
608 |
+
|
609 |
+
con5 = Convolution1D(150, kernel_size=5, activation='relu')(emb_layer)
|
610 |
+
pool_con5 = Lambda(max_1d, output_shape=(150,))(con5)
|
611 |
+
|
612 |
+
lstm_out = LSTM(128, dropout=0.3)(emb_layer)
|
613 |
+
|
614 |
+
merged = concatenate([pool_con3, pool_con5, lstm_out])
|
615 |
+
dense = Dense(100, activation='relu')(merged)
|
616 |
+
drop = Dropout(0.3)(dense)
|
617 |
+
output = Dense(output_dim, activation='softmax')(drop)
|
618 |
+
|
619 |
+
model_cnn_lstm = Model(inputs=input_layer, outputs=output)
|
620 |
+
model_cnn_lstm.compile(
|
621 |
+
loss='categorical_crossentropy',
|
622 |
+
optimizer=Adam(learning_rate=1e-3),  # 'lr' is deprecated in recent Keras
|
623 |
+
metrics=['accuracy']
|
624 |
+
)
|
625 |
+
|
626 |
+
checkpoint = ModelCheckpoint(
|
627 |
+
'cnn_lstm_best.keras',
|
628 |
+
save_best_only=True,
|
629 |
+
monitor='val_accuracy',
|
630 |
+
mode='max'
|
631 |
+
)
|
632 |
+
early_stopping = EarlyStopping(
|
633 |
+
monitor='val_accuracy',
|
634 |
+
patience=5,
|
635 |
+
restore_best_weights=True
|
636 |
+
)
|
637 |
+
|
638 |
+
history = model_cnn_lstm.fit(
|
639 |
+
X_train_keras, y_train_keras,
|
640 |
+
validation_data=(X_test_keras, y_test_keras),
|
641 |
+
epochs=30,
|
642 |
+
batch_size=32,
|
643 |
+
callbacks=[checkpoint, early_stopping]
|
644 |
+
)
|
645 |
+
|
646 |
+
# Evaluate on the test set with detailed metrics
|
647 |
+
loss, acc = model_cnn_lstm.evaluate(X_test_keras, y_test_keras)
|
648 |
+
print(f"CNN-LSTM Test Loss: {loss:.4f}, Test Accuracy: {acc:.4f}")
|
649 |
+
|
650 |
+
# Collect predictions and compute the metrics
|
651 |
+
y_pred_cnn_lstm = model_cnn_lstm.predict(X_test_keras)
|
652 |
+
y_pred_cnn_lstm = np.argmax(y_pred_cnn_lstm, axis=1)
|
653 |
+
y_true_cnn_lstm = np.argmax(y_test_keras, axis=1)
|
654 |
+
|
655 |
+
test_accuracy_cnn_lstm = accuracy_score(y_true_cnn_lstm, y_pred_cnn_lstm)
|
656 |
+
precision_macro_cnn_lstm = precision_score(y_true_cnn_lstm, y_pred_cnn_lstm, average='macro', zero_division=0)
|
657 |
+
precision_weighted_cnn_lstm = precision_score(y_true_cnn_lstm, y_pred_cnn_lstm, average='weighted', zero_division=0)
|
658 |
+
recall_macro_cnn_lstm = recall_score(y_true_cnn_lstm, y_pred_cnn_lstm, average='macro', zero_division=0)
|
659 |
+
recall_weighted_cnn_lstm = recall_score(y_true_cnn_lstm, y_pred_cnn_lstm, average='weighted', zero_division=0)
|
660 |
+
f1_macro_cnn_lstm = f1_score(y_true_cnn_lstm, y_pred_cnn_lstm, average='macro', zero_division=0)
|
661 |
+
f1_weighted_cnn_lstm = f1_score(y_true_cnn_lstm, y_pred_cnn_lstm, average='weighted', zero_division=0)
|
662 |
+
report_cnn_lstm = classification_report(y_true_cnn_lstm, y_pred_cnn_lstm, target_names=label_mapping.keys(), digits=4)
|
663 |
+
conf_matrix_cnn_lstm = confusion_matrix(y_true_cnn_lstm, y_pred_cnn_lstm)
|
664 |
+
|
665 |
+
# Print the metrics
|
666 |
+
print(f"\nCNN-LSTM Test Accuracy: {test_accuracy_cnn_lstm:.4f}")
|
667 |
+
print(f"Precision (Macro): {precision_macro_cnn_lstm:.4f}")
|
668 |
+
print(f"Precision (Weighted): {precision_weighted_cnn_lstm:.4f}")
|
669 |
+
print(f"Recall (Macro): {recall_macro_cnn_lstm:.4f}")
|
670 |
+
print(f"Recall (Weighted): {recall_weighted_cnn_lstm:.4f}")
|
671 |
+
print(f"F1-Score (Macro): {f1_macro_cnn_lstm:.4f}")
|
672 |
+
print(f"F1-Score (Weighted): {f1_weighted_cnn_lstm:.4f}")
|
673 |
+
|
674 |
+
print("\n========== CNN-LSTM Classification Report ==========")
|
675 |
+
print(report_cnn_lstm)
|
676 |
+
|
677 |
+
print("\n========== CNN-LSTM Confusion Matrix ==========")
|
678 |
+
print(conf_matrix_cnn_lstm)
|
679 |
+
|
680 |
+
# Save the report to a file
|
681 |
+
cnn_lstm_report_dir = "cnn_lstm_emotion_model"
|
682 |
+
os.makedirs(cnn_lstm_report_dir, exist_ok=True)
|
683 |
+
with open(os.path.join(cnn_lstm_report_dir, "classification_report.txt"), "w", encoding="utf-8") as f:
|
684 |
+
f.write("========== CNN-LSTM Classification Report ==========\n")
|
685 |
+
f.write(report_cnn_lstm)
|
686 |
+
f.write("\n========== Additional Metrics ==========\n")
|
687 |
+
f.write(f"Test Loss: {loss:.4f}\n")
|
688 |
+
f.write(f"Test Accuracy: {test_accuracy_cnn_lstm:.4f}\n")
|
689 |
+
f.write(f"Precision (Macro): {precision_macro_cnn_lstm:.4f}\n")
|
690 |
+
f.write(f"Precision (Weighted): {precision_weighted_cnn_lstm:.4f}\n")
|
691 |
+
f.write(f"Recall (Macro): {recall_macro_cnn_lstm:.4f}\n")
|
692 |
+
f.write(f"Recall (Weighted): {recall_weighted_cnn_lstm:.4f}\n")
|
693 |
+
f.write(f"F1-Score (Macro): {f1_macro_cnn_lstm:.4f}\n")
|
694 |
+
f.write(f"F1-Score (Weighted): {f1_weighted_cnn_lstm:.4f}\n")
|
695 |
+
f.write("\n========== Confusion Matrix ==========\n")
|
696 |
+
f.write(np.array2string(conf_matrix_cnn_lstm))
|
697 |
+
|
698 |
+
print("\n========== CNN-LSTM Classification Report saved to 'cnn_lstm_emotion_model/classification_report.txt' ==========")
|
699 |
+
|
700 |
+
# Save the CNN-LSTM model
|
701 |
+
model_cnn_lstm.save(os.path.join(cnn_lstm_report_dir, 'cnn_lstm_model.keras'))
|
702 |
+
print(f"========== CNN-LSTM Model saved to '{cnn_lstm_report_dir}/cnn_lstm_model.keras' ==========")
|
703 |
+
|
704 |
+
# ========== SAVE LABEL MAPPING AND VOCABULARY ==========
|
705 |
+
# Save label_mapping and vocabulary for the RNN
|
706 |
+
with open(os.path.join(rnn_report_dir, "label_mapping.json"), "w", encoding="utf-8") as f:
|
707 |
+
json.dump(label_mapping, f, ensure_ascii=False, indent=4)
|
708 |
+
|
709 |
+
with open(os.path.join(rnn_report_dir, "vocabulary.json"), "w", encoding="utf-8") as f:
|
710 |
+
json.dump(data_manager.vocabulary.word2id, f, ensure_ascii=False, indent=4)
|
711 |
+
|
712 |
+
# Save label_mapping and vocabulary for the CNN-LSTM
|
713 |
+
# Assuming label_mapping and vocabulary are identical, saving them once is enough.
|
714 |
+
# If they differ, adjust accordingly.
|
715 |
+
|
716 |
+
print("========== Label Mapping and Vocabulary saved ==========")
|
717 |
+
|
718 |
+
# ========== DEMO: PREDICT ONE NEW SENTENCE ==========
|
719 |
+
|
720 |
+
custom_text = "Tôi rất vui khi sử dụng dịch vụ này!"
|
721 |
+
|
722 |
+
# RNN (PyTorch)
|
723 |
+
emotion_rnn = predict_emotion_rnn(
|
724 |
+
model_rnn, custom_text, data_manager, label_mapping, device
|
725 |
+
)
|
726 |
+
print(f"Predicted Emotion (RNN): {emotion_rnn}")
|
727 |
+
|
728 |
+
# CNN-LSTM (Keras)
|
729 |
+
cnn_lstm_loaded = tf.keras.models.load_model(os.path.join(cnn_lstm_report_dir, 'cnn_lstm_model.keras'))
|
730 |
+
emotion_cnn_lstm = predict_emotion_cnn_lstm(
|
731 |
+
cnn_lstm_loaded, custom_text, data_manager, label_mapping
|
732 |
+
)
|
733 |
+
print(f"Predicted Emotion (CNN-LSTM): {emotion_cnn_lstm}")
|
734 |
+
|
735 |
+
# Check TF and GPU
|
736 |
+
print("TF version:", tf.__version__)
|
737 |
+
print("GPU devices:", tf.config.list_physical_devices("GPU"))
|
738 |
+
# os.system("nvidia-smi") # nếu muốn xem info GPU
|
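A minimal reload sketch for the artifacts written above (assumptions: the SimpleRNN class from main_RNN_CNN-LSTM.py is importable and the rnn_emotion_model/ files exist):

    import json
    import numpy as np
    import torch

    with open("rnn_emotion_model/label_mapping.json", encoding="utf-8") as f:
        label_mapping = json.load(f)
    with open("rnn_emotion_model/vocabulary.json", encoding="utf-8") as f:
        word2id = json.load(f)

    # Rebuild the model with a placeholder embedding of the right shape;
    # the real weights come from the saved state dict.
    placeholder = np.zeros((len(word2id), 100), dtype=np.float32)
    model = SimpleRNN(pretrained_weight=placeholder, hidden_dim=128, output_dim=len(label_mapping))
    model.load_state_dict(torch.load("rnn_emotion_model/simple_rnn.pth", map_location="cpu"))
    model.eval()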
main_lstm.py
ADDED
@@ -0,0 +1,289 @@
1 |
+
# lstm_emotion_classifier.py
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
import re
|
5 |
+
import emoji
|
6 |
+
import json
|
7 |
+
import pandas as pd
|
8 |
+
import numpy as np
|
9 |
+
import tensorflow as tf
|
10 |
+
from underthesea import word_tokenize
|
11 |
+
from sklearn.model_selection import train_test_split
|
12 |
+
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
|
13 |
+
from sklearn.utils import resample
|
14 |
+
from tensorflow.keras.preprocessing.text import Tokenizer
|
15 |
+
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
16 |
+
from tensorflow.keras.models import Sequential
|
17 |
+
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
|
18 |
+
from tensorflow.keras.callbacks import EarlyStopping
|
19 |
+
import joblib
|
20 |
+
import os
|
21 |
+
import matplotlib.pyplot as plt
|
22 |
+
import seaborn as sns
|
23 |
+
|
24 |
+
########################
|
25 |
+
# PREPROCESSING
|
26 |
+
########################
|
27 |
+
|
28 |
+
def replace_emojis(sentence, emoji_mapping):
|
29 |
+
processed_sentence = []
|
30 |
+
for char in sentence:
|
31 |
+
if char in emoji_mapping:
|
32 |
+
processed_sentence.append(emoji_mapping[char])
|
33 |
+
elif not emoji.is_emoji(char):
|
34 |
+
processed_sentence.append(char)
|
35 |
+
return ''.join(processed_sentence)
|
36 |
+
|
37 |
+
def remove_profanity(sentence):
|
38 |
+
profane_words = ["loz", "vloz", "vl", "dm", "đm", "clgt", "dmm", "cc", "vc", "đù mé", "vãi"]
|
39 |
+
words = sentence.split()
|
40 |
+
filtered = [w for w in words if w.lower() not in profane_words]
|
41 |
+
return ' '.join(filtered)
|
42 |
+
|
43 |
+
def remove_special_characters(sentence):
|
44 |
+
return re.sub(r"[\^\*@#&$%<>~{}|\\]", "", sentence)
|
45 |
+
|
46 |
+
def normalize_whitespace(sentence):
|
47 |
+
return ' '.join(sentence.split())
|
48 |
+
|
49 |
+
def remove_repeated_characters(sentence):
|
50 |
+
return re.sub(r"(.)\1{2,}", r"\1", sentence)
|
51 |
+
|
52 |
+
def replace_numbers(sentence):
|
53 |
+
return re.sub(r"\d+", "[number]", sentence)
|
54 |
+
|
55 |
+
def tokenize_underthesea(sentence):
|
56 |
+
tokens = word_tokenize(sentence)
|
57 |
+
return " ".join(tokens)
|
58 |
+
|
59 |
+
def preprocess_sentence(sentence, abbreviations, emoji_mapping):
|
60 |
+
sentence = sentence.lower()
|
61 |
+
sentence = replace_emojis(sentence, emoji_mapping)
|
62 |
+
sentence = remove_profanity(sentence)
|
63 |
+
sentence = remove_special_characters(sentence)
|
64 |
+
sentence = normalize_whitespace(sentence)
|
65 |
+
# Expand abbreviations
|
66 |
+
words = sentence.split()
|
67 |
+
replaced = []
|
68 |
+
for w in words:
|
69 |
+
if w in abbreviations:
|
70 |
+
replaced.append(" ".join(abbreviations[w]))
|
71 |
+
else:
|
72 |
+
replaced.append(w)
|
73 |
+
sentence = " ".join(replaced)
|
74 |
+
sentence = remove_repeated_characters(sentence)
|
75 |
+
sentence = replace_numbers(sentence)
|
76 |
+
# Vietnamese word segmentation
|
77 |
+
sentence = tokenize_underthesea(sentence)
|
78 |
+
return sentence
|
79 |
+
|
80 |
+
emoji_mapping = {
|
81 |
+
"😀": "[joy]", "😃": "[joy]", "😄": "[joy]", "😁": "[joy]", "😆": "[joy]", "😅": "[joy]", "😂": "[joy]", "🤣": "[joy]",
|
82 |
+
"🙂": "[love]", "🙃": "[love]", "😉": "[love]", "😊": "[love]", "😇": "[love]", "🥰": "[love]", "😍": "[love]",
|
83 |
+
"🤩": "[love]", "😘": "[love]", "😗": "[love]", "☺": "[love]", "😚": "[love]", "😙": "[love]",
|
84 |
+
"😋": "[satisfaction]", "😛": "[satisfaction]", "😜": "[satisfaction]", "🤪": "[satisfaction]", "😝": "[satisfaction]",
|
85 |
+
"🤑": "[satisfaction]",
|
86 |
+
"🤐": "[neutral]", "🤨": "[neutral]", "😐": "[neutral]", "😑": "[neutral]", "😶": "[neutral]",
|
87 |
+
"😏": "[sarcasm]",
|
88 |
+
"😒": "[disappointment]", "🙄": "[disappointment]", "😬": "[disappointment]",
|
89 |
+
"😔": "[sadness]", "😪": "[sadness]", "😢": "[sadness]", "😭": "[sadness]", "😥": "[sadness]", "😓": "[sadness]",
|
90 |
+
"😩": "[tiredness]", "😫": "[tiredness]", "🥱": "[tiredness]",
|
91 |
+
"🤤": "[discomfort]", "🤢": "[discomfort]", "🤮": "[discomfort]", "🤧": "[discomfort]", "🥵": "[discomfort]",
|
92 |
+
"🥶": "[discomfort]", "🥴": "[discomfort]", "😵": "[discomfort]", "🤯": "[discomfort]",
|
93 |
+
"😕": "[confused]", "😟": "[confused]", "🙁": "[confused]", "☹": "[confused]",
|
94 |
+
"😮": "[surprise]", "😯": "[surprise]", "😲": "[surprise]", "😳": "[surprise]", "🥺": "[pleading]",
|
95 |
+
"😦": "[fear]", "😧": "[fear]", "😨": "[fear]", "😰": "[fear]", "😱": "[fear]",
|
96 |
+
"😖": "[confusion]", "😣": "[confusion]", "😞": "[confusion]",
|
97 |
+
"😤": "[anger]", "😡": "[anger]", "😠": "[anger]", "🤬": "[anger]", "😈": "[mischievous]", "👿": "[mischievous]"
|
98 |
+
}
|
99 |
+
|
100 |
+
def load_abbreviations(path):
|
101 |
+
with open(path, "r", encoding="utf-8") as f:
|
102 |
+
return json.load(f)
|
103 |
+
|
104 |
+
###################################
|
105 |
+
# MAIN
|
106 |
+
###################################
|
107 |
+
if __name__ == "__main__":
|
108 |
+
file_path = "train.xlsx"
|
109 |
+
abbreviations_path = "abbreviations.json"
|
110 |
+
output_path = "processed_phobert.xlsx"
|
111 |
+
|
112 |
+
abbreviations = load_abbreviations(abbreviations_path)
|
113 |
+
|
114 |
+
df = pd.read_excel(file_path)
|
115 |
+
if "Sentence" not in df.columns or "Emotion" not in df.columns:
|
116 |
+
raise ValueError("Dataset phải chứa cột 'Sentence' và 'Emotion'!")
|
117 |
+
|
118 |
+
# Preprocess
|
119 |
+
df["processed_sentence"] = df["Sentence"].apply(
|
120 |
+
lambda x: preprocess_sentence(str(x), abbreviations, emoji_mapping)
|
121 |
+
)
|
122 |
+
# Drop empty rows
|
123 |
+
df = df[df["processed_sentence"].str.strip().astype(bool)]
|
124 |
+
|
125 |
+
print("Trước khi cân bằng:")
|
126 |
+
print(df["Emotion"].value_counts())
|
127 |
+
|
128 |
+
# =========== BALANCE ALL CLASSES =============
|
129 |
+
# Take the largest class size
|
130 |
+
max_count = df["Emotion"].value_counts().max()
|
131 |
+
|
132 |
+
df_balanced_list = []
|
133 |
+
for emo in df["Emotion"].unique():
|
134 |
+
df_emo = df[df["Emotion"] == emo]
|
135 |
+
if len(df_emo) < max_count:
|
136 |
+
# Oversample up to max_count
|
137 |
+
df_emo_oversampled = resample(
|
138 |
+
df_emo,
|
139 |
+
replace=True,
|
140 |
+
n_samples=max_count,
|
141 |
+
random_state=42
|
142 |
+
)
|
143 |
+
df_balanced_list.append(df_emo_oversampled)
|
144 |
+
else:
|
145 |
+
# If this class already has max_count samples, keep it as-is
|
146 |
+
df_balanced_list.append(df_emo)
|
147 |
+
|
148 |
+
df = pd.concat(df_balanced_list, axis=0)
|
149 |
+
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
|
150 |
+
|
151 |
+
print("\nSau khi cân bằng tất cả lớp:")
|
152 |
+
print(df["Emotion"].value_counts())
|
153 |
+
|
154 |
+
df.to_excel(output_path, index=False)
|
155 |
+
|
156 |
+
# Build label2id and id2label in the fixed order below
|
157 |
+
custom_id2label = {
|
158 |
+
0: 'Anger',
|
159 |
+
1: 'Disgust',
|
160 |
+
2: 'Enjoyment',
|
161 |
+
3: 'Fear',
|
162 |
+
4: 'Other',
|
163 |
+
5: 'Sadness',
|
164 |
+
6: 'Surprise'
|
165 |
+
}
|
166 |
+
label2id = {label: idx for idx, label in enumerate(custom_id2label.values())}
|
167 |
+
id2label = {v: k for k, v in label2id.items()}
|
168 |
+
|
169 |
+
df["label_id"] = df["Emotion"].map(label2id)
|
170 |
+
|
171 |
+
# Train/test split
|
172 |
+
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label_id"])
|
173 |
+
print(f"Train size = {len(train_df)}, Test size = {len(test_df)}")
|
174 |
+
|
175 |
+
# Feature extraction with Tokenizer and padding
|
176 |
+
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
|
177 |
+
tokenizer.fit_on_texts(train_df["processed_sentence"])
|
178 |
+
|
179 |
+
X_train_seq = tokenizer.texts_to_sequences(train_df["processed_sentence"])
|
180 |
+
X_test_seq = tokenizer.texts_to_sequences(test_df["processed_sentence"])
|
181 |
+
|
182 |
+
max_length = 256
|
183 |
+
X_train = pad_sequences(X_train_seq, maxlen=max_length, padding='post', truncating='post')
|
184 |
+
X_test = pad_sequences(X_test_seq, maxlen=max_length, padding='post', truncating='post')
|
185 |
+
|
186 |
+
y_train = train_df["label_id"].values
|
187 |
+
y_test = test_df["label_id"].values
|
188 |
+
|
189 |
+
# One-hot encode the labels
|
190 |
+
num_classes = len(custom_id2label)
|
191 |
+
y_train = tf.keras.utils.to_categorical(y_train, num_classes=num_classes)
|
192 |
+
y_test = tf.keras.utils.to_categorical(y_test, num_classes=num_classes)
|
193 |
+
|
194 |
+
# Build the LSTM model
|
195 |
+
model = Sequential([
|
196 |
+
Embedding(input_dim=5000, output_dim=128, input_length=max_length),
|
197 |
+
LSTM(128, dropout=0.2, recurrent_dropout=0.2),
|
198 |
+
Dense(64, activation='relu'),
|
199 |
+
Dropout(0.5),
|
200 |
+
Dense(num_classes, activation='softmax')
|
201 |
+
])
|
202 |
+
|
203 |
+
model.compile(loss='categorical_crossentropy',
|
204 |
+
optimizer='adam',
|
205 |
+
metrics=['accuracy'])
|
206 |
+
|
207 |
+
model.summary()
|
208 |
+
|
209 |
+
# Train the model
|
210 |
+
early_stop = EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)
|
211 |
+
|
212 |
+
history = model.fit(
|
213 |
+
X_train, y_train,
|
214 |
+
epochs=10,
|
215 |
+
batch_size=32,
|
216 |
+
validation_data=(X_test, y_test),
|
217 |
+
callbacks=[early_stop],
|
218 |
+
verbose=1
|
219 |
+
)
|
220 |
+
|
221 |
+
# Evaluate the model
|
222 |
+
print("\n========== Evaluate on Test set ==========")
|
223 |
+
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
|
224 |
+
print(f"Test Accuracy: {accuracy:.4f}")
|
225 |
+
|
226 |
+
# Predict and print the classification report
|
227 |
+
y_pred_probs = model.predict(X_test)
|
228 |
+
y_pred = np.argmax(y_pred_probs, axis=1)
|
229 |
+
y_true = np.argmax(y_test, axis=1)
|
230 |
+
|
231 |
+
# Print the classification report
|
232 |
+
print("\nClassification Report:")
|
233 |
+
report = classification_report(y_true, y_pred, target_names=custom_id2label.values())
|
234 |
+
print(report)
|
235 |
+
|
236 |
+
# Compute and print the confusion matrix
|
237 |
+
conf_matrix = confusion_matrix(y_true, y_pred)
|
238 |
+
print("\nConfusion Matrix:")
|
239 |
+
print(conf_matrix)
|
240 |
+
|
241 |
+
# Plot the confusion matrix
|
242 |
+
plt.figure(figsize=(10, 8))
|
243 |
+
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
|
244 |
+
xticklabels=custom_id2label.values(),
|
245 |
+
yticklabels=custom_id2label.values())
|
246 |
+
plt.ylabel('Actual')
|
247 |
+
plt.xlabel('Predicted')
|
248 |
+
plt.title('Confusion Matrix')
|
249 |
+
plt.tight_layout()
|
250 |
+
plt.savefig(os.path.join("lstm_emotion_model", "confusion_matrix.png"))
|
251 |
+
plt.close()
|
252 |
+
print("\nConfusion Matrix plot saved to 'lstm_emotion_model/confusion_matrix.png'")
|
253 |
+
|
254 |
+
# Save the classification report to a file
|
255 |
+
report_path = os.path.join("lstm_emotion_model", "classification_report.txt")
|
256 |
+
with open(report_path, "w", encoding="utf-8") as f:
|
257 |
+
f.write("========== Classification Report ==========\n")
|
258 |
+
f.write(report)
|
259 |
+
f.write("\n========== Confusion Matrix ==========\n")
|
260 |
+
f.write(np.array2string(conf_matrix))
|
261 |
+
|
262 |
+
print(f"\nClassification Report saved to '{report_path}'")
|
263 |
+
|
264 |
+
# Save the model and the tokenizer
|
265 |
+
model_output_dir = "./lstm_emotion_model"
|
266 |
+
os.makedirs(model_output_dir, exist_ok=True)
|
267 |
+
model.save(os.path.join(model_output_dir, "lstm_emotion_model.h5"))
|
268 |
+
joblib.dump(tokenizer, os.path.join(model_output_dir, "tokenizer.joblib"))
|
269 |
+
with open(os.path.join(model_output_dir, "id2label.json"), "w", encoding="utf-8") as f:
|
270 |
+
json.dump(id2label, f, ensure_ascii=False, indent=4)
|
271 |
+
|
272 |
+
print("\n========== Model and Tokenizer saved ==========")
|
273 |
+
|
274 |
+
# Predict one sentence (example)
|
275 |
+
def predict_text(text):
|
276 |
+
text_proc = preprocess_sentence(text, abbreviations, emoji_mapping)
|
277 |
+
seq = tokenizer.texts_to_sequences([text_proc])
|
278 |
+
padded = pad_sequences(seq, maxlen=max_length, padding='post', truncating='post')
|
279 |
+
pred_prob = model.predict(padded)
|
280 |
+
pred_id = np.argmax(pred_prob, axis=1)[0]
|
281 |
+
label = custom_id2label[pred_id]
|
282 |
+
return label
|
283 |
+
|
284 |
+
custom_text = "Tôi rất vui khi sử dụng dịch vụ này!"
|
285 |
+
emotion_pred = predict_text(custom_text)
|
286 |
+
print("\nCâu ví dụ:", custom_text)
|
287 |
+
print("Dự đoán cảm xúc:", emotion_pred)
|
288 |
+
|
289 |
+
print("\nHoàn thành demo LSTM với cân bằng dữ liệu & nhiều epoch hơn!")
|
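A minimal reload sketch for the LSTM artifacts saved above (assumptions: the preprocessing helpers from main_lstm.py are importable and abbreviations has been reloaded with load_abbreviations):

    import json
    import joblib
    import numpy as np
    import tensorflow as tf
    from tensorflow.keras.preprocessing.sequence import pad_sequences

    model = tf.keras.models.load_model("lstm_emotion_model/lstm_emotion_model.h5")
    tokenizer = joblib.load("lstm_emotion_model/tokenizer.joblib")
    with open("lstm_emotion_model/id2label.json", encoding="utf-8") as f:
        id2label = json.load(f)  # note: JSON keys come back as strings

    def predict(text):
        proc = preprocess_sentence(text, abbreviations, emoji_mapping)
        seq = tokenizer.texts_to_sequences([proc])
        padded = pad_sequences(seq, maxlen=256, padding="post", truncating="post")
        pred_id = int(np.argmax(model.predict(padded), axis=1)[0])
        return id2label[str(pred_id)]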
main_phobert.py
ADDED
@@ -0,0 +1,349 @@
1 |
+
# phobert_emotion_balanced.py
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
import re
|
5 |
+
import emoji
|
6 |
+
import json
|
7 |
+
import pandas as pd
|
8 |
+
import torch
|
9 |
+
import numpy as np
|
10 |
+
import os
|
11 |
+
import matplotlib.pyplot as plt
|
12 |
+
import seaborn as sns
|
13 |
+
|
14 |
+
from transformers import (
|
15 |
+
AutoTokenizer,
|
16 |
+
AutoConfig,
|
17 |
+
AutoModelForSequenceClassification,
|
18 |
+
Trainer,
|
19 |
+
TrainingArguments
|
20 |
+
)
|
21 |
+
|
22 |
+
from sklearn.model_selection import train_test_split
|
23 |
+
from sklearn.utils import resample
|
24 |
+
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report, confusion_matrix
|
25 |
+
|
26 |
+
########################
|
27 |
+
# PREPROCESSING
|
28 |
+
########################
|
29 |
+
|
30 |
+
def replace_emojis(sentence, emoji_mapping):
|
31 |
+
processed_sentence = []
|
32 |
+
for char in sentence:
|
33 |
+
if char in emoji_mapping:
|
34 |
+
processed_sentence.append(emoji_mapping[char])
|
35 |
+
elif not emoji.is_emoji(char):
|
36 |
+
processed_sentence.append(char)
|
37 |
+
return ''.join(processed_sentence)
|
38 |
+
|
39 |
+
def remove_profanity(sentence):
|
40 |
+
profane_words = ["loz", "vloz", "vl", "dm", "đm", "clgt", "dmm", "cc", "vc", "đù mé", "vãi"]
|
41 |
+
words = sentence.split()
|
42 |
+
filtered = [w for w in words if w.lower() not in profane_words]
|
43 |
+
return ' '.join(filtered)
|
44 |
+
|
45 |
+
def remove_special_characters(sentence):
|
46 |
+
return re.sub(r"[\^\*@#&$%<>~{}|\\]", "", sentence)
|
47 |
+
|
48 |
+
def normalize_whitespace(sentence):
|
49 |
+
return ' '.join(sentence.split())
|
50 |
+
|
51 |
+
def remove_repeated_characters(sentence):
|
52 |
+
return re.sub(r"(.)\1{2,}", r"\1", sentence)
|
53 |
+
|
54 |
+
def replace_numbers(sentence):
|
55 |
+
return re.sub(r"\d+", "[number]", sentence)
|
56 |
+
|
57 |
+
def tokenize_underthesea(sentence):
|
58 |
+
from underthesea import word_tokenize
|
59 |
+
tokens = word_tokenize(sentence)
|
60 |
+
return " ".join(tokens)
|
61 |
+
|
62 |
+
def preprocess_sentence(sentence, abbreviations, emoji_mapping):
|
63 |
+
sentence = sentence.lower()
|
64 |
+
sentence = replace_emojis(sentence, emoji_mapping)
|
65 |
+
sentence = remove_profanity(sentence)
|
66 |
+
sentence = remove_special_characters(sentence)
|
67 |
+
sentence = normalize_whitespace(sentence)
|
68 |
+
# Expand abbreviations
|
69 |
+
words = sentence.split()
|
70 |
+
replaced = []
|
71 |
+
for w in words:
|
72 |
+
if w in abbreviations:
|
73 |
+
replaced.append(" ".join(abbreviations[w]))
|
74 |
+
else:
|
75 |
+
replaced.append(w)
|
76 |
+
sentence = " ".join(replaced)
|
77 |
+
sentence = remove_repeated_characters(sentence)
|
78 |
+
sentence = replace_numbers(sentence)
|
79 |
+
# Tokenize
|
80 |
+
sentence = tokenize_underthesea(sentence)
|
81 |
+
return sentence
|
82 |
+
|
83 |
+
emoji_mapping = {
|
84 |
+
"😀": "[joy]", "😃": "[joy]", "😄": "[joy]", "😁": "[joy]", "😆": "[joy]", "😅": "[joy]", "😂": "[joy]", "🤣": "[joy]",
|
85 |
+
"🙂": "[love]", "🙃": "[love]", "😉": "[love]", "😊": "[love]", "😇": "[love]", "🥰": "[love]", "😍": "[love]",
|
86 |
+
"🤩": "[love]", "😘": "[love]", "😗": "[love]", "☺": "[love]", "😚": "[love]", "😙": "[love]",
|
87 |
+
"😋": "[satisfaction]", "😛": "[satisfaction]", "😜": "[satisfaction]", "🤪": "[satisfaction]", "😝": "[satisfaction]",
|
88 |
+
"🤑": "[satisfaction]",
|
89 |
+
"🤐": "[neutral]", "🤨": "[neutral]", "😐": "[neutral]", "😑": "[neutral]", "😶": "[neutral]",
|
90 |
+
"😏": "[sarcasm]",
|
91 |
+
"😒": "[disappointment]", "🙄": "[disappointment]", "😬": "[disappointment]",
|
92 |
+
"😔": "[sadness]", "😪": "[sadness]", "😢": "[sadness]", "😭": "[sadness]", "😥": "[sadness]", "😓": "[sadness]",
|
93 |
+
"😩": "[tiredness]", "😫": "[tiredness]", "🥱": "[tiredness]",
|
94 |
+
"🤤": "[discomfort]", "🤢": "[discomfort]", "🤮": "[discomfort]", "🤧": "[discomfort]", "🥵": "[discomfort]",
|
95 |
+
"🥶": "[discomfort]", "🥴": "[discomfort]", "😵": "[discomfort]", "🤯": "[discomfort]",
|
96 |
+
"😕": "[confused]", "😟": "[confused]", "🙁": "[confused]", "☹": "[confused]",
|
97 |
+
"😮": "[surprise]", "😯": "[surprise]", "😲": "[surprise]", "😳": "[surprise]", "🥺": "[pleading]",
|
98 |
+
"😦": "[fear]", "😧": "[fear]", "😨": "[fear]", "😰": "[fear]", "😱": "[fear]",
|
99 |
+
"😖": "[confusion]", "😣": "[confusion]", "😞": "[confusion]",
|
100 |
+
"😤": "[anger]", "😡": "[anger]", "😠": "[anger]", "🤬": "[anger]", "😈": "[mischievous]", "👿": "[mischievous]"
|
101 |
+
}
|
102 |
+
|
103 |
+
def load_abbreviations(path):
|
104 |
+
with open(path, "r", encoding="utf-8") as f:
|
105 |
+
return json.load(f)
|
106 |
+
|
107 |
+
# Dataset HF
|
108 |
+
class PhoBertEmotionDataset(torch.utils.data.Dataset):
|
109 |
+
def __init__(self, encodings, labels):
|
110 |
+
self.encodings = encodings
|
111 |
+
self.labels = labels
|
112 |
+
|
113 |
+
def __len__(self):
|
114 |
+
return len(self.labels)
|
115 |
+
|
116 |
+
def __getitem__(self, idx):
|
117 |
+
item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
|
118 |
+
item["labels"] = torch.tensor(self.labels[idx])
|
119 |
+
return item
|
120 |
+
|
121 |
+
###################################
|
122 |
+
# MAIN
|
123 |
+
###################################
|
124 |
+
if __name__ == "__main__":
|
125 |
+
file_path = "train.xlsx"
|
126 |
+
abbreviations_path = "abbreviations.json"
|
127 |
+
output_path = "processed_phobert.xlsx"
|
128 |
+
|
129 |
+
abbreviations = load_abbreviations(abbreviations_path)
|
130 |
+
|
131 |
+
df = pd.read_excel(file_path)
|
132 |
+
if "Sentence" not in df.columns or "Emotion" not in df.columns:
|
133 |
+
raise ValueError("Dataset phải chứa cột 'Sentence' và 'Emotion'!")
|
134 |
+
|
135 |
+
# Preprocess
|
136 |
+
df["processed_sentence"] = df["Sentence"].apply(
|
137 |
+
lambda x: preprocess_sentence(str(x), abbreviations, emoji_mapping)
|
138 |
+
)
|
139 |
+
# Drop empty rows
|
140 |
+
df = df[df["processed_sentence"].str.strip().astype(bool)]
|
141 |
+
|
142 |
+
print("Trước khi cân bằng:")
|
143 |
+
print(df["Emotion"].value_counts())
|
144 |
+
|
145 |
+
# =========== BALANCE ALL CLASSES =============
|
146 |
+
# Take the largest class size
|
147 |
+
max_count = df["Emotion"].value_counts().max()
|
148 |
+
|
149 |
+
df_balanced_list = []
|
150 |
+
for emo in df["Emotion"].unique():
|
151 |
+
df_emo = df[df["Emotion"] == emo]
|
152 |
+
if len(df_emo) < max_count:
|
153 |
+
# Oversample up to max_count
|
154 |
+
df_emo_oversampled = resample(
|
155 |
+
df_emo,
|
156 |
+
replace=True,
|
157 |
+
n_samples=max_count,
|
158 |
+
random_state=42
|
159 |
+
)
|
160 |
+
df_balanced_list.append(df_emo_oversampled)
|
161 |
+
else:
|
162 |
+
# If this class already has max_count samples, keep it as-is
|
163 |
+
df_balanced_list.append(df_emo)
|
164 |
+
|
165 |
+
df = pd.concat(df_balanced_list, axis=0)
|
166 |
+
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
|
167 |
+
|
168 |
+
print("\nSau khi cân bằng tất cả lớp:")
|
169 |
+
print(df["Emotion"].value_counts())
|
170 |
+
|
171 |
+
df.to_excel(output_path, index=False)
|
172 |
+
|
173 |
+
# Build label2id
|
174 |
+
unique_labels = sorted(df["Emotion"].unique()) # Sắp xếp để cố định
|
175 |
+
label2id = {label: i for i, label in enumerate(unique_labels)}
|
176 |
+
id2label = {v: k for k, v in label2id.items()}
|
177 |
+
|
178 |
+
df["label_id"] = df["Emotion"].map(label2id)
|
179 |
+
|
180 |
+
# Train/test split
|
181 |
+
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label_id"])
|
182 |
+
|
183 |
+
print(f"Train size = {len(train_df)}, Test size = {len(test_df)}")
|
184 |
+
|
185 |
+
# Load tokenizer
|
186 |
+
checkpoint = "vinai/phobert-base"
|
187 |
+
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
|
188 |
+
|
189 |
+
def tokenize_texts(texts):
|
190 |
+
return tokenizer(
|
191 |
+
texts,
|
192 |
+
padding=True,
|
193 |
+
truncation=True,
|
194 |
+
max_length=256
|
195 |
+
)
|
196 |
+
|
197 |
+
train_texts = train_df["processed_sentence"].tolist()
|
198 |
+
train_labels = train_df["label_id"].tolist()
|
199 |
+
test_texts = test_df["processed_sentence"].tolist()
|
200 |
+
test_labels = test_df["label_id"].tolist()
|
201 |
+
|
202 |
+
train_encodings = tokenize_texts(train_texts)
|
203 |
+
test_encodings = tokenize_texts(test_texts)
|
204 |
+
|
205 |
+
train_dataset = PhoBertEmotionDataset(train_encodings, train_labels)
|
206 |
+
test_dataset = PhoBertEmotionDataset(test_encodings, test_labels)
|
207 |
+
|
208 |
+
# Load model
|
209 |
+
config = AutoConfig.from_pretrained(checkpoint)
|
210 |
+
config.num_labels = len(label2id)
|
211 |
+
model = AutoModelForSequenceClassification.from_pretrained(
|
212 |
+
checkpoint,
|
213 |
+
config=config
|
214 |
+
)
|
215 |
+
|
216 |
+
# Train for 10 epochs at LR=2e-5
|
217 |
+
training_args = TrainingArguments(
|
218 |
+
output_dir="./phobert_results_v2",
|
219 |
+
overwrite_output_dir=True,
|
220 |
+
do_train=True,
|
221 |
+
do_eval=True,
|
222 |
+
evaluation_strategy="epoch",
|
223 |
+
save_strategy="epoch",
|
224 |
+
num_train_epochs=10,  # more epochs
|
225 |
+
per_device_train_batch_size=16,
|
226 |
+
per_device_eval_batch_size=16,
|
227 |
+
learning_rate=2e-5,
|
228 |
+
logging_dir="./logs",
|
229 |
+
logging_steps=50,
|
230 |
+
load_best_model_at_end=True,
|
231 |
+
metric_for_best_model="f1_weighted", # Chọn metric để lưu model tốt nhất
|
232 |
+
greater_is_better=True,
|
233 |
+
seed=42
|
234 |
+
)
|
235 |
+
|
236 |
+
# Define compute_metrics with additional metrics
|
237 |
+
def compute_metrics(eval_pred):
|
238 |
+
logits, labels = eval_pred
|
239 |
+
preds = np.argmax(logits, axis=-1)
|
240 |
+
precision_weighted = precision_score(labels, preds, average='weighted', zero_division=0)
|
241 |
+
recall_weighted = recall_score(labels, preds, average='weighted', zero_division=0)
|
242 |
+
f1_weighted = f1_score(labels, preds, average='weighted', zero_division=0)
|
243 |
+
precision_macro = precision_score(labels, preds, average='macro', zero_division=0)
|
244 |
+
recall_macro = recall_score(labels, preds, average='macro', zero_division=0)
|
245 |
+
f1_macro = f1_score(labels, preds, average='macro', zero_division=0)
|
246 |
+
accuracy = accuracy_score(labels, preds)
|
247 |
+
return {
|
248 |
+
"accuracy": accuracy,
|
249 |
+
"precision_weighted": precision_weighted,
|
250 |
+
"recall_weighted": recall_weighted,
|
251 |
+
"f1_weighted": f1_weighted,
|
252 |
+
"precision_macro": precision_macro,
|
253 |
+
"recall_macro": recall_macro,
|
254 |
+
"f1_macro": f1_macro
|
255 |
+
}
|
256 |
+
|
257 |
+
trainer = Trainer(
|
258 |
+
model=model,
|
259 |
+
args=training_args,
|
260 |
+
train_dataset=train_dataset,
|
261 |
+
eval_dataset=test_dataset,
|
262 |
+
tokenizer=tokenizer,
|
263 |
+
compute_metrics=compute_metrics
|
264 |
+
)
|
265 |
+
|
266 |
+
print("\n========== Training PhoBERT (balanced, more epochs) ==========")
|
267 |
+
trainer.train()
|
268 |
+
|
269 |
+
print("\n========== Evaluate on Test set ==========")
|
270 |
+
results = trainer.evaluate(test_dataset)
|
271 |
+
print("Test results:", results)
|
272 |
+
|
273 |
+
# Extract additional metrics
|
274 |
+
print("\n========== Additional Metrics ==========")
|
275 |
+
print(f"Test Loss: {results.get('eval_loss'):.4f}")
|
276 |
+
print(f"Test Accuracy: {results.get('eval_accuracy'):.4f}")
|
277 |
+
print(f"Precision (Macro): {results.get('eval_precision_macro'):.4f}")
|
278 |
+
print(f"Precision (Weighted): {results.get('eval_precision_weighted'):.4f}")
|
279 |
+
print(f"Recall (Macro): {results.get('eval_recall_macro'):.4f}")
|
280 |
+
print(f"Recall (Weighted): {results.get('eval_recall_weighted'):.4f}")
|
281 |
+
print(f"F1-Score (Macro): {results.get('eval_f1_macro'):.4f}")
|
282 |
+
print(f"F1-Score (Weighted): {results.get('eval_f1_weighted'):.4f}")
|
283 |
+
|
284 |
+
# Generate detailed classification report
|
285 |
+
print("\n========== Detailed Classification Report ==========")
|
286 |
+
predictions, labels, _ = trainer.predict(test_dataset)
|
287 |
+
preds = np.argmax(predictions, axis=1)
|
288 |
+
report = classification_report(labels, preds, target_names=unique_labels, digits=4)
|
289 |
+
print(report)
|
290 |
+
|
291 |
+
# Compute the confusion matrix
|
292 |
+
conf_matrix = confusion_matrix(labels, preds)
|
293 |
+
print("\nConfusion Matrix:")
|
294 |
+
print(conf_matrix)
|
295 |
+
|
296 |
+
# Plot the confusion matrix
|
297 |
+
plt.figure(figsize=(10, 8))
|
298 |
+
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
|
299 |
+
xticklabels=unique_labels,
|
300 |
+
yticklabels=unique_labels)
|
301 |
+
plt.ylabel('Actual')
|
302 |
+
plt.xlabel('Predicted')
|
303 |
+
plt.title('Confusion Matrix')
|
304 |
+
plt.tight_layout()
|
305 |
+
confusion_matrix_path = os.path.join("phobert_emotion_model", "confusion_matrix.png")
|
306 |
+
os.makedirs("phobert_emotion_model", exist_ok=True)
|
307 |
+
plt.savefig(confusion_matrix_path)
|
308 |
+
plt.close()
|
309 |
+
print(f"\nConfusion Matrix plot saved to '{confusion_matrix_path}'")
|
310 |
+
|
311 |
+
# Save the classification report to a file
|
312 |
+
report_path = os.path.join("phobert_emotion_model", "classification_report.txt")
|
313 |
+
with open(report_path, "w", encoding="utf-8") as f:
|
314 |
+
f.write("========== Classification Report ==========\n")
|
315 |
+
f.write(report)
|
316 |
+
f.write("\n========== Confusion Matrix ==========\n")
|
317 |
+
f.write(np.array2string(conf_matrix))
|
318 |
+
|
319 |
+
print(f"\nClassification Report saved to '{report_path}'")
|
320 |
+
|
321 |
+
# Save the model and the tokenizer
|
322 |
+
model_output_dir = "./phobert_emotion_model"
|
323 |
+
os.makedirs(model_output_dir, exist_ok=True)
|
324 |
+
model.save_pretrained(os.path.join(model_output_dir, "phobert_emotion_model"))
|
325 |
+
tokenizer.save_pretrained(os.path.join(model_output_dir, "phobert_emotion_model"))
|
326 |
+
with open(os.path.join(model_output_dir, "id2label.json"), "w", encoding="utf-8") as f:
|
327 |
+
json.dump(id2label, f, ensure_ascii=False, indent=4)
|
328 |
+
|
329 |
+
print("\n========== Model and Tokenizer saved ==========")
|
330 |
+
|
331 |
+
# Predict one sentence (example)
|
332 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
333 |
+
model.to(device)
|
334 |
+
|
335 |
+
def predict_text(text):
|
336 |
+
text_proc = preprocess_sentence(text, abbreviations, emoji_mapping)
|
337 |
+
enc = tokenizer(text_proc, padding=True, truncation=True, max_length=256, return_tensors="pt")
|
338 |
+
enc = {k: v.to(device) for k, v in enc.items()}
|
339 |
+
with torch.no_grad():
|
340 |
+
out = model(**enc)
|
341 |
+
pred_id = out.logits.argmax(dim=-1).item()
|
342 |
+
return id2label[pred_id]
|
343 |
+
|
344 |
+
custom_text = "Tôi rất vui khi sử dụng dịch vụ này!"
|
345 |
+
emotion_pred = predict_text(custom_text)
|
346 |
+
print("\nCâu ví dụ:", custom_text)
|
347 |
+
print("Dự đoán cảm xúc:", emotion_pred)
|
348 |
+
|
349 |
+
print("\nHoàn thành demo PhoBERT với cân bằng dữ liệu & nhiều epoch hơn!")
|
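A minimal reload sketch for the PhoBERT artifacts saved above (assumptions: the preprocessing helpers from main_phobert.py are importable and abbreviations has been reloaded with load_abbreviations):

    import json
    import torch
    from transformers import AutoModelForSequenceClassification, AutoTokenizer

    model_dir = "phobert_emotion_model/phobert_emotion_model"
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    with open("phobert_emotion_model/id2label.json", encoding="utf-8") as f:
        id2label = json.load(f)  # keys are strings after the JSON round-trip

    model.eval()

    def predict(text):
        proc = preprocess_sentence(text, abbreviations, emoji_mapping)
        enc = tokenizer(proc, truncation=True, max_length=256, return_tensors="pt")
        with torch.no_grad():
            pred_id = model(**enc).logits.argmax(dim=-1).item()
        return id2label[str(pred_id)]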
main_svm.py
ADDED
@@ -0,0 +1,261 @@
1 |
+
# svm_emotion_classifier.py
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
import re
|
5 |
+
import emoji
|
6 |
+
import json
|
7 |
+
import pandas as pd
|
8 |
+
import numpy as np
|
9 |
+
import torch # Có thể không cần thiết cho SVM, nhưng giữ lại nếu cần
|
10 |
+
from underthesea import word_tokenize
|
11 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
12 |
+
from sklearn.svm import SVC
|
13 |
+
from sklearn.model_selection import train_test_split
|
14 |
+
from sklearn.metrics import (
|
15 |
+
accuracy_score,
|
16 |
+
classification_report,
|
17 |
+
precision_score,
|
18 |
+
recall_score,
|
19 |
+
f1_score,
|
20 |
+
confusion_matrix
|
21 |
+
)
|
22 |
+
from sklearn.utils import resample
|
23 |
+
import joblib
|
24 |
+
import os
|
25 |
+
|
26 |
+
########################
|
27 |
+
# TIỀN XỬ LÝ
|
28 |
+
########################
|
29 |
+
|
30 |
+
def replace_emojis(sentence, emoji_mapping):
|
31 |
+
processed_sentence = []
|
32 |
+
for char in sentence:
|
33 |
+
if char in emoji_mapping:
|
34 |
+
processed_sentence.append(emoji_mapping[char])
|
35 |
+
elif not emoji.is_emoji(char):
|
36 |
+
processed_sentence.append(char)
|
37 |
+
return ''.join(processed_sentence)
|
38 |
+
|
39 |
+
def remove_profanity(sentence):
|
40 |
+
profane_words = ["loz", "vloz", "vl", "dm", "đm", "clgt", "dmm", "cc", "vc", "đù mé", "vãi"]
|
41 |
+
words = sentence.split()
|
42 |
+
filtered = [w for w in words if w.lower() not in profane_words]
|
43 |
+
return ' '.join(filtered)
|
44 |
+
|
45 |
+
def remove_special_characters(sentence):
|
46 |
+
return re.sub(r"[\^\*@#&$%<>~{}|\\]", "", sentence)
|
47 |
+
|
48 |
+
def normalize_whitespace(sentence):
|
49 |
+
return ' '.join(sentence.split())
|
50 |
+
|
51 |
+
def remove_repeated_characters(sentence):
|
52 |
+
return re.sub(r"(.)\1{2,}", r"\1", sentence)
|
53 |
+
|
54 |
+
def replace_numbers(sentence):
|
55 |
+
return re.sub(r"\d+", "[number]", sentence)
|
56 |
+
|
57 |
+
def tokenize_underthesea(sentence):
|
58 |
+
tokens = word_tokenize(sentence)
|
59 |
+
return " ".join(tokens)
|
60 |
+
|
61 |
+
def preprocess_sentence(sentence, abbreviations, emoji_mapping):
|
62 |
+
sentence = sentence.lower()
|
63 |
+
sentence = replace_emojis(sentence, emoji_mapping)
|
64 |
+
sentence = remove_profanity(sentence)
|
65 |
+
sentence = remove_special_characters(sentence)
|
66 |
+
sentence = normalize_whitespace(sentence)
|
67 |
+
# Thay thế viết tắt
|
68 |
+
words = sentence.split()
|
69 |
+
replaced = []
|
70 |
+
for w in words:
|
71 |
+
if w in abbreviations:
|
72 |
+
replaced.append(" ".join(abbreviations[w]))
|
73 |
+
else:
|
74 |
+
replaced.append(w)
|
75 |
+
sentence = " ".join(replaced)
|
76 |
+
sentence = remove_repeated_characters(sentence)
|
77 |
+
sentence = replace_numbers(sentence)
|
78 |
+
# Tokenize tiếng Việt
|
79 |
+
sentence = tokenize_underthesea(sentence)
|
80 |
+
return sentence
|
81 |
+
|
82 |
+
emoji_mapping = {
|
83 |
+
"😀": "[joy]", "😃": "[joy]", "😄": "[joy]", "😁": "[joy]", "😆": "[joy]", "😅": "[joy]", "😂": "[joy]", "🤣": "[joy]",
|
84 |
+
"🙂": "[love]", "🙃": "[love]", "😉": "[love]", "😊": "[love]", "😇": "[love]", "🥰": "[love]", "😍": "[love]",
|
85 |
+
"🤩": "[love]", "😘": "[love]", "😗": "[love]", "☺": "[love]", "😚": "[love]", "😙": "[love]",
|
86 |
+
"😋": "[satisfaction]", "😛": "[satisfaction]", "😜": "[satisfaction]", "🤪": "[satisfaction]", "😝": "[satisfaction]",
|
87 |
+
"🤑": "[satisfaction]",
|
88 |
+
"🤐": "[neutral]", "🤨": "[neutral]", "😐": "[neutral]", "😑": "[neutral]", "😶": "[neutral]",
|
89 |
+
"😏": "[sarcasm]",
|
90 |
+
"😒": "[disappointment]", "🙄": "[disappointment]", "😬": "[disappointment]",
|
91 |
+
"😔": "[sadness]", "😪": "[sadness]", "😢": "[sadness]", "😭": "[sadness]", "😥": "[sadness]", "😓": "[sadness]",
|
92 |
+
"😩": "[tiredness]", "😫": "[tiredness]", "🥱": "[tiredness]",
|
93 |
+
"🤤": "[discomfort]", "🤢": "[discomfort]", "🤮": "[discomfort]", "🤧": "[discomfort]", "🥵": "[discomfort]",
|
94 |
+
"🥶": "[discomfort]", "🥴": "[discomfort]", "😵": "[discomfort]", "🤯": "[discomfort]",
|
95 |
+
"😕": "[confused]", "😟": "[confused]", "🙁": "[confused]", "☹": "[confused]",
|
96 |
+
"😮": "[surprise]", "😯": "[surprise]", "😲": "[surprise]", "😳": "[surprise]", "🥺": "[pleading]",
|
97 |
+
"😦": "[fear]", "😧": "[fear]", "😨": "[fear]", "😰": "[fear]", "😱": "[fear]",
|
98 |
+
"😖": "[confusion]", "😣": "[confusion]", "😞": "[confusion]",
|
99 |
+
"😤": "[anger]", "😡": "[anger]", "😠": "[anger]", "🤬": "[anger]", "😈": "[mischievous]", "👿": "[mischievous]"
|
100 |
+
}
|
101 |
+
|
102 |
+
def load_abbreviations(path):
|
103 |
+
with open(path, "r", encoding="utf-8") as f:
|
104 |
+
return json.load(f)
|
105 |
+
|
106 |
+
###################################
|
107 |
+
# MAIN
|
108 |
+
###################################
|
109 |
+
if __name__ == "__main__":
|
110 |
+
file_path = "train.xlsx"
|
111 |
+
abbreviations_path = "abbreviations.json"
|
112 |
+
output_path = "processed_svm.xlsx" # Changed output filename to reflect SVM
|
113 |
+
|
114 |
+
abbreviations = load_abbreviations(abbreviations_path)
|
115 |
+
|
116 |
+
df = pd.read_excel(file_path)
|
117 |
+
if "Sentence" not in df.columns or "Emotion" not in df.columns:
|
118 |
+
raise ValueError("Dataset phải chứa cột 'Sentence' và 'Emotion'!")
|
119 |
+
|
120 |
+
# Tiền xử lý
|
121 |
+
df["processed_sentence"] = df["Sentence"].apply(
|
122 |
+
lambda x: preprocess_sentence(str(x), abbreviations, emoji_mapping)
|
123 |
+
)
|
124 |
+
# Loại bỏ rỗng
|
125 |
+
df = df[df["processed_sentence"].str.strip().astype(bool)]
|
126 |
+
|
127 |
+
print("Trước khi cân bằng:")
|
128 |
+
print(df["Emotion"].value_counts())
|
129 |
+
|
130 |
+
# =========== CÂN BẰNG TẤT CẢ CÁC LỚP =============
|
131 |
+
# Lấy max samples
|
132 |
+
max_count = df["Emotion"].value_counts().max()
|
133 |
+
|
134 |
+
df_balanced_list = []
|
135 |
+
for emo in df["Emotion"].unique():
|
136 |
+
df_emo = df[df["Emotion"] == emo]
|
137 |
+
if len(df_emo) < max_count:
|
138 |
+
# Oversample lên max_count
|
139 |
+
df_emo_oversampled = resample(
|
140 |
+
df_emo,
|
141 |
+
replace=True,
|
142 |
+
n_samples=max_count,
|
143 |
+
random_state=42
|
144 |
+
)
|
145 |
+
df_balanced_list.append(df_emo_oversampled)
|
146 |
+
else:
|
147 |
+
# Nếu emo này = max_count rồi thì giữ nguyên
|
148 |
+
df_balanced_list.append(df_emo)
|
149 |
+
|
150 |
+
df = pd.concat(df_balanced_list, axis=0)
|
151 |
+
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
|
152 |
+
|
153 |
+
print("\nSau khi cân bằng tất cả lớp:")
|
154 |
+
print(df["Emotion"].value_counts())
|
155 |
+
|
156 |
+
df.to_excel(output_path, index=False)
|
157 |
+
|
158 |
+
# Tạo label2id và id2label theo thứ tự bạn cung cấp
|
159 |
+
custom_id2label = {
|
160 |
+
0: 'Anger',
|
161 |
+
1: 'Disgust',
|
162 |
+
2: 'Enjoyment',
|
163 |
+
3: 'Fear',
|
164 |
+
4: 'Other',
|
165 |
+
5: 'Sadness',
|
166 |
+
6: 'Surprise'
|
167 |
+
}
|
168 |
+
label2id = {label: idx for idx, label in custom_id2label.items()}
|
169 |
+
id2label = {v: k for k, v in label2id.items()}
|
170 |
+
|
171 |
+
df["label_id"] = df["Emotion"].map(label2id)
|
172 |
+
if df["label_id"].isnull().any():
|
173 |
+
missing = df[df["label_id"].isnull()]["Emotion"].unique()
|
174 |
+
raise ValueError(f"Những nhãn cảm xúc sau không có trong label2id: {missing}")
|
175 |
+
|
176 |
+
# Tách train/test
|
177 |
+
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label_id"])
|
178 |
+
|
179 |
+
print(f"Train size = {len(train_df)}, Test size = {len(test_df)}")
|
180 |
+
|
181 |
+
# Feature Extraction với TF-IDF
|
182 |
+
vectorizer = TfidfVectorizer(max_features=5000)
|
183 |
+
X_train = vectorizer.fit_transform(train_df["processed_sentence"])
|
184 |
+
X_test = vectorizer.transform(test_df["processed_sentence"])
|
185 |
+
y_train = train_df["label_id"].values
|
186 |
+
y_test = test_df["label_id"].values
|
187 |
+
|
188 |
+
# Huấn luyện mô hình SVM
|
189 |
+
svm_classifier = SVC(kernel='linear', probability=True, random_state=42)
|
190 |
+
print("\n========== Training SVM ==========")
|
191 |
+
svm_classifier.fit(X_train, y_train)
|
192 |
+
|
193 |
+
# Đánh giá mô hình
|
194 |
+
print("\n========== Evaluate on Test set ==========")
|
195 |
+
y_pred = svm_classifier.predict(X_test)
|
196 |
+
|
197 |
+
# Tính các chỉ số
|
198 |
+
accuracy = accuracy_score(y_test, y_pred)
|
199 |
+
precision_macro = precision_score(y_test, y_pred, average='macro', zero_division=0)
|
200 |
+
precision_weighted = precision_score(y_test, y_pred, average='weighted', zero_division=0)
|
201 |
+
recall_macro = recall_score(y_test, y_pred, average='macro', zero_division=0)
|
202 |
+
recall_weighted = recall_score(y_test, y_pred, average='weighted', zero_division=0)
|
203 |
+
f1_macro = f1_score(y_test, y_pred, average='macro', zero_division=0)
|
204 |
+
f1_weighted = f1_score(y_test, y_pred, average='weighted', zero_division=0)
|
205 |
+
conf_matrix = confusion_matrix(y_test, y_pred)
|
206 |
+
|
207 |
+
# In các chỉ số
|
208 |
+
print(f"Test Accuracy: {accuracy:.4f}")
|
209 |
+
print(f"Precision (Macro): {precision_macro:.4f}")
|
210 |
+
print(f"Precision (Weighted): {precision_weighted:.4f}")
|
211 |
+
print(f"Recall (Macro): {recall_macro:.4f}")
|
212 |
+
print(f"Recall (Weighted): {recall_weighted:.4f}")
|
213 |
+
print(f"F1-Score (Macro): {f1_macro:.4f}")
|
214 |
+
print(f"F1-Score (Weighted): {f1_weighted:.4f}")
|
215 |
+
|
216 |
+
print("\n========== Classification Report ==========")
|
217 |
+
report = classification_report(y_test, y_pred, target_names=custom_id2label.values(), digits=4)
|
218 |
+
print(report)
|
219 |
+
|
220 |
+
# Lưu báo cáo vào file
|
221 |
+
report_path = os.path.join("svm_emotion_model", "classification_report.txt")
|
222 |
+
os.makedirs(os.path.dirname(report_path), exist_ok=True)
|
223 |
+
with open(report_path, "w", encoding="utf-8") as f:
|
224 |
+
f.write("========== Classification Report ==========\n")
|
225 |
+
f.write(report)
|
226 |
+
f.write("\n========== Additional Metrics ==========\n")
|
227 |
+
f.write(f"Accuracy: {accuracy:.4f}\n")
|
228 |
+
f.write(f"Precision (Macro): {precision_macro:.4f}\n")
|
229 |
+
f.write(f"Precision (Weighted): {precision_weighted:.4f}\n")
|
230 |
+
f.write(f"Recall (Macro): {recall_macro:.4f}\n")
|
231 |
+
f.write(f"Recall (Weighted): {recall_weighted:.4f}\n")
|
232 |
+
f.write(f"F1-Score (Macro): {f1_macro:.4f}\n")
|
233 |
+
f.write(f"F1-Score (Weighted): {f1_weighted:.4f}\n")
|
234 |
+
f.write("\n========== Confusion Matrix ==========\n")
|
235 |
+
f.write(np.array2string(conf_matrix))
|
236 |
+
|
237 |
+
print("\n========== Classification Report saved to 'svm_emotion_model/classification_report.txt' ==========")
|
238 |
+
|
239 |
+
# Lưu mô hình và các thành phần cần thiết
|
240 |
+
model_output_dir = "./svm_emotion_model"
|
241 |
+
os.makedirs(model_output_dir, exist_ok=True)
|
242 |
+
joblib.dump(svm_classifier, os.path.join(model_output_dir, "svm_classifier.joblib"))
|
243 |
+
joblib.dump(vectorizer, os.path.join(model_output_dir, "tfidf_vectorizer.joblib"))
|
244 |
+
joblib.dump(id2label, os.path.join(model_output_dir, "id2label.json"))
|
245 |
+
|
246 |
+
print("\n========== Model and Vectorizer saved ==========")
|
247 |
+
|
248 |
+
# Predict 1 câu (ví dụ)
|
249 |
+
def predict_text(text):
|
250 |
+
text_proc = preprocess_sentence(text, abbreviations, emoji_mapping)
|
251 |
+
X = vectorizer.transform([text_proc])
|
252 |
+
pred_id = svm_classifier.predict(X)[0]
|
253 |
+
label = custom_id2label[pred_id]
|
254 |
+
return label
|
255 |
+
|
256 |
+
custom_text = "Tôi rất vui khi sử dụng dịch vụ này!"
|
257 |
+
emotion_pred = predict_text(custom_text)
|
258 |
+
print("\nCâu ví dụ:", custom_text)
|
259 |
+
print("Dự đoán cảm xúc:", emotion_pred)
|
260 |
+
|
261 |
+
print("\nHoàn thành demo SVM với cân bằng dữ liệu & nhiều chỉ số đánh giá!")
|
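A minimal reload sketch (not part of main_svm.py): the three artifacts saved above can be loaded back with joblib for standalone inference. Note that id2label.json was written with joblib.dump, so it is a pickle despite the .json extension and must be read back with joblib.load; the helper name load_svm is illustrative only.

import joblib

def load_svm(model_dir="svm_emotion_model"):
    # Restore the classifier, the TF-IDF vectorizer, and the id -> label mapping
    clf = joblib.load(f"{model_dir}/svm_classifier.joblib")
    vec = joblib.load(f"{model_dir}/tfidf_vectorizer.joblib")
    id2label = joblib.load(f"{model_dir}/id2label.json")  # pickled dict, not actual JSON
    return clf, vec, id2label

# Example usage (the input must go through the same preprocess_sentence pipeline as in training):
# clf, vec, id2label = load_svm()
# text = preprocess_sentence("Tôi rất vui khi sử dụng dịch vụ này!", abbreviations, emoji_mapping)
# print(id2label[clf.predict(vec.transform([text]))[0]])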
main_v1.py
ADDED
@@ -0,0 +1,494 @@
# thesis.py
# -*- coding: utf-8 -*-

import pandas as pd
import emoji
import json
import re
from underthesea import word_tokenize
from tqdm import tqdm
import torch
from torchtext.vocab import Vectors
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
import numpy as np
import tensorflow as tf

# ========== PREPROCESSING FUNCTIONS ==========

def preprocess_sentence(sentence, abbreviations, emoji_mapping):
    """
    Preprocess one sentence: lowercase, replace emojis, remove profanity,
    strip special characters, normalize whitespace, etc.
    """
    sentence = sentence.lower()
    sentence = replace_emojis(sentence, emoji_mapping)
    sentence = remove_profanity(sentence)
    sentence = remove_special_characters(sentence)
    sentence = normalize_whitespace(sentence)
    sentence = replace_abbreviations(sentence, abbreviations)
    sentence = remove_repeated_characters(sentence)
    sentence = replace_numbers(sentence)
    sentence = tokenize_sentence(sentence)
    return sentence

def replace_emojis(sentence, emoji_mapping):
    processed_sentence = []
    for char in sentence:
        if char in emoji_mapping:
            processed_sentence.append(emoji_mapping[char])
        elif not emoji.is_emoji(char):
            processed_sentence.append(char)
    return ''.join(processed_sentence)

def remove_profanity(sentence):
    profane_words = ["loz", "vloz", "vl", "dm", "đm", "clgt", "dmm", "cc", "vc", "đù mé", "vãi"]
    words = sentence.split()
    filtered_words = [word for word in words if word.lower() not in profane_words]
    return ' '.join(filtered_words)

def remove_special_characters(sentence):
    return re.sub(r"[\^\*@#&$%<>~{}|\\]", "", sentence)

def normalize_whitespace(sentence):
    return ' '.join(sentence.split())

def replace_abbreviations(sentence, abbreviations):
    words = sentence.split()
    replaced_words = [
        " ".join(abbreviations[word]) if word in abbreviations else word
        for word in words
    ]
    return ' '.join(replaced_words)

def remove_repeated_characters(sentence):
    return re.sub(r"(.)\1{2,}", r"\1", sentence)

def replace_numbers(sentence):
    return re.sub(r"\d+", "[number]", sentence)

def tokenize_sentence(sentence):
    return ' '.join(word_tokenize(sentence))


# ========== DATA MANAGER CLASS ==========

class DataManager:
    def __init__(self, file_path, abbreviations_path, word2vec_path):
        self.file_path = file_path
        self.abbreviations_path = abbreviations_path
        self.word2vec_path = word2vec_path
        self.load_abbreviations()
        self.load_word2vec()

    def load_abbreviations(self):
        with open(self.abbreviations_path, "r", encoding="utf-8") as file:
            self.abbreviations = json.load(file)

    def load_word2vec(self):
        # Load vectors from the word2vec file; unk_init gives out-of-vocabulary words a random normal vector
        self.word_embeddings = Vectors(name=self.word2vec_path, unk_init=torch.Tensor.normal_)
        self.vocabulary = self.create_vocab_from_word2vec()

    def create_vocab_from_word2vec(self):
        vocab = Vocabulary()
        words_list = list(self.word_embeddings.stoi.keys())
        for word in words_list:
            vocab.add(word)
        return vocab

    def preprocess_data(self):
        df = pd.read_excel(self.file_path)
        if "Sentence" not in df.columns:
            raise ValueError("Cột 'Sentence' không tồn tại trong dataset!")

        # Preprocess each sentence
        df["processed_sentence"] = df["Sentence"].apply(
            lambda x: preprocess_sentence(str(x), self.abbreviations, emoji_mapping)
        )

        # Drop rows that are empty after preprocessing
        df = df[df["processed_sentence"].str.strip().astype(bool)]
        return df

    def split_and_convert(
        self, df, label_column="Emotion", maxlen=400, test_size=0.2,
        for_keras=False, batch_size=32
    ):
        """
        Split the data into train/test sets. Returns:
        - If for_keras=False: train_loader, test_loader, label_mapping (PyTorch)
        - If for_keras=True: X_train, X_test, y_train_onehot, y_test_onehot, label_mapping (Keras)
        """

        if label_column not in df.columns:
            raise ValueError(
                f"Cột '{label_column}' không tồn tại trong DataFrame. "
                f"Các cột hiện có: {df.columns.tolist()}"
            )

        # Work on a copy so the caller's label column is not overwritten
        # (this method is called twice in the main block below)
        df = df.copy()

        # Build the label -> id mapping
        label_mapping = {label: idx for idx, label in enumerate(df[label_column].unique())}
        df[label_column] = df[label_column].map(label_mapping)

        X = df["processed_sentence"].tolist()
        y = df[label_column].tolist()
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

        # Convert text into index sequences
        X_train_tensors = self.vocabulary.corpus_to_tensor(X_train, is_tokenized=False)
        X_test_tensors = self.vocabulary.corpus_to_tensor(X_test, is_tokenized=False)

        # Pad sequences
        X_train_padded = pad_sequences(X_train_tensors, maxlen=maxlen)
        X_test_padded = pad_sequences(X_test_tensors, maxlen=maxlen)

        # Debug information
        print(">>> Debug Split and Convert:")
        print("X_train_padded.shape:", X_train_padded.shape)
        print("X_test_padded.shape: ", X_test_padded.shape)
        print("y_train length:", len(y_train))
        print("y_test length: ", len(y_test))

        # Check min/max token ids
        max_token_train = np.max(X_train_padded) if X_train_padded.size > 0 else None
        min_token_train = np.min(X_train_padded) if X_train_padded.size > 0 else None
        max_token_test = np.max(X_test_padded) if X_test_padded.size > 0 else None
        min_token_test = np.min(X_test_padded) if X_test_padded.size > 0 else None

        vocab_size = len(self.vocabulary)
        print(f"vocab_size: {vocab_size}")
        print(f"max_token_train: {max_token_train}, min_token_train: {min_token_train}")
        print(f"max_token_test: {max_token_test}, min_token_test: {min_token_test}")

        if for_keras:
            num_classes = len(label_mapping)
            # One-hot encode the labels
            y_train_onehot = torch.nn.functional.one_hot(torch.tensor(y_train), num_classes=num_classes).numpy()
            y_test_onehot = torch.nn.functional.one_hot(torch.tensor(y_test), num_classes=num_classes).numpy()

            # Debug
            print("y_train_onehot.shape:", y_train_onehot.shape)
            print("y_test_onehot.shape: ", y_test_onehot.shape)

            return X_train_padded, X_test_padded, y_train_onehot, y_test_onehot, label_mapping
        else:
            # Return DataLoaders for PyTorch
            X_train_tensor = torch.tensor(X_train_padded, dtype=torch.long)
            X_test_tensor = torch.tensor(X_test_padded, dtype=torch.long)
            y_train_tensor = torch.tensor(y_train, dtype=torch.long)
            y_test_tensor = torch.tensor(y_test, dtype=torch.long)

            train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
            test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

            train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
            test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
            return train_loader, test_loader, label_mapping


# ========== VOCABULARY CLASS ==========

class Vocabulary:
    def __init__(self):
        self.word2id = {}
        self.word2id['<pad>'] = 0
        self.word2id['<unk>'] = 1
        self.unk_id = self.word2id['<unk>']
        self.id2word = {0: '<pad>', 1: '<unk>'}

    def __getitem__(self, word):
        return self.word2id.get(word, self.unk_id)

    def __contains__(self, word):
        return word in self.word2id

    def __len__(self):
        return len(self.word2id)

    def lookup_tokens(self, word_indexes: list):
        return [self.id2word[word_index] for word_index in word_indexes]

    def add(self, word):
        if word not in self:
            word_index = len(self.word2id)
            self.word2id[word] = word_index
            self.id2word[word_index] = word
            return word_index
        else:
            return self[word]

    @staticmethod
    def tokenize_corpus(corpus):
        tokenized_corpus = []
        for document in tqdm(corpus):
            tokenized_document = [word.replace(" ", "_") for word in word_tokenize(document)]
            tokenized_corpus.append(tokenized_document)
        return tokenized_corpus

    def corpus_to_tensor(self, corpus, is_tokenized=False):
        tokenized_corpus = self.tokenize_corpus(corpus) if not is_tokenized else corpus
        return [
            [self[word] for word in document]
            for document in tokenized_corpus
        ]


# ========== EMOJI => LABEL MAPPING ==========

emoji_mapping = {
    "😀": "[joy]", "😃": "[joy]", "😄": "[joy]", "😁": "[joy]", "😆": "[joy]", "😅": "[joy]", "😂": "[joy]", "🤣": "[joy]",
    "🙂": "[love]", "🙃": "[love]", "😉": "[love]", "😊": "[love]", "😇": "[love]", "🥰": "[love]", "😍": "[love]",
    "🤩": "[love]", "😘": "[love]", "😗": "[love]", "☺": "[love]", "😚": "[love]", "😙": "[love]",
    "😋": "[satisfaction]", "😛": "[satisfaction]", "😜": "[satisfaction]", "🤪": "[satisfaction]", "😝": "[satisfaction]",
    "🤑": "[satisfaction]",
    "🤐": "[neutral]", "🤨": "[neutral]", "😐": "[neutral]", "😑": "[neutral]", "😶": "[neutral]",
    "😏": "[sarcasm]",
    "😒": "[disappointment]", "🙄": "[disappointment]", "😬": "[disappointment]",
    "😔": "[sadness]", "😪": "[sadness]", "😢": "[sadness]", "😭": "[sadness]", "😥": "[sadness]", "😓": "[sadness]",
    "😩": "[tiredness]", "😫": "[tiredness]", "🥱": "[tiredness]",
    "🤤": "[discomfort]", "🤢": "[discomfort]", "🤮": "[discomfort]", "🤧": "[discomfort]", "🥵": "[discomfort]",
    "🥶": "[discomfort]", "🥴": "[discomfort]", "😵": "[discomfort]", "🤯": "[discomfort]",
    "😕": "[confused]", "😟": "[confused]", "🙁": "[confused]", "☹": "[confused]",
    "😮": "[surprise]", "😯": "[surprise]", "😲": "[surprise]", "😳": "[surprise]", "🥺": "[pleading]",
    "😦": "[fear]", "😧": "[fear]", "😨": "[fear]", "😰": "[fear]", "😱": "[fear]",
    "😖": "[confusion]", "😣": "[confusion]", "😞": "[confusion]",
    "😤": "[anger]", "😡": "[anger]", "😠": "[anger]", "🤬": "[anger]", "😈": "[mischievous]", "👿": "[mischievous]"
}


# ========== PYTORCH RNN MODEL DEFINITION ==========

class SimpleRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(SimpleRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        _, (hidden, _) = self.rnn(embedded)
        return self.fc(hidden.squeeze(0))


# ========== PREDICTION WITH THE PYTORCH RNN MODEL ==========

def predict_emotion_rnn(model, text, data_manager, label_mapping, device):
    model.eval()
    with torch.no_grad():
        processed_text = preprocess_sentence(text, data_manager.abbreviations, emoji_mapping)
        tokenized_text = data_manager.vocabulary.tokenize_corpus([processed_text])
        text_tensor = torch.tensor(
            pad_sequences(data_manager.vocabulary.corpus_to_tensor(tokenized_text, is_tokenized=True), maxlen=400),
            dtype=torch.long
        ).to(device)

        output = model(text_tensor)
        _, predicted = torch.max(output, 1)
        reverse_label_mapping = {v: k for k, v in label_mapping.items()}
        return reverse_label_mapping[predicted.item()]


# ========== PREDICTION WITH THE KERAS CNN-LSTM MODEL ==========

def predict_emotion_cnn_lstm(model, text, data_manager, label_mapping):
    processed_text = preprocess_sentence(text, data_manager.abbreviations, emoji_mapping)
    tokenized_text = data_manager.vocabulary.tokenize_corpus([processed_text])
    text_tensor = pad_sequences(data_manager.vocabulary.corpus_to_tensor(tokenized_text, is_tokenized=True), maxlen=400)
    output = model.predict(text_tensor)
    predicted = output.argmax(axis=1)[0]
    reverse_label_mapping = {v: k for k, v in label_mapping.items()}
    return reverse_label_mapping[predicted]


# ========== MAIN (DEMO RUN) ==========

if __name__ == "__main__":
    # --------------------------
    # Set your paths here:
    # --------------------------
    file_path = "train.xlsx"  # source Excel file (contains the "Sentence", "Emotion", ... columns)
    abbreviations_path = "abbreviations.json"
    word2vec_path = "/home/datpham/datpham/thesis-ngtram/word2vec_vi_syllables_100dims.txt"
    output_path = "processed.xlsx"

    data_manager = DataManager(
        file_path=file_path,
        abbreviations_path=abbreviations_path,
        word2vec_path=word2vec_path
    )

    # 1) Read and preprocess
    df = data_manager.preprocess_data()
    print("Trước khi undersampling:")
    print(df["Emotion"].value_counts())

    # 2) UNDERSAMPLING (example)
    # Adjust the specific emotion names to match your dataset
    df_enjoyment = df[df["Emotion"] == "Enjoyment"]
    df_other = df[df["Emotion"] == "Other"]
    df_anger = df[df["Emotion"] == "Anger"]
    df_sadness = df[df["Emotion"] == "Sadness"]
    df_disgust = df[df["Emotion"] == "Disgust"]
    df_fear = df[df["Emotion"] == "Fear"]
    df_surprise = df[df["Emotion"] == "Surprise"]

    # Example: keep 2000 samples for 'Enjoyment'
    if len(df_enjoyment) > 2000:
        df_enjoyment_undersampled = df_enjoyment.sample(n=2000, random_state=42)
    else:
        df_enjoyment_undersampled = df_enjoyment

    df_balanced = pd.concat([
        df_enjoyment_undersampled,
        df_other,
        df_anger,
        df_sadness,
        df_disgust,
        df_fear,
        df_surprise
    ], axis=0)

    df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
    df = df_balanced

    print("\nSau khi undersampling:")
    print(df["Emotion"].value_counts())

    df.to_excel(output_path, index=False)

    # 3) Build the PyTorch data loaders
    train_loader, test_loader, label_mapping = data_manager.split_and_convert(
        df, label_column="Emotion", for_keras=False
    )

    vocab_size = len(data_manager.vocabulary)
    embedding_dim = 100
    hidden_dim = 128
    output_dim = len(label_mapping)

    model_rnn = SimpleRNN(vocab_size, embedding_dim, hidden_dim, output_dim)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model_rnn.parameters())

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_rnn.to(device)

    num_epochs = 20
    for epoch in range(num_epochs):
        model_rnn.train()
        epoch_loss = 0
        correct = 0
        total = 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()
            predictions = model_rnn(X_batch)
            loss = criterion(predictions, y_batch)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            _, predicted = torch.max(predictions, 1)
            correct += (predicted == y_batch).sum().item()
            total += y_batch.size(0)

        print(f"Epoch {epoch+1}/{num_epochs}, "
              f"Loss: {epoch_loss/len(train_loader):.4f}, "
              f"Accuracy: {correct/total:.4f}")

    # Evaluate the RNN on the test set
    model_rnn.eval()
    test_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            predictions = model_rnn(X_batch)
            loss = criterion(predictions, y_batch)
            test_loss += loss.item()

            _, predicted = torch.max(predictions, 1)
            correct += (predicted == y_batch).sum().item()
            total += y_batch.size(0)

    print(f"Test Loss: {test_loss/len(test_loader):.4f}, "
          f"Test Accuracy: {correct/total:.4f}")


    # ========== CNN-LSTM (Keras) ==========

    from keras.models import Model
    from keras.layers import Input, Embedding, Convolution1D, LSTM, Dense, Dropout, Lambda, concatenate
    from keras.optimizers import Adam
    from keras.callbacks import ModelCheckpoint

    print("Training CNN-LSTM...")

    X_train, X_test, y_train, y_test, label_mapping = data_manager.split_and_convert(
        df, label_column="Emotion", for_keras=True
    )

    maxlen = 400

    input_layer = Input(shape=(maxlen,), dtype='int32', name='main_input')
    emb_layer = Embedding(len(data_manager.vocabulary), embedding_dim)(input_layer)

    def max_1d(X):
        return tf.reduce_max(X, axis=1)

    con3_layer = Convolution1D(150, kernel_size=3, activation='relu')(emb_layer)
    pool_con3_layer = Lambda(max_1d, output_shape=(150,))(con3_layer)

    con5_layer = Convolution1D(150, kernel_size=5, activation='relu')(emb_layer)
    pool_con5_layer = Lambda(max_1d, output_shape=(150,))(con5_layer)

    lstm_layer = LSTM(128)(emb_layer)

    cnn_lstm_layer = concatenate([pool_con3_layer, pool_con5_layer, lstm_layer])

    dense_layer = Dense(100, activation='relu')(cnn_lstm_layer)
    dropout_layer = Dropout(0.2)(dense_layer)
    output_layer = Dense(len(label_mapping), activation='softmax')(dropout_layer)

    model_cnn_lstm = Model(inputs=input_layer, outputs=output_layer)
    model_cnn_lstm.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])

    checkpoint = ModelCheckpoint('cnn_lstm_best.keras', save_best_only=True, monitor='val_accuracy', mode='max')
    model_cnn_lstm.fit(
        X_train, y_train,
        validation_data=(X_test, y_test),
        batch_size=32,
        epochs=20,
        callbacks=[checkpoint]
    )

    model_cnn_lstm.save('cnn_lstm_model.keras')

    loss, accuracy = model_cnn_lstm.evaluate(X_test, y_test)
    print(f"CNN-LSTM Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")

    # Demo: predict a new sentence
    custom_text = "Tôi rất vui khi sử dụng dịch vụ này!"

    # RNN (PyTorch)
    emotion_rnn = predict_emotion_rnn(model_rnn, custom_text, data_manager, label_mapping, device)
    print(f"Predicted Emotion (RNN): {emotion_rnn}")

    # CNN-LSTM (Keras)
    cnn_lstm_model = tf.keras.models.load_model('cnn_lstm_model.keras')
    emotion_cnn_lstm = predict_emotion_cnn_lstm(cnn_lstm_model, custom_text, data_manager, label_mapping)
    print(f"Predicted Emotion (CNN-LSTM): {emotion_cnn_lstm}")

    # Check the TF version and GPU availability
    print("TF version:", tf.__version__)
    print("GPU devices:", tf.config.list_physical_devices("GPU"))
    # CUDA/GPU can also be checked with a system command (optional):
    # import os
    # os.system("nvidia-smi")
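A minimal sketch (not part of main_v1.py): the DataManager loads the pretrained word2vec vectors but both models above start from randomly initialized embeddings. If the pretrained vectors are wanted in the Keras CNN-LSTM, the Embedding layer can be seeded with them; the helper name build_embedding_matrix is illustrative only, and the vocabulary/embedding objects are assumed to be the ones created by DataManager.

import numpy as np
import tensorflow as tf

def build_embedding_matrix(vocabulary, word_embeddings, embedding_dim=100):
    # Row i holds the pretrained vector of the word with index i; <pad>/<unk> keep random vectors
    matrix = np.random.normal(size=(len(vocabulary), embedding_dim)).astype("float32")
    for word, idx in vocabulary.word2id.items():
        if word in word_embeddings.stoi:
            matrix[idx] = word_embeddings[word].numpy()
    return matrix

# embedding_matrix = build_embedding_matrix(data_manager.vocabulary, data_manager.word_embeddings)
# emb_layer = tf.keras.layers.Embedding(
#     input_dim=embedding_matrix.shape[0],
#     output_dim=embedding_matrix.shape[1],
#     embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
#     trainable=True,
# )(input_layer)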
phobert_emotion_model/classification_report.txt
ADDED
@@ -0,0 +1,23 @@
========== Classification Report ==========
              precision    recall  f1-score   support

       Anger     0.9768    0.9788    0.9778       991
     Disgust     0.9457    0.9657    0.9556       991
   Enjoyment     0.9166    0.8204    0.8658       991
        Fear     0.9771    0.9879    0.9825       992
       Other     0.9026    0.9253    0.9138       991
     Sadness     0.9302    0.9677    0.9486       991
    Surprise     0.9448    0.9496    0.9472       992

    accuracy                         0.9422      6939
   macro avg     0.9420    0.9422    0.9416      6939
weighted avg     0.9420    0.9422    0.9416      6939

========== Confusion Matrix ==========
[[970   9   3   4   2   2   1]
 [ 12 957   2   3   7   5   5]
 [  5  16 813   9  67  42  39]
 [  2   2   6 980   1   1   0]
 [  3  13  33   2 917  13  10]
 [  1   7  17   3   4 959   0]
 [  0   8  13   2  18   9 942]]
phobert_emotion_model/confusion_matrix.png
ADDED
phobert_emotion_model/id2label.json
ADDED
@@ -0,0 +1,9 @@
{
    "0": "Anger",
    "1": "Disgust",
    "2": "Enjoyment",
    "3": "Fear",
    "4": "Other",
    "5": "Sadness",
    "6": "Surprise"
}
phobert_emotion_model/phobert_emotion_model/added_tokens.json
ADDED
@@ -0,0 +1,3 @@
{
  "<mask>": 64000
}
phobert_emotion_model/phobert_emotion_model/bpe.codes
ADDED
The diff for this file is too large to render.
See raw diff
phobert_emotion_model/phobert_emotion_model/config.json
ADDED
@@ -0,0 +1,48 @@
{
  "_name_or_path": "vinai/phobert-base",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 258,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "tokenizer_class": "PhobertTokenizer",
  "torch_dtype": "float32",
  "transformers_version": "4.40.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 64001
}
phobert_emotion_model/phobert_emotion_model/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:23cc285ab489e07145436eebb67247d71cd67c817155cc65eb5a7e52e78ed4f0
size 540038764
phobert_emotion_model/phobert_emotion_model/special_tokens_map.json
ADDED
@@ -0,0 +1,9 @@
{
  "bos_token": "<s>",
  "cls_token": "<s>",
  "eos_token": "</s>",
  "mask_token": "<mask>",
  "pad_token": "<pad>",
  "sep_token": "</s>",
  "unk_token": "<unk>"
}
phobert_emotion_model/phobert_emotion_model/tokenizer_config.json
ADDED
@@ -0,0 +1,54 @@
{
  "added_tokens_decoder": {
    "0": {
      "content": "<s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "<pad>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "</s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "3": {
      "content": "<unk>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "64000": {
      "content": "<mask>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": "<s>",
  "clean_up_tokenization_spaces": true,
  "cls_token": "<s>",
  "eos_token": "</s>",
  "mask_token": "<mask>",
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": "<pad>",
  "sep_token": "</s>",
  "tokenizer_class": "PhobertTokenizer",
  "unk_token": "<unk>"
}
phobert_emotion_model/phobert_emotion_model/vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
phobert_results/checkpoint-10410/added_tokens.json
ADDED
@@ -0,0 +1,3 @@
{
  "<mask>": 64000
}
phobert_results/checkpoint-10410/bpe.codes
ADDED
The diff for this file is too large to render.
See raw diff