# svm_emotion_classifier.py
# -*- coding: utf-8 -*-
"""Train and evaluate a TF-IDF + linear SVM emotion classifier.

Run as a script, this module:

1. reads ``train.xlsx`` (columns ``Sentence`` and ``Emotion``),
2. preprocesses every sentence (emoji mapping, profanity removal,
   abbreviation expansion, number masking, tokenization),
3. splits the data into train/test and oversamples the training split,
4. fits a TF-IDF vectorizer and a linear ``SVC``,
5. prints and saves evaluation metrics, then saves the model artifacts.
"""

import json
import os
import re

import emoji
import joblib
import numpy as np
import pandas as pd
import torch  # May not be needed for the SVM; kept in case it is required
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix
)
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.utils import resample
from underthesea import word_tokenize

########################
# PREPROCESSING
########################

# Profanity terms. Entries containing whitespace are matched as a run of
# consecutive tokens; all others are matched against single tokens.
_PROFANE_TERMS = (
    "loz", "vloz", "vl", "dm", "đm", "clgt", "dmm", "cc", "vc", "đù mé", "vãi",
)
_PROFANE_WORDS = frozenset(term for term in _PROFANE_TERMS if " " not in term)
_PROFANE_PHRASES = tuple(
    tuple(term.split()) for term in _PROFANE_TERMS if " " in term
)

# Patterns are compiled once because preprocessing runs once per dataset row.
_SPECIAL_CHARS_RE = re.compile(r"[\^\*@#&$%<>~{}|\\]")
_REPEATED_CHAR_RE = re.compile(r"(.)\1{2,}")
_NUMBER_RE = re.compile(r"\d+")


def replace_emojis(sentence, emoji_mapping):
    """Replace mapped emoji characters with their tag and drop other emojis.

    Args:
        sentence: Input text, scanned one character at a time.
        emoji_mapping: Dict from a single emoji character to its replacement.

    Returns:
        The text with mapped emojis substituted, unmapped emojis removed and
        every other character kept unchanged.
    """
    pieces = []
    for char in sentence:
        if char in emoji_mapping:
            pieces.append(emoji_mapping[char])
        elif not emoji.is_emoji(char):
            pieces.append(char)
    return "".join(pieces)


def remove_profanity(sentence):
    """Remove profane words and multi-word profane phrases from *sentence*.

    Matching is case-insensitive and works on whitespace-separated tokens.
    Multi-word entries are compared against consecutive tokens; comparing
    them with a single token could never match.

    Returns:
        The remaining tokens joined by single spaces.
    """
    words = sentence.split()
    lowered = [word.lower() for word in words]
    kept = []
    index = 0
    while index < len(words):
        # Length of the profane phrase starting at this token, or 0 if none.
        phrase_len = next(
            (
                len(phrase)
                for phrase in _PROFANE_PHRASES
                if tuple(lowered[index:index + len(phrase)]) == phrase
            ),
            0,
        )
        if phrase_len:
            index += phrase_len
        elif lowered[index] in _PROFANE_WORDS:
            index += 1
        else:
            kept.append(words[index])
            index += 1
    return " ".join(kept)


def remove_special_characters(sentence):
    """Strip the characters ``^ * @ # & $ % < > ~ { } | \\`` from *sentence*."""
    return _SPECIAL_CHARS_RE.sub("", sentence)


def normalize_whitespace(sentence):
    """Collapse whitespace runs to single spaces and trim both ends."""
    return " ".join(sentence.split())


def remove_repeated_characters(sentence):
    """Reduce any character repeated three or more times in a row to one."""
    return _REPEATED_CHAR_RE.sub(r"\1", sentence)


def replace_numbers(sentence):
    """Replace every run of digits with the literal token ``[number]``."""
    return _NUMBER_RE.sub("[number]", sentence)


def tokenize_underthesea(sentence):
    """Tokenize *sentence* with ``word_tokenize`` and join tokens by spaces.

    NOTE(review): re-joining with spaces may discard the word boundaries the
    tokenizer found; confirm whether ``word_tokenize(..., format="text")``
    was intended.
    """
    tokens = word_tokenize(sentence)
    return " ".join(tokens)


def _expand_abbreviation(word, abbreviations):
    """Return the expansion of *word*, or *word* itself if it has none."""
    if word not in abbreviations:
        return word
    expansion = abbreviations[word]
    # A plain-string expansion is used as-is; joining it would insert a
    # space between every character.
    if isinstance(expansion, str):
        return expansion
    return " ".join(expansion)


def preprocess_sentence(sentence, abbreviations, emoji_mapping):
    """Run the full preprocessing pipeline on one sentence.

    Steps, in order: lowercase, emoji replacement, profanity removal,
    special-character removal, whitespace normalization, abbreviation
    expansion, repeated-character reduction, number masking, tokenization.

    Args:
        sentence: Raw input text.
        abbreviations: Dict from an abbreviation to its expansion, given
            either as a sequence of words or as a single string.
        emoji_mapping: Dict from an emoji character to its replacement tag.

    Returns:
        The preprocessed sentence as a single string.
    """
    sentence = sentence.lower()
    sentence = replace_emojis(sentence, emoji_mapping)
    sentence = remove_profanity(sentence)
    sentence = remove_special_characters(sentence)
    sentence = normalize_whitespace(sentence)

    # Expand abbreviations token by token.
    sentence = " ".join(
        _expand_abbreviation(word, abbreviations) for word in sentence.split()
    )

    sentence = remove_repeated_characters(sentence)
    sentence = replace_numbers(sentence)

    # Vietnamese word tokenization.
    sentence = tokenize_underthesea(sentence)
    return sentence


# Maps a single emoji character to the bracketed tag that replaces it.
emoji_mapping = {
    "😀": "[joy]", "😃": "[joy]", "😄": "[joy]", "😁": "[joy]",
    "😆": "[joy]", "😅": "[joy]", "😂": "[joy]", "🤣": "[joy]",
    "🙂": "[love]", "🙃": "[love]", "😉": "[love]", "😊": "[love]",
    "😇": "[love]", "🥰": "[love]", "😍": "[love]", "🤩": "[love]",
    "😘": "[love]", "😗": "[love]", "☺": "[love]", "😚": "[love]",
    "😙": "[love]",
    "😋": "[satisfaction]", "😛": "[satisfaction]", "😜": "[satisfaction]",
    "🤪": "[satisfaction]", "😝": "[satisfaction]", "🤑": "[satisfaction]",
    "🤐": "[neutral]", "🤨": "[neutral]", "😐": "[neutral]",
    "😑": "[neutral]", "😶": "[neutral]",
    "😏": "[sarcasm]",
    "😒": "[disappointment]", "🙄": "[disappointment]",
    "😬": "[disappointment]",
    "😔": "[sadness]", "😪": "[sadness]", "😢": "[sadness]",
    "😭": "[sadness]", "😥": "[sadness]", "😓": "[sadness]",
    "😩": "[tiredness]", "😫": "[tiredness]", "🥱": "[tiredness]",
    "🤤": "[discomfort]", "🤢": "[discomfort]", "🤮": "[discomfort]",
    "🤧": "[discomfort]", "🥵": "[discomfort]", "🥶": "[discomfort]",
    "🥴": "[discomfort]", "😵": "[discomfort]", "🤯": "[discomfort]",
    "😕": "[confused]", "😟": "[confused]", "🙁": "[confused]",
    "☹": "[confused]",
    "😮": "[surprise]", "😯": "[surprise]", "😲": "[surprise]",
    "😳": "[surprise]",
    "🥺": "[pleading]",
    "😦": "[fear]", "😧": "[fear]", "😨": "[fear]", "😰": "[fear]",
    "😱": "[fear]",
    "😖": "[confusion]", "😣": "[confusion]", "😞": "[confusion]",
    "😤": "[anger]", "😡": "[anger]", "😠": "[anger]", "🤬": "[anger]",
    "😈": "[mischievous]", "👿": "[mischievous]"
}


def load_abbreviations(path):
    """Load the abbreviation dictionary from the UTF-8 JSON file at *path*."""
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def _oversample_to_largest_class(frame, label_column, random_state=42):
    """Oversample every class in *frame* up to the largest class size.

    Classes smaller than the largest one are resampled with replacement;
    the largest class is kept unchanged. The result is shuffled and
    re-indexed.
    """
    max_count = frame[label_column].value_counts().max()
    parts = []
    for label in frame[label_column].unique():
        subset = frame[frame[label_column] == label]
        if len(subset) < max_count:
            subset = resample(
                subset,
                replace=True,
                n_samples=max_count,
                random_state=random_state
            )
        parts.append(subset)
    balanced = pd.concat(parts, axis=0)
    return balanced.sample(frac=1, random_state=random_state).reset_index(drop=True)


###################################
# MAIN
###################################
if __name__ == "__main__":
    file_path = "train.xlsx"
    abbreviations_path = "abbreviations.json"
    output_path = "processed_svm.xlsx"  # Output filename reflects the SVM pipeline
    model_output_dir = "./svm_emotion_model"

    abbreviations = load_abbreviations(abbreviations_path)

    df = pd.read_excel(file_path)
    if "Sentence" not in df.columns or "Emotion" not in df.columns:
        raise ValueError("Dataset phải chứa cột 'Sentence' và 'Emotion'!")

    # Preprocess every sentence.
    df["processed_sentence"] = df["Sentence"].apply(
        lambda x: preprocess_sentence(str(x), abbreviations, emoji_mapping)
    )
    # Drop rows that became empty; copy so later column assignment is safe.
    df = df[df["processed_sentence"].str.strip().astype(bool)].copy()

    print("Trước khi cân bằng:")
    print(df["Emotion"].value_counts())

    # Build label2id and id2label in the fixed, caller-provided order.
    custom_id2label = {
        0: 'Anger',
        1: 'Disgust',
        2: 'Enjoyment',
        3: 'Fear',
        4: 'Other',
        5: 'Sadness',
        6: 'Surprise'
    }
    label2id = {label: idx for idx, label in custom_id2label.items()}
    id2label = {v: k for k, v in label2id.items()}

    df["label_id"] = df["Emotion"].map(label2id)
    if df["label_id"].isnull().any():
        missing = df[df["label_id"].isnull()]["Emotion"].unique()
        raise ValueError(f"Những nhãn cảm xúc sau không có trong label2id: {missing}")

    # Split BEFORE oversampling. Resampling with replacement first would put
    # copies of the same row in both splits and inflate the test metrics.
    # Stratification needs at least two rows for every class.
    train_df, test_df = train_test_split(
        df, test_size=0.2, random_state=42, stratify=df["label_id"]
    )

    # =========== BALANCE ALL CLASSES (training split only) =============
    train_df = _oversample_to_largest_class(train_df, "Emotion")

    print("\nSau khi cân bằng tất cả lớp:")
    print(train_df["Emotion"].value_counts())

    # Save the balanced training data with the original column layout.
    train_df.drop(columns="label_id").to_excel(output_path, index=False)

    print(f"Train size = {len(train_df)}, Test size = {len(test_df)}")

    # Feature extraction with TF-IDF (fitted on the training split only).
    vectorizer = TfidfVectorizer(max_features=5000)
    X_train = vectorizer.fit_transform(train_df["processed_sentence"])
    X_test = vectorizer.transform(test_df["processed_sentence"])

    y_train = train_df["label_id"].values
    y_test = test_df["label_id"].values

    # Train the SVM model.
    svm_classifier = SVC(kernel='linear', probability=True, random_state=42)
    print("\n========== Training SVM ==========")
    svm_classifier.fit(X_train, y_train)

    # Evaluate the model.
    print("\n========== Evaluate on Test set ==========")
    y_pred = svm_classifier.predict(X_test)

    # Compute the metrics.
    accuracy = accuracy_score(y_test, y_pred)
    precision_macro = precision_score(y_test, y_pred, average='macro', zero_division=0)
    precision_weighted = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall_macro = recall_score(y_test, y_pred, average='macro', zero_division=0)
    recall_weighted = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1_macro = f1_score(y_test, y_pred, average='macro', zero_division=0)
    f1_weighted = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    # Pass the label ids explicitly so the report and matrix keep all seven
    # classes even when one of them is absent from the test split.
    label_ids = list(custom_id2label.keys())
    label_names = list(custom_id2label.values())
    conf_matrix = confusion_matrix(y_test, y_pred, labels=label_ids)

    # Print the metrics.
    print(f"Test Accuracy: {accuracy:.4f}")
    print(f"Precision (Macro): {precision_macro:.4f}")
    print(f"Precision (Weighted): {precision_weighted:.4f}")
    print(f"Recall (Macro): {recall_macro:.4f}")
    print(f"Recall (Weighted): {recall_weighted:.4f}")
    print(f"F1-Score (Macro): {f1_macro:.4f}")
    print(f"F1-Score (Weighted): {f1_weighted:.4f}")

    print("\n========== Classification Report ==========")
    report = classification_report(
        y_test,
        y_pred,
        labels=label_ids,
        target_names=label_names,
        digits=4,
        zero_division=0
    )
    print(report)

    # Save the report to a file.
    os.makedirs(model_output_dir, exist_ok=True)
    report_path = os.path.join(model_output_dir, "classification_report.txt")
    with open(report_path, "w", encoding="utf-8") as f:
        f.write("========== Classification Report ==========\n")
        f.write(report)
        f.write("\n========== Additional Metrics ==========\n")
        f.write(f"Accuracy: {accuracy:.4f}\n")
        f.write(f"Precision (Macro): {precision_macro:.4f}\n")
        f.write(f"Precision (Weighted): {precision_weighted:.4f}\n")
        f.write(f"Recall (Macro): {recall_macro:.4f}\n")
        f.write(f"Recall (Weighted): {recall_weighted:.4f}\n")
        f.write(f"F1-Score (Macro): {f1_macro:.4f}\n")
        f.write(f"F1-Score (Weighted): {f1_weighted:.4f}\n")
        f.write("\n========== Confusion Matrix ==========\n")
        f.write(np.array2string(conf_matrix))

    print("\n========== Classification Report saved to 'svm_emotion_model/classification_report.txt' ==========")

    # Save the model and the components needed for inference.
    joblib.dump(svm_classifier, os.path.join(model_output_dir, "svm_classifier.joblib"))
    joblib.dump(vectorizer, os.path.join(model_output_dir, "tfidf_vectorizer.joblib"))
    # NOTE(review): despite the .json extension this file is written by
    # joblib, not as JSON; it is left unchanged here in case an existing
    # loader reads it with joblib.load. Confirm before switching formats.
    joblib.dump(id2label, os.path.join(model_output_dir, "id2label.json"))

    print("\n========== Model and Vectorizer saved ==========")

    # Predict a single sentence (example).
    def predict_text(text):
        """Preprocess *text* and return the predicted emotion label name."""
        text_proc = preprocess_sentence(text, abbreviations, emoji_mapping)
        X = vectorizer.transform([text_proc])
        pred_id = svm_classifier.predict(X)[0]
        label = custom_id2label[pred_id]
        return label

    custom_text = "Tôi rất vui khi sử dụng dịch vụ này!"
    emotion_pred = predict_text(custom_text)
    print("\nCâu ví dụ:", custom_text)
    print("Dự đoán cảm xúc:", emotion_pred)

    print("\nHoàn thành demo SVM với cân bằng dữ liệu & nhiều chỉ số đánh giá!")