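# NOTE: the following lines appear to be file-viewer page residue captured along
# with the script; they are kept as comments so the module can be parsed.
# ducdatit2002's picture
# Upload folder using huggingface_hub
# e09333c verified
# raw
# history blame
# 10.8 kB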
# svm_emotion_classifier.py
# -*- coding: utf-8 -*-
import re
import emoji
import json
import pandas as pd
import numpy as np
import torch # Có thể không cần thiết cho SVM, nhưng giữ lại nếu cần
from underthesea import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
accuracy_score,
classification_report,
precision_score,
recall_score,
f1_score,
confusion_matrix
)
from sklearn.utils import resample
import joblib
import os
########################
# TIỀN XỬ LÝ
########################
def replace_emojis(sentence, emoji_mapping):
    """Replace mapped emojis with their text tag and drop every other emoji.

    The sentence is processed one character at a time:

    * a character that is a key of ``emoji_mapping`` is replaced by the
      mapped value;
    * any other character for which ``emoji.is_emoji`` is true is removed;
    * everything else is kept unchanged.

    Args:
        sentence: Text to process.
        emoji_mapping: Mapping from a single character to its replacement.

    Returns:
        The processed text as a single string.
    """

    def _convert(symbol):
        # Mapped characters win over the generic "is this an emoji" check.
        if symbol in emoji_mapping:
            return emoji_mapping[symbol]
        # Unmapped emojis contribute nothing to the output.
        return "" if emoji.is_emoji(symbol) else symbol

    return "".join(map(_convert, sentence))
# Terms stripped by remove_profanity. Entries may be a single word or a
# space-separated phrase.
_PROFANE_TERMS = (
    "loz", "vloz", "vl", "dm", "đm", "clgt", "dmm", "cc", "vc", "đù mé", "vãi",
)
# Single-word terms: constant-time membership test per token.
_PROFANE_WORDS = frozenset(term for term in _PROFANE_TERMS if " " not in term)
# Multi-word terms, pre-split so they can be compared against token windows.
_PROFANE_PHRASES = tuple(
    tuple(term.split()) for term in _PROFANE_TERMS if " " in term
)


def remove_profanity(sentence):
    """Remove profane words and phrases from a sentence.

    The sentence is split on whitespace and compared case-insensitively
    against the profanity list. Multi-word entries (e.g. "đù mé") are matched
    as consecutive tokens; a per-token comparison alone can never match them
    because no single token contains a space.

    Args:
        sentence: Text to filter.

    Returns:
        The remaining tokens joined by single spaces (whitespace is therefore
        normalised as a side effect). Returns "" for empty input.
    """
    words = sentence.split()
    lowered = [word.lower() for word in words]

    kept = []
    index = 0
    while index < len(words):
        # Try multi-word phrases first so their tokens are dropped together.
        matched_length = 0
        for phrase in _PROFANE_PHRASES:
            if tuple(lowered[index:index + len(phrase)]) == phrase:
                matched_length = len(phrase)
                break
        if matched_length:
            index += matched_length
            continue

        if lowered[index] not in _PROFANE_WORDS:
            kept.append(words[index])
        index += 1

    return " ".join(kept)
def remove_special_characters(sentence):
    """Delete a fixed set of symbol characters from the sentence.

    The removed characters are: ^ * @ # & $ % < > ~ { } | and backslash.
    All other characters, including other punctuation, are left untouched.

    Args:
        sentence: Text to clean.

    Returns:
        The text with the listed characters removed.
    """
    special_characters = re.compile(r"[\^\*@#&$%<>~{}|\\]")
    return special_characters.sub("", sentence)
def normalize_whitespace(sentence):
    """Collapse every run of whitespace into one space and trim both ends.

    Args:
        sentence: Text to normalise.

    Returns:
        The whitespace-separated pieces of ``sentence`` joined by single
        spaces.
    """
    pieces = sentence.split()
    return " ".join(pieces)
def remove_repeated_characters(sentence):
    """Shrink any character repeated three or more times in a row to one.

    Runs of exactly two identical characters are left as they are. The
    pattern's ``.`` does not match a newline, so newline runs are unaffected.

    Args:
        sentence: Text to process.

    Returns:
        The text with long character runs collapsed.
    """
    long_run = re.compile(r"(.)\1{2,}")
    return long_run.sub(r"\1", sentence)
def replace_numbers(sentence):
    """Replace each run of digits with the placeholder ``[number]``.

    Args:
        sentence: Text to process.

    Returns:
        The text with every maximal digit sequence substituted.
    """
    digit_run = re.compile(r"\d+")
    return digit_run.sub("[number]", sentence)
def tokenize_underthesea(sentence):
    """Tokenize a sentence with underthesea and re-join the tokens with spaces.

    Args:
        sentence: Text to tokenize.

    Returns:
        The tokens returned by ``word_tokenize`` joined by single spaces.
    """
    # NOTE(review): joining with spaces means a token that itself contains a
    # space is no longer distinguishable downstream — confirm this is intended.
    return " ".join(word_tokenize(sentence))
def preprocess_sentence(sentence, abbreviations, emoji_mapping):
    """Run the full text-cleaning pipeline on one sentence.

    Steps, in order: lowercase, emoji replacement, profanity removal,
    special-character removal, whitespace normalisation, abbreviation
    expansion, repeated-character collapsing, number replacement and
    Vietnamese tokenization.

    Args:
        sentence: Raw input text.
        abbreviations: Mapping from an abbreviation token to its expansion.
            The expansion is joined with spaces, so it is presumably a list
            of words — TODO confirm against the abbreviations file.
        emoji_mapping: Mapping from emoji character to replacement tag.

    Returns:
        The cleaned, tokenized sentence as a single string.
    """
    text = replace_emojis(sentence.lower(), emoji_mapping)
    text = normalize_whitespace(remove_special_characters(remove_profanity(text)))

    # Expand abbreviations token by token; unknown tokens pass through.
    expanded = " ".join(
        " ".join(abbreviations[token]) if token in abbreviations else token
        for token in text.split()
    )

    collapsed = remove_repeated_characters(expanded)
    without_numbers = replace_numbers(collapsed)
    # Vietnamese tokenization is the last step.
    return tokenize_underthesea(without_numbers)
# Lookup table from a single emoji character to a bracketed text tag.
# It is passed to preprocess_sentence, which hands it to replace_emojis:
# characters found here are replaced by their tag, while emojis that are not
# listed are removed from the text.
# NOTE(review): both "[confused]" and "[confusion]" are used as tags below —
# confirm whether they are meant to be two distinct categories.
emoji_mapping = {
"😀": "[joy]", "😃": "[joy]", "😄": "[joy]", "😁": "[joy]", "😆": "[joy]", "😅": "[joy]", "😂": "[joy]", "🤣": "[joy]",
"🙂": "[love]", "🙃": "[love]", "😉": "[love]", "😊": "[love]", "😇": "[love]", "🥰": "[love]", "😍": "[love]",
"🤩": "[love]", "😘": "[love]", "😗": "[love]", "☺": "[love]", "😚": "[love]", "😙": "[love]",
"😋": "[satisfaction]", "😛": "[satisfaction]", "😜": "[satisfaction]", "🤪": "[satisfaction]", "😝": "[satisfaction]",
"🤑": "[satisfaction]",
"🤐": "[neutral]", "🤨": "[neutral]", "😐": "[neutral]", "😑": "[neutral]", "😶": "[neutral]",
"😏": "[sarcasm]",
"😒": "[disappointment]", "🙄": "[disappointment]", "😬": "[disappointment]",
"😔": "[sadness]", "😪": "[sadness]", "😢": "[sadness]", "😭": "[sadness]", "😥": "[sadness]", "😓": "[sadness]",
"😩": "[tiredness]", "😫": "[tiredness]", "🥱": "[tiredness]",
"🤤": "[discomfort]", "🤢": "[discomfort]", "🤮": "[discomfort]", "🤧": "[discomfort]", "🥵": "[discomfort]",
"🥶": "[discomfort]", "🥴": "[discomfort]", "😵": "[discomfort]", "🤯": "[discomfort]",
"😕": "[confused]", "😟": "[confused]", "🙁": "[confused]", "☹": "[confused]",
"😮": "[surprise]", "😯": "[surprise]", "😲": "[surprise]", "😳": "[surprise]", "🥺": "[pleading]",
"😦": "[fear]", "😧": "[fear]", "😨": "[fear]", "😰": "[fear]", "😱": "[fear]",
"😖": "[confusion]", "😣": "[confusion]", "😞": "[confusion]",
"😤": "[anger]", "😡": "[anger]", "😠": "[anger]", "🤬": "[anger]", "😈": "[mischievous]", "👿": "[mischievous]"
}
def load_abbreviations(path):
    """Load the abbreviation table from a UTF-8 encoded JSON file.

    Args:
        path: Location of the JSON file.

    Returns:
        The decoded JSON content.
    """
    with open(path, encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
###################################
# MAIN
###################################
def _oversample_to_largest_class(frame, label_column, random_state=42):
    """Return a shuffled copy of ``frame`` with every class oversampled (with
    replacement) up to the size of the largest class in ``label_column``."""
    largest = frame[label_column].value_counts().max()
    parts = []
    for label in frame[label_column].unique():
        subset = frame[frame[label_column] == label]
        if len(subset) < largest:
            subset = resample(
                subset,
                replace=True,
                n_samples=largest,
                random_state=random_state,
            )
        parts.append(subset)
    balanced = pd.concat(parts, axis=0)
    return balanced.sample(frac=1, random_state=random_state).reset_index(drop=True)


def main():
    """Train, evaluate and save the TF-IDF + linear SVM emotion classifier.

    Reads ``train.xlsx`` and ``abbreviations.json`` from the working
    directory, writes the balanced training data to ``processed_svm.xlsx``
    and stores the model, vectorizer, label map and evaluation report under
    ``./svm_emotion_model``.

    Raises:
        ValueError: If the dataset lacks the 'Sentence' or 'Emotion' column,
            or contains an emotion label that is not in the label map.
    """
    file_path = "train.xlsx"
    abbreviations_path = "abbreviations.json"
    output_path = "processed_svm.xlsx"
    model_output_dir = "./svm_emotion_model"

    abbreviations = load_abbreviations(abbreviations_path)
    df = pd.read_excel(file_path)
    if "Sentence" not in df.columns or "Emotion" not in df.columns:
        raise ValueError("Dataset phải chứa cột 'Sentence' và 'Emotion'!")

    # Rows without text would otherwise become the literal string "nan".
    df = df.dropna(subset=["Sentence"])

    # Preprocessing
    df["processed_sentence"] = df["Sentence"].apply(
        lambda x: preprocess_sentence(str(x), abbreviations, emoji_mapping)
    )
    # Drop rows that are empty after preprocessing; copy so that later column
    # assignments do not write into a view of the unfiltered frame.
    df = df[df["processed_sentence"].str.strip().astype(bool)].copy()

    print("Trước khi cân bằng:")
    print(df["Emotion"].value_counts())

    # Fixed label order used for the model's class ids.
    custom_id2label = {
        0: 'Anger',
        1: 'Disgust',
        2: 'Enjoyment',
        3: 'Fear',
        4: 'Other',
        5: 'Sadness',
        6: 'Surprise'
    }
    label2id = {label: idx for idx, label in custom_id2label.items()}
    id2label = dict(custom_id2label)
    label_ids = list(custom_id2label.keys())
    target_names = list(custom_id2label.values())

    df["label_id"] = df["Emotion"].map(label2id)
    if df["label_id"].isnull().any():
        missing = df[df["label_id"].isnull()]["Emotion"].unique()
        raise ValueError(f"Những nhãn cảm xúc sau không có trong label2id: {missing}")
    df["label_id"] = df["label_id"].astype(int)

    # Split BEFORE balancing. Oversampling first would place copies of the
    # same row in both train and test and inflate every test metric.
    # Stratification needs at least two samples per class; train_test_split
    # raises ValueError otherwise.
    train_df, test_df = train_test_split(
        df, test_size=0.2, random_state=42, stratify=df["label_id"]
    )

    # =========== Balance all classes (training split only) =============
    train_df = _oversample_to_largest_class(train_df, "Emotion", random_state=42)
    print("\nSau khi cân bằng tất cả lớp:")
    print(train_df["Emotion"].value_counts())
    # The saved file holds the balanced training data the model is fitted on.
    train_df.drop(columns=["label_id"]).to_excel(output_path, index=False)
    print(f"Train size = {len(train_df)}, Test size = {len(test_df)}")

    # Feature extraction with TF-IDF (fitted on the training split only)
    vectorizer = TfidfVectorizer(max_features=5000)
    X_train = vectorizer.fit_transform(train_df["processed_sentence"])
    X_test = vectorizer.transform(test_df["processed_sentence"])
    y_train = train_df["label_id"].values
    y_test = test_df["label_id"].values

    # Train the SVM model
    svm_classifier = SVC(kernel='linear', probability=True, random_state=42)
    print("\n========== Training SVM ==========")
    svm_classifier.fit(X_train, y_train)

    # Evaluate the model
    print("\n========== Evaluate on Test set ==========")
    y_pred = svm_classifier.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision_macro = precision_score(y_test, y_pred, average='macro', zero_division=0)
    precision_weighted = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall_macro = recall_score(y_test, y_pred, average='macro', zero_division=0)
    recall_weighted = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1_macro = f1_score(y_test, y_pred, average='macro', zero_division=0)
    f1_weighted = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    # Explicit labels keep rows/columns aligned with custom_id2label even if a
    # class never shows up in the test split or in the predictions.
    conf_matrix = confusion_matrix(y_test, y_pred, labels=label_ids)

    print(f"Test Accuracy: {accuracy:.4f}")
    print(f"Precision (Macro): {precision_macro:.4f}")
    print(f"Precision (Weighted): {precision_weighted:.4f}")
    print(f"Recall (Macro): {recall_macro:.4f}")
    print(f"Recall (Weighted): {recall_weighted:.4f}")
    print(f"F1-Score (Macro): {f1_macro:.4f}")
    print(f"F1-Score (Weighted): {f1_weighted:.4f}")

    print("\n========== Classification Report ==========")
    report = classification_report(
        y_test,
        y_pred,
        labels=label_ids,
        target_names=target_names,
        digits=4,
        zero_division=0,
    )
    print(report)

    # Save the report next to the model artifacts
    os.makedirs(model_output_dir, exist_ok=True)
    report_path = os.path.join(model_output_dir, "classification_report.txt")
    with open(report_path, "w", encoding="utf-8") as f:
        f.write("========== Classification Report ==========\n")
        f.write(report)
        f.write("\n========== Additional Metrics ==========\n")
        f.write(f"Accuracy: {accuracy:.4f}\n")
        f.write(f"Precision (Macro): {precision_macro:.4f}\n")
        f.write(f"Precision (Weighted): {precision_weighted:.4f}\n")
        f.write(f"Recall (Macro): {recall_macro:.4f}\n")
        f.write(f"Recall (Weighted): {recall_weighted:.4f}\n")
        f.write(f"F1-Score (Macro): {f1_macro:.4f}\n")
        f.write(f"F1-Score (Weighted): {f1_weighted:.4f}\n")
        f.write("\n========== Confusion Matrix ==========\n")
        f.write(np.array2string(conf_matrix))
    print("\n========== Classification Report saved to 'svm_emotion_model/classification_report.txt' ==========")

    # Save the model and the components needed for inference
    joblib.dump(svm_classifier, os.path.join(model_output_dir, "svm_classifier.joblib"))
    joblib.dump(vectorizer, os.path.join(model_output_dir, "tfidf_vectorizer.joblib"))
    # NOTE(review): despite the ".json" extension this file is a joblib pickle,
    # not JSON. The format is kept so existing joblib.load() callers keep
    # working; switch to json.dump only together with those loaders.
    joblib.dump(id2label, os.path.join(model_output_dir, "id2label.json"))
    print("\n========== Model and Vectorizer saved ==========")

    def predict_text(text):
        """Return the predicted emotion label for one raw sentence."""
        text_proc = preprocess_sentence(text, abbreviations, emoji_mapping)
        features = vectorizer.transform([text_proc])
        pred_id = svm_classifier.predict(features)[0]
        # predict() yields a NumPy integer; use a plain int as the dict key.
        return custom_id2label[int(pred_id)]

    # Predict a single example sentence
    custom_text = "Tôi rất vui khi sử dụng dịch vụ này!"
    emotion_pred = predict_text(custom_text)
    print("\nCâu ví dụ:", custom_text)
    print("Dự đoán cảm xúc:", emotion_pred)
    print("\nHoàn thành demo SVM với cân bằng dữ liệu & nhiều chỉ số đánh giá!")


if __name__ == "__main__":
    main()