File size: 11,287 Bytes

e09333c

# lstm_emotion_classifier.py
# -*- coding: utf-8 -*-

import re
import emoji
import json
import pandas as pd
import numpy as np
import tensorflow as tf
from underthesea import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.utils import resample
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import joblib
import os
import matplotlib.pyplot as plt
import seaborn as sns

########################
# TIỀN XỬ LÝ
########################

def replace_emojis(sentence, emoji_mapping):
    """Substitute mapped emoji with their tags and drop every other emoji.

    The sentence is processed one character at a time:

    * a character that is a key of ``emoji_mapping`` is replaced by the
      mapped string;
    * any other character for which ``emoji.is_emoji`` is true is removed;
    * everything else is copied unchanged.

    Because the scan is per character, an emoji made of several code
    points is examined piece by piece rather than as one unit.
    """
    def _convert(symbol):
        if symbol in emoji_mapping:
            return emoji_mapping[symbol]
        # Unmapped emoji contribute nothing to the output.
        return "" if emoji.is_emoji(symbol) else symbol

    return "".join(map(_convert, sentence))

# Each entry is stored as a tuple of lower-case words so that multi-word
# phrases such as "đù mé" can be matched against consecutive tokens.
_PROFANE_PHRASES = frozenset(
    tuple(phrase.split())
    for phrase in (
        "loz", "vloz", "vl", "dm", "đm", "clgt", "dmm", "cc", "vc", "đù mé", "vãi",
    )
)
_LONGEST_PROFANE_PHRASE = max(len(phrase) for phrase in _PROFANE_PHRASES)


def remove_profanity(sentence):
    """Remove blacklisted words and phrases from a sentence.

    The sentence is split on whitespace and compared case-insensitively
    against the blacklist. Multi-word entries are matched against runs of
    consecutive tokens (longest entry first); a word-by-word comparison
    could never match an entry that contains a space.

    Args:
        sentence: Text to filter.

    Returns:
        The remaining words joined by single spaces. Whitespace is
        therefore normalised as a side effect, and an empty or
        all-profanity input yields ``""``.
    """
    words = sentence.split()
    lowered = [word.lower() for word in words]
    kept = []
    position = 0
    while position < len(words):
        longest = min(_LONGEST_PROFANE_PHRASE, len(words) - position)
        # Try the longest candidate window first so a phrase wins over
        # any shorter entry starting at the same word.
        for size in range(longest, 0, -1):
            if tuple(lowered[position:position + size]) in _PROFANE_PHRASES:
                position += size
                break
        else:
            kept.append(words[position])
            position += 1
    return " ".join(kept)

_SPECIAL_CHARS_PATTERN = re.compile(r"[\^\*@#&$%<>~{}|\\]")


def remove_special_characters(sentence):
    """Delete every ^ * @ # & $ % < > ~ { } | and backslash from the text.

    All other characters, including square brackets and ordinary
    punctuation, are left in place.
    """
    return _SPECIAL_CHARS_PATTERN.sub("", sentence)

def normalize_whitespace(sentence):
    """Collapse each whitespace run to one space and trim both ends."""
    chunks = sentence.split()
    return " ".join(chunks)

_REPEATED_CHAR_PATTERN = re.compile(r"(.)\1{2,}")


def remove_repeated_characters(sentence):
    """Shrink any character repeated three or more times in a row to one.

    Runs of exactly two identical characters are left untouched.
    """
    return _REPEATED_CHAR_PATTERN.sub(r"\1", sentence)

_DIGIT_RUN_PATTERN = re.compile(r"\d+")


def replace_numbers(sentence):
    """Replace every run of consecutive digits with the token ``[number]``."""
    return _DIGIT_RUN_PATTERN.sub("[number]", sentence)

def tokenize_underthesea(sentence):
    """Run underthesea's ``word_tokenize`` and re-join the tokens with spaces.

    NOTE(review): if ``word_tokenize`` returns multi-syllable words as
    single tokens containing a space, joining with spaces makes those word
    boundaries indistinguishable from ordinary spaces. Confirm whether
    ``word_tokenize(sentence, format="text")`` was intended here.
    """
    return " ".join(word_tokenize(sentence))

def preprocess_sentence(sentence, abbreviations, emoji_mapping):
    """Apply the full text-cleaning pipeline to one sentence.

    Steps, in order: lower-case, emoji replacement, profanity removal,
    special-character removal, whitespace normalisation, abbreviation
    expansion, repeated-character squeezing, number masking and finally
    Vietnamese word tokenisation.

    Args:
        sentence: Raw input text.
        abbreviations: Mapping from an abbreviated word to its expansion.
            The expansion is joined with spaces, so it is presumably a
            list of words; a plain string value would come out with a
            space between every character -- TODO confirm against the
            abbreviations file.
        emoji_mapping: Mapping from emoji characters to replacement tags.

    Returns:
        The cleaned, tokenised sentence as a single string.
    """
    cleaned = replace_emojis(sentence.lower(), emoji_mapping)
    cleaned = normalize_whitespace(
        remove_special_characters(remove_profanity(cleaned))
    )

    # Expand abbreviations word by word; unknown words pass through as-is.
    expanded = " ".join(
        " ".join(abbreviations[token]) if token in abbreviations else token
        for token in cleaned.split()
    )

    masked = replace_numbers(remove_repeated_characters(expanded))
    # Vietnamese word segmentation is the last step.
    return tokenize_underthesea(masked)

# Lookup table from a single emoji character to the bracketed emotion tag
# that replace_emojis() inserts in its place. Emoji that are not listed here
# are removed from the text by replace_emojis() instead of being tagged.
# NOTE(review): both "[confused]" and "[confusion]" are used as tags below;
# confirm whether they are meant to be two distinct categories.
emoji_mapping = {
    "😀": "[joy]", "😃": "[joy]", "😄": "[joy]", "😁": "[joy]", "😆": "[joy]", "😅": "[joy]", "😂": "[joy]", "🤣": "[joy]",
    "🙂": "[love]", "🙃": "[love]", "😉": "[love]", "😊": "[love]", "😇": "[love]", "🥰": "[love]", "😍": "[love]",
    "🤩": "[love]", "😘": "[love]", "😗": "[love]", "☺": "[love]", "😚": "[love]", "😙": "[love]",
    "😋": "[satisfaction]", "😛": "[satisfaction]", "😜": "[satisfaction]", "🤪": "[satisfaction]", "😝": "[satisfaction]",
    "🤑": "[satisfaction]",
    "🤐": "[neutral]", "🤨": "[neutral]", "😐": "[neutral]", "😑": "[neutral]", "😶": "[neutral]",
    "😏": "[sarcasm]",
    "😒": "[disappointment]", "🙄": "[disappointment]", "😬": "[disappointment]",
    "😔": "[sadness]", "😪": "[sadness]", "😢": "[sadness]", "😭": "[sadness]", "😥": "[sadness]", "😓": "[sadness]",
    "😩": "[tiredness]", "😫": "[tiredness]", "🥱": "[tiredness]",
    "🤤": "[discomfort]", "🤢": "[discomfort]", "🤮": "[discomfort]", "🤧": "[discomfort]", "🥵": "[discomfort]",
    "🥶": "[discomfort]", "🥴": "[discomfort]", "😵": "[discomfort]", "🤯": "[discomfort]",
    "😕": "[confused]", "😟": "[confused]", "🙁": "[confused]", "☹": "[confused]",
    "😮": "[surprise]", "😯": "[surprise]", "😲": "[surprise]", "😳": "[surprise]", "🥺": "[pleading]",
    "😦": "[fear]", "😧": "[fear]", "😨": "[fear]", "😰": "[fear]", "😱": "[fear]",
    "😖": "[confusion]", "😣": "[confusion]", "😞": "[confusion]",
    "😤": "[anger]", "😡": "[anger]", "😠": "[anger]", "🤬": "[anger]", "😈": "[mischievous]", "👿": "[mischievous]"
}

def load_abbreviations(path):
    """Read the UTF-8 encoded JSON file at ``path`` and return its content."""
    with open(path, mode="r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

###################################
# MAIN
###################################
def _oversample_to_majority(frame, label_col="Emotion", random_state=42):
    """Return a shuffled copy of ``frame`` with every class as large as the biggest.

    Classes smaller than the largest one are resampled with replacement up
    to its size; the largest class is kept unchanged. The result is
    shuffled and re-indexed from 0.
    """
    target_count = frame[label_col].value_counts().max()
    parts = []
    for label in frame[label_col].unique():
        subset = frame[frame[label_col] == label]
        if len(subset) < target_count:
            subset = resample(
                subset,
                replace=True,
                n_samples=target_count,
                random_state=random_state,
            )
        parts.append(subset)
    balanced = pd.concat(parts, axis=0)
    return balanced.sample(frac=1, random_state=random_state).reset_index(drop=True)


if __name__ == "__main__":
    file_path = "train.xlsx"
    abbreviations_path = "abbreviations.json"
    output_path = "processed_phobert.xlsx"
    model_output_dir = "lstm_emotion_model"
    vocab_size = 5000   # shared by the Tokenizer and the Embedding layer
    max_length = 256
    val_size = 0.1      # share of the training split held out for early stopping

    # Create the artifact directory first: the confusion-matrix plot and
    # the report are written into it before the model itself is saved.
    os.makedirs(model_output_dir, exist_ok=True)

    abbreviations = load_abbreviations(abbreviations_path)

    df = pd.read_excel(file_path)
    if "Sentence" not in df.columns or "Emotion" not in df.columns:
        raise ValueError("Dataset phải chứa cột 'Sentence' và 'Emotion'!")

    # Drop rows with missing cells; str(NaN) would otherwise turn into the
    # literal training sentence "nan".
    df = df.dropna(subset=["Sentence", "Emotion"]).copy()
    if df.empty:
        raise ValueError("Dataset không có dòng hợp lệ nào!")

    # Preprocessing
    df["processed_sentence"] = df["Sentence"].apply(
        lambda x: preprocess_sentence(str(x), abbreviations, emoji_mapping)
    )
    # Discard sentences that became empty after cleaning.
    df = df[df["processed_sentence"].str.strip().astype(bool)].copy()
    if df.empty:
        raise ValueError("Dataset không có dòng hợp lệ nào!")

    # Fixed label <-> id mapping.
    custom_id2label = {
        0: 'Anger',
        1: 'Disgust',
        2: 'Enjoyment',
        3: 'Fear',
        4: 'Other',
        5: 'Sadness',
        6: 'Surprise'
    }
    label2id = {label: idx for idx, label in custom_id2label.items()}
    id2label = dict(custom_id2label)
    class_ids = list(custom_id2label)
    class_names = list(custom_id2label.values())
    num_classes = len(custom_id2label)

    # Fail early on labels outside the mapping instead of letting them turn
    # into NaN ids further down.
    unknown_labels = df.loc[~df["Emotion"].isin(list(label2id)), "Emotion"].unique()
    if len(unknown_labels) > 0:
        raise ValueError(
            f"Cột 'Emotion' chứa nhãn không hợp lệ: {sorted(map(str, unknown_labels))}"
        )

    print("Trước khi cân bằng:")
    print(df["Emotion"].value_counts())

    # Export the class-balanced, preprocessed dataset exactly as before.
    # It is NOT used for training below: oversampling before the split
    # would put copies of the same row in both train and test.
    balanced_export = _oversample_to_majority(df)
    print("\nSau khi cân bằng tất cả lớp:")
    print(balanced_export["Emotion"].value_counts())
    balanced_export.to_excel(output_path, index=False)

    df["label_id"] = df["Emotion"].map(label2id)

    # Split first, on the original (unbalanced) rows, so that test and
    # validation data never contain duplicates of training rows.
    # Stratification requires at least two rows per class in each split.
    train_val_df, test_df = train_test_split(
        df, test_size=0.2, random_state=42, stratify=df["label_id"]
    )
    train_df, val_df = train_test_split(
        train_val_df, test_size=val_size, random_state=42,
        stratify=train_val_df["label_id"]
    )
    # Balance the training portion only.
    train_df = _oversample_to_majority(train_df)
    print(f"Train size = {len(train_df)}, Test size = {len(test_df)}")

    # Feature extraction: integer sequences padded/truncated to max_length.
    tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
    tokenizer.fit_on_texts(train_df["processed_sentence"])

    def _vectorize(texts):
        sequences = tokenizer.texts_to_sequences(texts)
        return pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

    X_train = _vectorize(train_df["processed_sentence"])
    X_val = _vectorize(val_df["processed_sentence"])
    X_test = _vectorize(test_df["processed_sentence"])

    # One-hot encode the labels.
    y_train = tf.keras.utils.to_categorical(train_df["label_id"].values, num_classes=num_classes)
    y_val = tf.keras.utils.to_categorical(val_df["label_id"].values, num_classes=num_classes)
    y_test = tf.keras.utils.to_categorical(test_df["label_id"].values, num_classes=num_classes)

    # Build the LSTM model
    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=128, input_length=max_length),
        LSTM(128, dropout=0.2, recurrent_dropout=0.2),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    model.summary()

    # Train. Early stopping watches the validation split, not the test set,
    # so the reported test score is not used for model selection.
    early_stop = EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)

    history = model.fit(
        X_train, y_train,
        epochs=10,
        batch_size=32,
        validation_data=(X_val, y_val),
        callbacks=[early_stop],
        verbose=1
    )

    # Evaluate
    print("\n========== Evaluate on Test set ==========")
    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
    print(f"Test Accuracy: {accuracy:.4f}")

    # Predict on the test set
    y_pred_probs = model.predict(X_test)
    y_pred = np.argmax(y_pred_probs, axis=1)
    y_true = np.argmax(y_test, axis=1)

    # Passing labels= keeps rows/columns aligned with class_names even when
    # a class is absent from the test split or never predicted.
    print("\nClassification Report:")
    report = classification_report(
        y_true, y_pred, labels=class_ids, target_names=class_names
    )
    print(report)

    conf_matrix = confusion_matrix(y_true, y_pred, labels=class_ids)
    print("\nConfusion Matrix:")
    print(conf_matrix)

    # Plot the confusion matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names)
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.title('Confusion Matrix')
    plt.tight_layout()
    plt.savefig(os.path.join(model_output_dir, "confusion_matrix.png"))
    plt.close()
    print("\nConfusion Matrix plot saved to 'lstm_emotion_model/confusion_matrix.png'")

    # Write the classification report and confusion matrix to a text file
    report_path = os.path.join(model_output_dir, "classification_report.txt")
    with open(report_path, "w", encoding="utf-8") as f:
        f.write("========== Classification Report ==========\n")
        f.write(report)
        f.write("\n========== Confusion Matrix ==========\n")
        f.write(np.array2string(conf_matrix))

    print(f"\nClassification Report saved to '{report_path}'")

    # Save the model, the tokenizer and the id -> label mapping
    model.save(os.path.join(model_output_dir, "lstm_emotion_model.h5"))
    joblib.dump(tokenizer, os.path.join(model_output_dir, "tokenizer.joblib"))
    with open(os.path.join(model_output_dir, "id2label.json"), "w", encoding="utf-8") as f:
        json.dump(id2label, f, ensure_ascii=False, indent=4)

    print("\n========== Model and Tokenizer saved ==========")

    # Single-sentence prediction (example)
    def predict_text(text):
        """Return the predicted emotion label for one raw sentence."""
        text_proc = preprocess_sentence(text, abbreviations, emoji_mapping)
        padded = _vectorize([text_proc])
        pred_prob = model.predict(padded)
        pred_id = int(np.argmax(pred_prob, axis=1)[0])
        return custom_id2label[pred_id]

    custom_text = "Tôi rất vui khi sử dụng dịch vụ này!"
    emotion_pred = predict_text(custom_text)
    print("\nCâu ví dụ:", custom_text)
    print("Dự đoán cảm xúc:", emotion_pred)

    print("\nHoàn thành demo LSTM với cân bằng dữ liệu & nhiều epoch hơn!")