# File size: 22,716 Bytes

# e09333c

# thesis.py
# -*- coding: utf-8 -*-

import pandas as pd
import emoji
import json
import re
import numpy as np
from underthesea import word_tokenize
from tqdm import tqdm
import torch
from torchtext.vocab import Vectors
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix
)
from tensorflow.keras.preprocessing.sequence import pad_sequences
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
import tensorflow as tf
import os

# ========== CÁC HÀM TIỀN XỬ LÝ ==========

def preprocess_sentence(sentence, abbreviations, emoji_mapping):
    """
    Run the full normalisation pipeline on a single sentence.

    Steps, in order: lowercase, emoji replacement, profanity removal,
    special-character removal, whitespace normalisation, abbreviation
    expansion, repeated-character collapsing, number masking, tokenisation.

    Args:
        sentence: Raw input text.
        abbreviations: Mapping forwarded to ``replace_abbreviations``.
        emoji_mapping: Mapping forwarded to ``replace_emojis``.

    Returns:
        The processed sentence, as returned by ``tokenize_sentence``.
    """
    text = replace_emojis(sentence.lower(), emoji_mapping)

    # Cleaning steps that only need the text itself.
    for step in (remove_profanity, remove_special_characters, normalize_whitespace):
        text = step(text)

    text = replace_abbreviations(text, abbreviations)

    # Remaining single-argument steps; tokenisation must stay last.
    for step in (remove_repeated_characters, replace_numbers, tokenize_sentence):
        text = step(text)

    return text

def replace_emojis(sentence, emoji_mapping):
    """
    Replace mapped emoji characters with their tokens and drop unmapped emojis.

    The sentence is scanned one character at a time:
    - a character that is a key of ``emoji_mapping`` is replaced by its value;
    - any other character for which ``emoji.is_emoji`` is true is removed;
    - everything else is kept unchanged.

    Args:
        sentence: Input text.
        emoji_mapping: Mapping from a single character to its replacement.

    Returns:
        The rewritten sentence.
    """
    def _render(symbol):
        if symbol in emoji_mapping:
            return emoji_mapping[symbol]
        # Unmapped emojis contribute nothing to the output.
        return "" if emoji.is_emoji(symbol) else symbol

    return "".join(_render(symbol) for symbol in sentence)

# Single-token profanities, matched against whole whitespace-separated words.
_PROFANE_WORDS = frozenset(
    ["loz", "vloz", "vl", "dm", "đm", "clgt", "dmm", "cc", "vc", "vãi"]
)

# Multi-word profanity. It cannot be matched token by token, so it is removed
# as a whole phrase; the lookarounds keep it from matching inside longer words.
_PROFANE_PHRASE_PATTERN = re.compile(r"(?<!\S)đù\s+mé(?!\S)", re.IGNORECASE)


def remove_profanity(sentence):
    """
    Remove profane words and phrases from a sentence.

    Matching is case-insensitive and only applies to whole
    whitespace-delimited words (so "vlog" is kept although "vl" is removed).
    Whitespace in the result is collapsed to single spaces.

    Args:
        sentence: Input text.

    Returns:
        The sentence without the listed profanities.
    """
    # Phrase first: after splitting, "đù mé" would be two harmless-looking tokens.
    without_phrases = _PROFANE_PHRASE_PATTERN.sub(" ", sentence)
    kept = [
        word for word in without_phrases.split()
        if word.lower() not in _PROFANE_WORDS
    ]
    return " ".join(kept)

def remove_special_characters(sentence):
    """
    Delete the characters ^ * @ # & $ % < > ~ { } | and backslash.

    All other characters, including punctuation such as "!" or ".", are kept.
    """
    special_chars = re.compile(r"[\^\*@#&$%<>~{}|\\]")
    return special_chars.sub("", sentence)

def normalize_whitespace(sentence):
    """Collapse every run of whitespace to one space and trim both ends."""
    chunks = sentence.split()
    return " ".join(chunks)

def replace_abbreviations(sentence, abbreviations):
    """
    Expand abbreviations word by word.

    Args:
        sentence: Input text; it is split on whitespace.
        abbreviations: Mapping from an abbreviation to its expansion. The
            expansion may be a list of words (joined with spaces) or a plain
            string (used as is).

    Returns:
        The sentence with every known abbreviation expanded, words joined by
        single spaces.
    """
    expanded = []
    for word in sentence.split():
        if word not in abbreviations:
            expanded.append(word)
            continue

        expansion = abbreviations[word]
        # Joining a plain string would put a space between every character,
        # so only sequences of words are joined.
        if isinstance(expansion, str):
            expanded.append(expansion)
        else:
            expanded.append(" ".join(expansion))

    return " ".join(expanded)

def remove_repeated_characters(sentence):
    """
    Collapse any character repeated three or more times in a row to one copy.

    Example: "vuiiiii" -> "vui". Runs of exactly two ("good") are untouched.
    Characters are compared exactly, so an accented letter followed by its
    unaccented form does not count as a repetition.
    """
    long_run = re.compile(r"(.)\1{2,}")
    return long_run.sub(lambda match: match.group(1), sentence)

def replace_numbers(sentence):
    """Mask every run of digits with the literal token ``[number]``."""
    digit_run = re.compile(r"\d+")
    return digit_run.sub("[number]", sentence)

def tokenize_sentence(sentence):
    """Tokenise with underthesea's ``word_tokenize`` and re-join the tokens with single spaces."""
    tokens = word_tokenize(sentence)
    return " ".join(tokens)

# ========== VOCABULARY CLASS ==========

class Vocabulary:
    """
    Two-way mapping between tokens and integer ids.

    Id 0 is reserved for ``<pad>`` and id 1 for ``<unk>``; tokens added later
    receive consecutive ids in insertion order.
    """

    def __init__(self):
        self.word2id = {'<pad>': 0, '<unk>': 1}
        self.unk_id = self.word2id['<unk>']
        # Reverse table, kept in sync by add().
        self.id2word = {idx: token for token, idx in self.word2id.items()}

    def __getitem__(self, word):
        """Return the id of ``word``, or the ``<unk>`` id for unknown words."""
        if word in self.word2id:
            return self.word2id[word]
        return self.unk_id

    def __contains__(self, word):
        """Tell whether ``word`` has its own id."""
        return word in self.word2id

    def __len__(self):
        """Number of known tokens, including ``<pad>`` and ``<unk>``."""
        return len(self.word2id)

    def lookup_tokens(self, indices):
        """Map ids back to tokens; an unknown id raises ``KeyError``."""
        tokens = []
        for idx in indices:
            tokens.append(self.id2word[idx])
        return tokens

    def add(self, word):
        """Register ``word`` under the next free id; known words are ignored."""
        if word in self.word2id:
            return
        new_id = len(self.word2id)
        self.word2id[word] = new_id
        self.id2word[new_id] = word

    @staticmethod
    def tokenize_corpus(corpus):
        """
        Tokenise every document with ``word_tokenize``.

        Spaces inside a token are replaced by "_" so that each token is a
        single space-free string. Progress is shown with tqdm.
        """
        def _tokens(doc):
            return [piece.replace(" ", "_") for piece in word_tokenize(doc)]

        return [_tokens(doc) for doc in tqdm(corpus, desc="Tokenizing Corpus")]

    def corpus_to_tensor(self, corpus, is_tokenized=False):
        """
        Convert a corpus into lists of token ids.

        corpus: list of sentences (strings), or list of token lists when
            ``is_tokenized`` is True.
        return: list[list[int]], one list of ids per document.
        """
        if is_tokenized:
            documents = corpus
        else:
            documents = self.tokenize_corpus(corpus)

        encoded = []
        for doc in documents:
            encoded.append([self[token] for token in doc])
        return encoded

# ========== EMOJI MAPPING ==========

# Lookup table from a single emoji character to a bracketed emotion token
# such as "[joy]" or "[anger]". It is passed to preprocess_sentence(), which
# forwards it to replace_emojis().
# NOTE(review): "[confused]" and "[confusion]" are two distinct tokens here —
# confirm whether they are meant to be separate categories.
emoji_mapping = {
    "😀": "[joy]", "😃": "[joy]", "😄": "[joy]", "😁": "[joy]", "😆": "[joy]", "😅": "[joy]", "😂": "[joy]", "🤣": "[joy]",
    "🙂": "[love]", "🙃": "[love]", "😉": "[love]", "😊": "[love]", "😇": "[love]", "🥰": "[love]", "😍": "[love]",
    "🤩": "[love]", "😘": "[love]", "😗": "[love]", "☺": "[love]", "😚": "[love]", "😙": "[love]",
    "😋": "[satisfaction]", "😛": "[satisfaction]", "😜": "[satisfaction]", "🤪": "[satisfaction]", "😝": "[satisfaction]",
    "🤑": "[satisfaction]",
    "🤐": "[neutral]", "🤨": "[neutral]", "😐": "[neutral]", "😑": "[neutral]", "😶": "[neutral]",
    "😏": "[sarcasm]",
    "😒": "[disappointment]", "🙄": "[disappointment]", "😬": "[disappointment]",
    "😔": "[sadness]", "😪": "[sadness]", "😢": "[sadness]", "😭": "[sadness]", "😥": "[sadness]", "😓": "[sadness]",
    "😩": "[tiredness]", "😫": "[tiredness]", "🥱": "[tiredness]",
    "🤤": "[discomfort]", "🤢": "[discomfort]", "🤮": "[discomfort]", "🤧": "[discomfort]", "🥵": "[discomfort]",
    "🥶": "[discomfort]", "🥴": "[discomfort]", "😵": "[discomfort]", "🤯": "[discomfort]",
    "😕": "[confused]", "😟": "[confused]", "🙁": "[confused]", "☹": "[confused]",
    "😮": "[surprise]", "😯": "[surprise]", "😲": "[surprise]", "😳": "[surprise]", "🥺": "[pleading]",
    "😦": "[fear]", "😧": "[fear]", "😨": "[fear]", "😰": "[fear]", "😱": "[fear]",
    "😖": "[confusion]", "😣": "[confusion]", "😞": "[confusion]",
    "😤": "[anger]", "😡": "[anger]", "😠": "[anger]", "🤬": "[anger]", "😈": "[mischievous]", "👿": "[mischievous]"
}

# ========== DATA MANAGER ==========

class DataManager:
    """
    Data pipeline for the emotion classifier.

    Reads the Excel dataset, preprocesses the sentences, builds the
    vocabulary and the pretrained embedding matrix, and produces the
    train/validation/test splits for Keras or PyTorch.
    """

    def __init__(self, file_path, abbreviations_path, word2vec_path):
        """
        Args:
            file_path: Path of the Excel dataset read by ``preprocess_data``.
            abbreviations_path: Path of the JSON abbreviation dictionary; it
                is loaded immediately.
            word2vec_path: Path of the pretrained vectors file handed to
                ``torchtext.vocab.Vectors``.
        """
        self.file_path = file_path
        self.abbreviations_path = abbreviations_path
        self.word2vec_path = word2vec_path
        self.vocabulary = None       # set by preprocess_data()
        self.word_embeddings = None  # set by load_word2vec()
        self.abbreviations = None
        self.load_abbreviations()

    def load_abbreviations(self):
        """Load the abbreviation dictionary from ``abbreviations_path`` (UTF-8 JSON)."""
        with open(self.abbreviations_path, "r", encoding="utf-8") as f:
            self.abbreviations = json.load(f)

    def load_word2vec(self):
        """
        Load the pretrained vectors from the word2vec file with
        ``torchtext.vocab.Vectors``.
        """
        self.word_embeddings = Vectors(
            name=self.word2vec_path,
            unk_init=torch.Tensor.normal_
        )

    def create_vocab_from_corpus(self, corpus, max_vocab_size=30000):
        """
        Build a vocabulary from the ``max_vocab_size`` most frequent tokens.

        Tokens are produced with ``Vocabulary.tokenize_corpus``, the same
        tokeniser ``Vocabulary.corpus_to_tensor`` applies when sentences are
        encoded. Counting plain whitespace-separated pieces instead would
        leave every multi-syllable token (joined with "_" by the tokeniser)
        out of the vocabulary, so it would always be encoded as ``<unk>``.

        Args:
            corpus: List of sentences (strings).
            max_vocab_size: Maximum number of tokens added on top of the
                reserved ``<pad>`` / ``<unk>`` entries.

        Returns:
            The populated ``Vocabulary``.
        """
        from collections import Counter

        counter = Counter()
        for tokens in Vocabulary.tokenize_corpus(corpus):
            counter.update(tokens)

        vocab = Vocabulary()
        for word, _freq in counter.most_common(max_vocab_size):
            vocab.add(word)

        return vocab

    def preprocess_data(self):
        """
        Read the dataset, preprocess every sentence, and prepare the
        vocabulary and pretrained vectors.

        Returns:
            DataFrame with an added ``processed_sentence`` column; rows whose
            processed sentence is empty are dropped.

        Raises:
            ValueError: If the dataset has no ``Sentence`` column.
        """
        df = pd.read_excel(self.file_path)
        if "Sentence" not in df.columns:
            raise ValueError("Cột 'Sentence' không tồn tại trong dataset!")

        # Preprocess each sentence.
        df["processed_sentence"] = df["Sentence"].apply(
            lambda x: preprocess_sentence(str(x), self.abbreviations, emoji_mapping)
        )

        # Drop empty rows; copy so callers get an independent frame they can
        # modify without pandas chained-assignment warnings.
        df = df[df["processed_sentence"].str.strip().astype(bool)].copy()

        # Build the vocabulary from the data itself.
        all_sentences = df["processed_sentence"].tolist()
        self.vocabulary = self.create_vocab_from_corpus(all_sentences, max_vocab_size=30000)

        # Load word2vec.
        self.load_word2vec()

        return df

    def build_pretrained_embedding_matrix(self, embedding_dim=100):
        """
        Build the (vocab_size x embedding_dim) float32 weight matrix.

        Every row starts as small random noise (normal, scale 0.1); rows of
        words that exist in the pretrained vectors are overwritten with the
        pretrained vector. Requires ``preprocess_data`` to have been called.
        """
        vocab_size = len(self.vocabulary)
        weight_matrix = np.random.normal(
            scale=0.1, size=(vocab_size, embedding_dim)
        ).astype(np.float32)

        # Copy the pretrained vectors.
        stoi = self.word_embeddings.stoi
        vectors = self.word_embeddings.vectors
        for word, idx in self.vocabulary.word2id.items():
            if word in stoi:
                weight_matrix[idx] = vectors[stoi[word]]

        return weight_matrix

    @staticmethod
    def _encode_labels(df, label_column):
        """Return (integer labels, label->id mapping) without modifying ``df``."""
        labels = df[label_column]
        n_missing = int(labels.isnull().sum())
        if n_missing:
            raise ValueError(
                f"Cột '{label_column}' chứa {n_missing} nhãn bị thiếu (NaN); "
                "hãy loại bỏ hoặc gán nhãn trước khi chia dữ liệu."
            )

        # Ids follow the order of first appearance in the column.
        label_mapping = {label: idx for idx, label in enumerate(labels.unique())}
        encoded = labels.map(label_mapping).tolist()
        return encoded, label_mapping

    def _encode_and_pad(self, sentences, maxlen):
        """Convert sentences to id sequences and pad/truncate them (post) to ``maxlen``."""
        ids = self.vocabulary.corpus_to_tensor(sentences, is_tokenized=False)
        return pad_sequences(ids, maxlen=maxlen, padding='post', truncating='post')

    def split_and_convert(
        self, df, label_column="Emotion", maxlen=400, test_size=0.2,
        for_keras=False, batch_size=32
    ):
        """
        Split the data (stratified) and convert it to model inputs.

        - for_keras=False -> train/validation/test split; returns
          ``train_loader, val_loader, test_loader, label_mapping`` (PyTorch).
          The validation set is 10% of the training portion.
        - for_keras=True  -> train/test split; returns
          ``X_train, X_test, y_train_onehot, y_test_onehot, label_mapping``.

        ``df`` is not modified.

        Raises:
            ValueError: If ``label_column`` is missing or contains NaN labels.
        """
        if label_column not in df.columns:
            raise ValueError(
                f"Cột '{label_column}' không tồn tại. Hiện có: {df.columns.tolist()}"
            )

        # Label -> id mapping, computed on a derived series so the caller's
        # DataFrame keeps its original labels.
        y, label_mapping = self._encode_labels(df, label_column)
        X = df["processed_sentence"].tolist()

        # Stratify to preserve the class distribution.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42, stratify=y
        )

        X_val, y_val = None, None
        if not for_keras:
            # Carve a validation set out of the training portion.
            X_train, X_val, y_train, y_val = train_test_split(
                X_train, y_train, test_size=0.1, random_state=42, stratify=y_train
            )

        # Text -> ids -> fixed-length arrays (each split is tokenised once).
        X_train_padded = self._encode_and_pad(X_train, maxlen)
        X_test_padded = self._encode_and_pad(X_test, maxlen)
        X_val_padded = None if for_keras else self._encode_and_pad(X_val, maxlen)

        print(">>> Debug Split and Convert:")
        print("X_train_padded.shape:", X_train_padded.shape)
        print("X_val_padded.shape: ", X_val_padded.shape if not for_keras else "N/A")
        print("X_test_padded.shape: ", X_test_padded.shape)
        print("y_train length:", len(y_train))
        print("y_val length: ", len(y_val) if not for_keras else "N/A")
        print("y_test length: ", len(y_test))
        print("vocab_size:", len(self.vocabulary))

        if for_keras:
            num_classes = len(label_mapping)
            y_train_onehot = tf.keras.utils.to_categorical(
                y_train,
                num_classes=num_classes
            )
            y_test_onehot = tf.keras.utils.to_categorical(
                y_test,
                num_classes=num_classes
            )

            print("y_train_onehot.shape:", y_train_onehot.shape)
            print("y_test_onehot.shape: ", y_test_onehot.shape)

            return X_train_padded, X_test_padded, y_train_onehot, y_test_onehot, label_mapping

        X_train_t = torch.tensor(X_train_padded, dtype=torch.long)
        X_val_t = torch.tensor(X_val_padded, dtype=torch.long)
        X_test_t = torch.tensor(X_test_padded, dtype=torch.long)
        y_train_t = torch.tensor(y_train, dtype=torch.long)
        y_val_t = torch.tensor(y_val, dtype=torch.long)
        y_test_t = torch.tensor(y_test, dtype=torch.long)

        train_ds = TensorDataset(X_train_t, y_train_t)
        val_ds = TensorDataset(X_val_t, y_val_t)
        test_ds = TensorDataset(X_test_t, y_test_t)

        # Only the training loader is shuffled.
        train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)
        test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

        return train_loader, val_loader, test_loader, label_mapping

# ========== MÔ HÌNH KERAS BI-LSTM ==========

def predict_emotion_bilstm(model, text, data_manager, label_mapping, maxlen=400):
    """
    Predict the emotion label of one raw sentence with the Keras BiLSTM.

    Args:
        model: Trained Keras model; ``model.predict`` is called on a single
            padded id sequence.
        text: Raw input sentence.
        data_manager: Object providing ``abbreviations`` and ``vocabulary``
            (a ``DataManager`` after ``preprocess_data``).
        label_mapping: Mapping label -> class id used at training time.
        maxlen: Padding/truncation length; must equal the length the model
            was trained with (default 400).

    Returns:
        The label whose class id has the highest predicted score.
    """
    processed_text = preprocess_sentence(text, data_manager.abbreviations, emoji_mapping)

    # corpus_to_tensor tokenises internally when is_tokenized is False.
    text_ids = data_manager.vocabulary.corpus_to_tensor([processed_text], is_tokenized=False)
    text_padded = pad_sequences(text_ids, maxlen=maxlen, padding='post', truncating='post')

    scores = model.predict(text_padded)
    predicted_id = int(scores.argmax(axis=1)[0])

    id_to_label = {idx: label for label, idx in label_mapping.items()}
    return id_to_label[predicted_id]

# ========== MAIN ==========

if __name__ == "__main__":
    # Script entry point: preprocess the data, oversample the 'Other' class,
    # train a Keras BiLSTM, evaluate it on the held-out test set, and save
    # the report, model, label mapping and vocabulary.
    from keras.models import Model
    from keras.layers import (
        Input, Embedding, Dense, Dropout, Bidirectional, LSTM
    )
    from keras.optimizers import Adam
    from keras.callbacks import ModelCheckpoint, EarlyStopping

    # -------- PATHS AND SETTINGS ----------
    file_path = "train.xlsx"
    abbreviations_path = "abbreviations.json"
    word2vec_path = "word2vec_vi_syllables_100dims.txt"
    output_path = "processed.xlsx"
    maxlen = 400  # sequence length; predict_emotion_bilstm pads to 400 as well
    other_target_size = 3000  # demo oversampling target for the 'Other' class

    # Create the DataManager.
    data_manager = DataManager(
        file_path=file_path,
        abbreviations_path=abbreviations_path,
        word2vec_path=word2vec_path
    )

    # 1) Preprocess, build the vocabulary, load word2vec.
    df = data_manager.preprocess_data()
    print("Trước khi cân bằng lớp (undersampling/oversampling):")
    print(df["Emotion"].value_counts())

    # 2) Class balancing (demo: oversample 'Other' up to 3000 rows); adjust
    # as needed. Rows whose label is not one of these seven classes are left
    # out of the balanced frame.
    # NOTE(review): oversampling happens before the train/test split, so
    # duplicated 'Other' rows can land in both sets and inflate the test
    # scores. Oversampling only the training portion would avoid that.
    emotion_classes = [
        "Enjoyment", "Other", "Anger", "Sadness", "Disgust", "Fear", "Surprise"
    ]
    class_frames = []
    for emotion in emotion_classes:
        class_df = df[df["Emotion"] == emotion]
        # An empty class cannot be resampled, so it is left as is.
        if emotion == "Other" and 0 < len(class_df) < other_target_size:
            class_df = resample(
                class_df,
                replace=True,
                n_samples=other_target_size,
                random_state=42
            )
        class_frames.append(class_df)

    df_balanced = pd.concat(class_frames, axis=0)
    df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
    df = df_balanced

    print("\nSau khi cân bằng lớp (demo oversample):")
    print(df["Emotion"].value_counts())

    # Export the processed data.
    df.to_excel(output_path, index=False)

    # ========== TRAIN THE KERAS BI-LSTM ==========

    print("\n========== Training Keras BiLSTM ==========")

    # Pretrained embedding matrix for the Keras Embedding layer.
    pretrained_matrix = data_manager.build_pretrained_embedding_matrix(embedding_dim=100)

    # Split data for Keras.
    X_train, X_test, y_train, y_test, label_mapping = data_manager.split_and_convert(
        df, label_column="Emotion", maxlen=maxlen,
        test_size=0.2, for_keras=True
    )

    num_classes = len(label_mapping)
    input_dim = len(data_manager.vocabulary)
    embedding_dim = pretrained_matrix.shape[1]

    def create_bilstm_model():
        """Build and compile the BiLSTM classifier on top of the pretrained embeddings."""
        input_layer = Input(shape=(maxlen,), dtype='int32', name='main_input')
        emb_layer = Embedding(
            input_dim=input_dim,
            output_dim=embedding_dim,
            weights=[pretrained_matrix],
            input_length=maxlen,
            trainable=True  # set to False to keep the embeddings frozen
        )(input_layer)

        bilstm = Bidirectional(LSTM(128, dropout=0.5, recurrent_dropout=0.5))(emb_layer)
        dense1 = Dense(64, activation='relu')(bilstm)
        dropout1 = Dropout(0.5)(dense1)
        dense2 = Dense(32, activation='relu')(dropout1)
        dropout2 = Dropout(0.5)(dense2)
        output_layer = Dense(num_classes, activation='softmax')(dropout2)

        model = Model(inputs=input_layer, outputs=output_layer)
        model.compile(
            loss='categorical_crossentropy',
            # `learning_rate` is the supported argument name; `lr` is a
            # deprecated alias that newer Keras versions reject.
            optimizer=Adam(learning_rate=1e-3),
            metrics=['accuracy']
        )
        return model

    # Create the model.
    model_bilstm = create_bilstm_model()
    model_bilstm.summary()

    # Callbacks.
    checkpoint = ModelCheckpoint(
        'bilstm_best.keras',
        save_best_only=True,
        monitor='val_accuracy',
        mode='max'
    )
    early_stopping = EarlyStopping(
        monitor='val_accuracy',
        patience=5,
        restore_best_weights=True
    )

    # Train. Validation data is held out from the training set so that early
    # stopping and checkpoint selection never see the test set; otherwise the
    # reported test metrics would be optimistically biased.
    history = model_bilstm.fit(
        X_train, y_train,
        validation_split=0.1,
        epochs=100,
        batch_size=32,
        callbacks=[checkpoint, early_stopping]
    )

    # Evaluate on the test set with detailed metrics.
    loss, acc = model_bilstm.evaluate(X_test, y_test)
    print(f"BiLSTM Test Loss: {loss:.4f}, Test Accuracy: {acc:.4f}")

    # Collect predictions and compute the metrics.
    y_pred_bilstm = model_bilstm.predict(X_test)
    y_pred_bilstm = np.argmax(y_pred_bilstm, axis=1)
    y_true_bilstm = np.argmax(y_test, axis=1)

    # Explicit class ids keep report rows / matrix axes aligned with
    # label_mapping even if some class is never predicted.
    class_ids = list(range(num_classes))
    class_names = [str(name) for name in label_mapping]

    test_accuracy_bilstm = accuracy_score(y_true_bilstm, y_pred_bilstm)
    precision_macro_bilstm = precision_score(y_true_bilstm, y_pred_bilstm, average='macro', zero_division=0)
    precision_weighted_bilstm = precision_score(y_true_bilstm, y_pred_bilstm, average='weighted', zero_division=0)
    recall_macro_bilstm = recall_score(y_true_bilstm, y_pred_bilstm, average='macro', zero_division=0)
    recall_weighted_bilstm = recall_score(y_true_bilstm, y_pred_bilstm, average='weighted', zero_division=0)
    f1_macro_bilstm = f1_score(y_true_bilstm, y_pred_bilstm, average='macro', zero_division=0)
    f1_weighted_bilstm = f1_score(y_true_bilstm, y_pred_bilstm, average='weighted', zero_division=0)
    report_bilstm = classification_report(
        y_true_bilstm, y_pred_bilstm,
        labels=class_ids, target_names=class_names,
        digits=4, zero_division=0
    )
    conf_matrix_bilstm = confusion_matrix(y_true_bilstm, y_pred_bilstm, labels=class_ids)

    # Print the metrics.
    print(f"\nBiLSTM Test Accuracy: {test_accuracy_bilstm:.4f}")
    print(f"Precision (Macro): {precision_macro_bilstm:.4f}")
    print(f"Precision (Weighted): {precision_weighted_bilstm:.4f}")
    print(f"Recall (Macro): {recall_macro_bilstm:.4f}")
    print(f"Recall (Weighted): {recall_weighted_bilstm:.4f}")
    print(f"F1-Score (Macro): {f1_macro_bilstm:.4f}")
    print(f"F1-Score (Weighted): {f1_weighted_bilstm:.4f}")

    print("\n========== BiLSTM Classification Report ==========")
    print(report_bilstm)

    print("\n========== BiLSTM Confusion Matrix ==========")
    print(conf_matrix_bilstm)

    # Save the report to a file.
    bilstm_report_dir = "bilstm_emotion_model"
    os.makedirs(bilstm_report_dir, exist_ok=True)
    with open(os.path.join(bilstm_report_dir, "classification_report.txt"), "w", encoding="utf-8") as f:
        f.write("========== BiLSTM Classification Report ==========\n")
        f.write(report_bilstm)
        f.write("\n========== Additional Metrics ==========\n")
        f.write(f"Test Loss: {loss:.4f}\n")
        f.write(f"Test Accuracy: {test_accuracy_bilstm:.4f}\n")
        f.write(f"Precision (Macro): {precision_macro_bilstm:.4f}\n")
        f.write(f"Precision (Weighted): {precision_weighted_bilstm:.4f}\n")
        f.write(f"Recall (Macro): {recall_macro_bilstm:.4f}\n")
        f.write(f"Recall (Weighted): {recall_weighted_bilstm:.4f}\n")
        f.write(f"F1-Score (Macro): {f1_macro_bilstm:.4f}\n")
        f.write(f"F1-Score (Weighted): {f1_weighted_bilstm:.4f}\n")
        f.write("\n========== Confusion Matrix ==========\n")
        f.write(np.array2string(conf_matrix_bilstm))

    print("\n========== BiLSTM Classification Report saved to 'bilstm_emotion_model/classification_report.txt' ==========")

    # Save the BiLSTM model.
    model_bilstm.save(os.path.join(bilstm_report_dir, 'bilstm_model.keras'))
    print(f"========== BiLSTM Model saved to '{bilstm_report_dir}/bilstm_model.keras' ==========")

    # ========== DEMO: PREDICT ONE NEW SENTENCE ==========

    custom_text = "Tôi rất vui khi sử dụng dịch vụ này!"

    # BiLSTM (Keras)
    emotion_bilstm = predict_emotion_bilstm(
        model_bilstm, custom_text, data_manager, label_mapping
    )
    print(f"Predicted Emotion (BiLSTM): {emotion_bilstm}")

    # Report the TF version and visible GPUs.
    print("TF version:", tf.__version__)
    print("GPU devices:", tf.config.list_physical_devices("GPU"))

    # ========== SAVE LABEL MAPPING AND VOCABULARY ==========
    with open(os.path.join(bilstm_report_dir, "label_mapping.json"), "w", encoding="utf-8") as f:
        json.dump(label_mapping, f, ensure_ascii=False, indent=4)

    with open(os.path.join(bilstm_report_dir, "vocabulary.json"), "w", encoding="utf-8") as f:
        json.dump(data_manager.vocabulary.word2id, f, ensure_ascii=False, indent=4)

    print("========== Label Mapping and Vocabulary saved ==========")