# File size: 19,520 Bytes
#
# e09333c

# thesis.py
# -*- coding: utf-8 -*-

import pandas as pd
import emoji
import json
import re
from underthesea import word_tokenize
from tqdm import tqdm
import torch
from torchtext.vocab import Vectors
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
import numpy as np
import tensorflow as tf

# ========== CÁC HÀM TIỀN XỬ LÝ ==========

def preprocess_sentence(sentence, abbreviations, emoji_mapping):
    """
    Run the whole cleaning pipeline on one sentence and return the result.

    Steps, in order: lowercase, emoji replacement, profanity removal,
    special-character stripping, whitespace normalization, abbreviation
    expansion, repeated-character collapsing, number masking, tokenization.
    """
    text = replace_emojis(sentence.lower(), emoji_mapping)
    for step in (remove_profanity, remove_special_characters, normalize_whitespace):
        text = step(text)
    text = replace_abbreviations(text, abbreviations)
    for step in (remove_repeated_characters, replace_numbers, tokenize_sentence):
        text = step(text)
    return text

def replace_emojis(sentence, emoji_mapping):
    """
    Substitute mapped characters with their tag and drop unmapped emojis.

    Each character found in ``emoji_mapping`` is replaced by its mapped
    string. A character that is not mapped is kept only when
    ``emoji.is_emoji`` does not report it as an emoji.
    """
    pieces = (
        emoji_mapping.get(ch, ch)
        for ch in sentence
        # Mapped characters short-circuit; only unmapped ones are checked.
        if ch in emoji_mapping or not emoji.is_emoji(ch)
    )
    return ''.join(pieces)

# Single-token profanities, compared case-insensitively against whole words.
_PROFANE_WORDS = frozenset(
    ["loz", "vloz", "vl", "dm", "đm", "clgt", "dmm", "cc", "vc", "vãi"]
)
# Multi-token profanities, stored as token tuples. A phrase containing a space
# can never equal a single whitespace-split word, so it is matched against
# consecutive words instead.
_PROFANE_PHRASES = (("đù", "mé"),)


def remove_profanity(sentence):
    """
    Remove profane words and phrases from a sentence.

    The sentence is split on whitespace and matching is case-insensitive.
    Single words are dropped when they appear in the word list; multi-word
    phrases are dropped when their tokens appear consecutively.

    Args:
        sentence: Text to filter.

    Returns:
        The remaining words joined by single spaces.
    """
    words = sentence.split()
    lowered = [word.lower() for word in words]
    kept = []
    index = 0
    while index < len(words):
        skip = 0
        for phrase in _PROFANE_PHRASES:
            if tuple(lowered[index:index + len(phrase)]) == phrase:
                skip = len(phrase)
                break
        if not skip and lowered[index] in _PROFANE_WORDS:
            skip = 1
        if skip:
            index += skip
        else:
            kept.append(words[index])
            index += 1
    return ' '.join(kept)

_SPECIAL_CHARS = re.compile(r"[\^\*@#&$%<>~{}|\\]")


def remove_special_characters(sentence):
    """Delete the symbols ^ * @ # & $ % < > ~ { } | and backslash from the sentence."""
    return _SPECIAL_CHARS.sub("", sentence)

def normalize_whitespace(sentence):
    """Trim the sentence and collapse every whitespace run into one space."""
    tokens = sentence.split()
    return ' '.join(tokens)

def replace_abbreviations(sentence, abbreviations):
    """
    Expand known abbreviations, word by word.

    Args:
        sentence: Text whose whitespace-separated words are looked up.
        abbreviations: Mapping from an abbreviation to its expansion. The
            expansion may be a plain string, used as is, or a sequence of
            strings, joined with single spaces.

    Returns:
        The sentence with every matching word replaced, words re-joined by
        single spaces.
    """
    expanded = []
    for word in sentence.split():
        if word not in abbreviations:
            expanded.append(word)
            continue
        expansion = abbreviations[word]
        # Joining a bare string would insert a space between its characters.
        if isinstance(expansion, str):
            expanded.append(expansion)
        else:
            expanded.append(" ".join(expansion))
    return ' '.join(expanded)

def remove_repeated_characters(sentence):
    """Collapse any run of three or more identical characters into a single one."""
    long_run = re.compile(r"(.)\1{2,}")
    return long_run.sub(r"\1", sentence)

def replace_numbers(sentence):
    """Replace every run of digits with the literal token ``[number]``."""
    digit_run = re.compile(r"\d+")
    return digit_run.sub("[number]", sentence)

def tokenize_sentence(sentence):
    """Tokenize with underthesea's ``word_tokenize`` and join the tokens with spaces."""
    tokens = word_tokenize(sentence)
    return ' '.join(tokens)


# ========== LỚP DATA MANAGER ==========

class DataManager:
    """
    Load the dataset and its resources and turn sentences into model inputs.

    On construction the abbreviation table (JSON) and the pretrained word
    vectors are loaded, and a Vocabulary is built from the words of the
    vector file.
    """

    def __init__(self, file_path, abbreviations_path, word2vec_path):
        self.file_path = file_path
        self.abbreviations_path = abbreviations_path
        self.word2vec_path = word2vec_path
        self.load_abbreviations()
        self.load_word2vec()

    def load_abbreviations(self):
        """Read the JSON abbreviation table into ``self.abbreviations``."""
        with open(self.abbreviations_path, "r", encoding="utf-8") as file:
            self.abbreviations = json.load(file)

    def load_word2vec(self):
        """Load the pretrained vectors and derive ``self.vocabulary`` from them."""
        # unk_init is supplied so that vectors for out-of-vocabulary words are
        # drawn from a normal distribution.
        self.word_embeddings = Vectors(name=self.word2vec_path, unk_init=torch.Tensor.normal_)
        self.vocabulary = self.create_vocab_from_word2vec()

    def create_vocab_from_word2vec(self):
        """Return a Vocabulary holding every word known to the loaded vectors."""
        vocab = Vocabulary()
        for word in self.word_embeddings.stoi:
            vocab.add(word)
        return vocab

    def preprocess_data(self):
        """
        Read the Excel dataset and add a cleaned ``processed_sentence`` column.

        Returns:
            The DataFrame without the rows whose cleaned sentence is empty.

        Raises:
            ValueError: If the file has no "Sentence" column.
        """
        df = pd.read_excel(self.file_path)
        if "Sentence" not in df.columns:
            raise ValueError("Cột 'Sentence' không tồn tại trong dataset!")

        # Clean every sentence.
        # NOTE(review): str(x) turns a missing cell (NaN) into the text "nan";
        # confirm whether such rows should be dropped instead.
        df["processed_sentence"] = df["Sentence"].apply(
            lambda x: preprocess_sentence(str(x), self.abbreviations, emoji_mapping)
        )

        # Drop rows that are empty after cleaning.
        df = df[df["processed_sentence"].str.strip().astype(bool)]
        return df

    @staticmethod
    def _encode_labels(df, label_column):
        """Return (label -> index mapping, encoded label list) without modifying ``df``."""
        label_mapping = {label: idx for idx, label in enumerate(df[label_column].unique())}
        encoded = df[label_column].map(label_mapping).tolist()
        return label_mapping, encoded

    def split_and_convert(
        self, df, label_column="Emotion", maxlen=400, test_size=0.2,
        for_keras=False, batch_size=32
    ):
        """
        Split the data into train/test sets and convert it to model inputs.

        The input DataFrame is left untouched, so the method can be called
        repeatedly on the same frame and always yields a mapping keyed by the
        original label values.

        Args:
            df: Frame with a ``processed_sentence`` column and the label column.
            label_column: Name of the column holding the class labels.
            maxlen: Length every index sequence is padded/truncated to.
            test_size: Fraction of the rows placed in the test split.
            for_keras: Selects the return format (see Returns).
            batch_size: Batch size of the PyTorch DataLoaders.

        Returns:
            - for_keras=False: train_loader, test_loader, label_mapping (PyTorch)
            - for_keras=True: X_train, X_test, y_train_onehot, y_test_onehot,
              label_mapping (Keras)

        Raises:
            ValueError: If ``label_column`` is not a column of ``df``.
        """

        if label_column not in df.columns:
            raise ValueError(
                f"Cột '{label_column}' không tồn tại trong DataFrame. "
                f"Các cột hiện có: {df.columns.tolist()}"
            )

        # Build the label -> index mapping; the caller's column is not overwritten.
        label_mapping, y = self._encode_labels(df, label_column)

        X = df["processed_sentence"].tolist()
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

        # Convert text to lists of vocabulary indices.
        X_train_tensors = self.vocabulary.corpus_to_tensor(X_train, is_tokenized=False)
        X_test_tensors  = self.vocabulary.corpus_to_tensor(X_test,  is_tokenized=False)

        # Pad sequences to a common length.
        X_train_padded = pad_sequences(X_train_tensors, maxlen=maxlen)
        X_test_padded  = pad_sequences(X_test_tensors,  maxlen=maxlen)

        # Debug information
        print(">>> Debug Split and Convert:")
        print("X_train_padded.shape:", X_train_padded.shape)
        print("X_test_padded.shape: ", X_test_padded.shape)
        print("y_train length:", len(y_train))
        print("y_test length: ", len(y_test))

        # Token index range, to compare against the vocabulary size.
        max_token_train = np.max(X_train_padded) if X_train_padded.size > 0 else None
        min_token_train = np.min(X_train_padded) if X_train_padded.size > 0 else None
        max_token_test  = np.max(X_test_padded)  if X_test_padded.size  > 0 else None
        min_token_test  = np.min(X_test_padded)  if X_test_padded.size  > 0 else None

        vocab_size = len(self.vocabulary)
        print(f"vocab_size: {vocab_size}")
        print(f"max_token_train: {max_token_train}, min_token_train: {min_token_train}")
        print(f"max_token_test:  {max_token_test},  min_token_test:  {min_token_test}")

        if for_keras:
            num_classes = len(label_mapping)
            # One-hot encode the labels.
            y_train_onehot = torch.nn.functional.one_hot(torch.tensor(y_train), num_classes=num_classes).numpy()
            y_test_onehot  = torch.nn.functional.one_hot(torch.tensor(y_test),  num_classes=num_classes).numpy()

            # Debug
            print("y_train_onehot.shape:", y_train_onehot.shape)
            print("y_test_onehot.shape: ", y_test_onehot.shape)

            return X_train_padded, X_test_padded, y_train_onehot, y_test_onehot, label_mapping
        else:
            # Return DataLoaders for PyTorch.
            X_train_tensor = torch.tensor(X_train_padded, dtype=torch.long)
            X_test_tensor  = torch.tensor(X_test_padded,  dtype=torch.long)
            y_train_tensor = torch.tensor(y_train, dtype=torch.long)
            y_test_tensor  = torch.tensor(y_test,  dtype=torch.long)

            train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
            test_dataset  = TensorDataset(X_test_tensor,  y_test_tensor)

            train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
            test_loader  = DataLoader(test_dataset,  batch_size=batch_size, shuffle=False)
            return train_loader, test_loader, label_mapping


# ========== LỚP TỪ ĐIỂN (VOCABULARY) ==========

class Vocabulary:
    """
    Two-way table between words and integer ids.

    Id 0 is reserved for '<pad>' and id 1 for '<unk>'; looking up a word that
    was never added yields the '<unk>' id.
    """

    def __init__(self):
        self.word2id = {'<pad>': 0, '<unk>': 1}
        self.unk_id = self.word2id['<unk>']
        self.id2word = {index: token for token, index in self.word2id.items()}

    def __getitem__(self, word):
        """Return the id of ``word``, or the '<unk>' id when it is unknown."""
        if word in self.word2id:
            return self.word2id[word]
        return self.unk_id

    def __contains__(self, word):
        return word in self.word2id

    def __len__(self):
        return len(self.word2id)

    def lookup_tokens(self, word_indexes: list):
        """Translate a list of ids back to words; an unknown id raises KeyError."""
        return list(map(self.id2word.__getitem__, word_indexes))

    def add(self, word):
        """Register ``word`` if it is new and return its id either way."""
        if word in self:
            return self[word]
        new_index = len(self.word2id)
        self.word2id[word] = new_index
        self.id2word[new_index] = word
        return new_index

    @staticmethod
    def tokenize_corpus(corpus):
        """
        Tokenize every document with ``word_tokenize``, showing a tqdm progress bar.

        Spaces inside a token are replaced by underscores.
        """
        return [
            [token.replace(" ", "_") for token in word_tokenize(document)]
            for document in tqdm(corpus)
        ]

    def corpus_to_tensor(self, corpus, is_tokenized=False):
        """
        Convert documents to lists of word ids.

        ``corpus`` is tokenized first unless ``is_tokenized`` is true, in which
        case it must already be a list of token lists.
        """
        documents = corpus if is_tokenized else self.tokenize_corpus(corpus)
        return [[self[token] for token in document] for document in documents]


# ========== MAPPING EMOJI => NHÃN ==========

# Lookup table from a single emoji character to a bracketed emotion tag.
# It is passed to preprocess_sentence(), whose replace_emojis() step swaps
# each key found in a sentence for its tag; emojis without an entry here are
# dropped from the text by that step.
emoji_mapping = {
    "😀": "[joy]", "😃": "[joy]", "😄": "[joy]", "😁": "[joy]", "😆": "[joy]", "😅": "[joy]", "😂": "[joy]", "🤣": "[joy]",
    "🙂": "[love]", "🙃": "[love]", "😉": "[love]", "😊": "[love]", "😇": "[love]", "🥰": "[love]", "😍": "[love]",
    "🤩": "[love]", "😘": "[love]", "😗": "[love]", "☺": "[love]", "😚": "[love]", "😙": "[love]",
    "😋": "[satisfaction]", "😛": "[satisfaction]", "😜": "[satisfaction]", "🤪": "[satisfaction]", "😝": "[satisfaction]",
    "🤑": "[satisfaction]",
    "🤐": "[neutral]", "🤨": "[neutral]", "😐": "[neutral]", "😑": "[neutral]", "😶": "[neutral]",
    "😏": "[sarcasm]",
    "😒": "[disappointment]", "🙄": "[disappointment]", "😬": "[disappointment]",
    "😔": "[sadness]", "😪": "[sadness]", "😢": "[sadness]", "😭": "[sadness]", "😥": "[sadness]", "😓": "[sadness]",
    "😩": "[tiredness]", "😫": "[tiredness]", "🥱": "[tiredness]",
    "🤤": "[discomfort]", "🤢": "[discomfort]", "🤮": "[discomfort]", "🤧": "[discomfort]", "🥵": "[discomfort]",
    "🥶": "[discomfort]", "🥴": "[discomfort]", "😵": "[discomfort]", "🤯": "[discomfort]",
    "😕": "[confused]", "😟": "[confused]", "🙁": "[confused]", "☹": "[confused]",
    "😮": "[surprise]", "😯": "[surprise]", "😲": "[surprise]", "😳": "[surprise]", "🥺": "[pleading]",
    "😦": "[fear]", "😧": "[fear]", "😨": "[fear]", "😰": "[fear]", "😱": "[fear]",
    "😖": "[confusion]", "😣": "[confusion]", "😞": "[confusion]",
    "😤": "[anger]", "😡": "[anger]", "😠": "[anger]", "🤬": "[anger]", "😈": "[mischievous]", "👿": "[mischievous]"
}


# ========== ĐỊNH NGHĨA MÔ HÌNH RNN PYTORCH ==========

class SimpleRNN(nn.Module):
    """
    Text classifier: embedding -> single-layer LSTM -> linear layer.

    Despite the name, the recurrent layer is an LSTM. The final hidden state
    of the LSTM is fed to the linear layer, which outputs one raw score
    (logit) per class.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.rnn = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Return class logits for a batch of token-id sequences (batch dimension first)."""
        token_vectors = self.embedding(x)
        _outputs, (last_hidden, _last_cell) = self.rnn(token_vectors)
        # last_hidden carries a leading layer dimension of size 1; drop it.
        logits = self.fc(last_hidden.squeeze(0))
        return logits


# ========== HÀM DỰ ĐOÁN VỚI MÔ HÌNH PYTORCH RNN ==========

def predict_emotion_rnn(model, text, data_manager, label_mapping, device, maxlen=400):
    """
    Predict the emotion label of one raw sentence with the PyTorch model.

    Args:
        model: Trained PyTorch classifier returning one score per class.
        text: Raw input sentence.
        data_manager: Provides ``abbreviations`` and ``vocabulary``.
        label_mapping: Label -> class index mapping used during training.
        device: Torch device the input tensor is moved to.
        maxlen: Padding length; must equal the value used for training.

    Returns:
        The label whose class index has the highest score.
    """
    model.eval()
    with torch.no_grad():
        processed_text = preprocess_sentence(text, data_manager.abbreviations, emoji_mapping)
        # corpus_to_tensor tokenizes the sentence itself when is_tokenized=False.
        token_ids = data_manager.vocabulary.corpus_to_tensor([processed_text], is_tokenized=False)
        text_tensor = torch.tensor(
            pad_sequences(token_ids, maxlen=maxlen),
            dtype=torch.long
        ).to(device)

        output = model(text_tensor)
        _, predicted = torch.max(output, 1)
        reverse_label_mapping = {v: k for k, v in label_mapping.items()}
        return reverse_label_mapping[predicted.item()]


# ========== HÀM DỰ ĐOÁN VỚI MÔ HÌNH KERAS CNN-LSTM ==========

def predict_emotion_cnn_lstm(model, text, data_manager, label_mapping, maxlen=400):
    """
    Predict the emotion label of one raw sentence with the Keras model.

    Args:
        model: Trained Keras model whose ``predict`` returns per-class scores.
        text: Raw input sentence.
        data_manager: Provides ``abbreviations`` and ``vocabulary``.
        label_mapping: Label -> class index mapping used during training.
        maxlen: Padding length; must equal the model's input length.

    Returns:
        The label whose class index has the highest score.
    """
    processed_text = preprocess_sentence(text, data_manager.abbreviations, emoji_mapping)
    # corpus_to_tensor tokenizes the sentence itself when is_tokenized=False.
    token_ids = data_manager.vocabulary.corpus_to_tensor([processed_text], is_tokenized=False)
    text_tensor = pad_sequences(token_ids, maxlen=maxlen)
    output = model.predict(text_tensor)
    # Cast to a plain int so the lookup does not depend on the array's index type.
    predicted = int(output.argmax(axis=1)[0])
    reverse_label_mapping = {v: k for k, v in label_mapping.items()}
    return reverse_label_mapping[predicted]


# ========== PHẦN MAIN (CHẠY THỬ) ==========

if __name__ == "__main__":
    # --------------------------
    # Adjust the paths here:
    # --------------------------
    file_path = "train.xlsx"               # source Excel file (with "Sentence", "Emotion", ... columns)
    abbreviations_path = "abbreviations.json"
    word2vec_path = "/home/datpham/datpham/thesis-ngtram/word2vec_vi_syllables_100dims.txt"
    output_path = "processed.xlsx"
    # Padding length; the predict_* helpers pad to 400 by default, keep in sync.
    maxlen = 400

    data_manager = DataManager(
        file_path=file_path,
        abbreviations_path=abbreviations_path,
        word2vec_path=word2vec_path
    )

    # 1) Load and preprocess
    df = data_manager.preprocess_data()
    print("Trước khi undersampling:")
    print(df["Emotion"].value_counts())

    # 2) UNDERSAMPLING (example)
    # Adjust the emotion names to match your dataset. Rows whose label is not
    # listed here are left out of the balanced set.
    majority_label = "Enjoyment"
    majority_cap = 2000
    other_labels = ["Other", "Anger", "Sadness", "Disgust", "Fear", "Surprise"]

    # Cap the majority class at `majority_cap` samples.
    df_majority = df[df["Emotion"] == majority_label]
    if len(df_majority) > majority_cap:
        df_majority = df_majority.sample(n=majority_cap, random_state=42)

    df_balanced = pd.concat(
        [df_majority] + [df[df["Emotion"] == label] for label in other_labels],
        axis=0
    )

    df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
    df = df_balanced

    print("\nSau khi undersampling:")
    print(df["Emotion"].value_counts())

    df.to_excel(output_path, index=False)

    # 3) Build the PyTorch data loaders.
    # split_and_convert may label-encode the frame it receives in place, so it
    # gets a copy; otherwise the second call below would see integer labels
    # and return an int -> int mapping.
    train_loader, test_loader, label_mapping = data_manager.split_and_convert(
        df.copy(), label_column="Emotion", maxlen=maxlen, for_keras=False
    )

    vocab_size = len(data_manager.vocabulary)
    embedding_dim = 100
    hidden_dim = 128
    output_dim = len(label_mapping)

    model_rnn = SimpleRNN(vocab_size, embedding_dim, hidden_dim, output_dim)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model_rnn.parameters())

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_rnn.to(device)

    num_epochs = 20
    for epoch in range(num_epochs):
        model_rnn.train()
        epoch_loss = 0
        correct = 0
        total = 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()
            predictions = model_rnn(X_batch)
            loss = criterion(predictions, y_batch)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            _, predicted = torch.max(predictions, 1)
            correct += (predicted == y_batch).sum().item()
            total += y_batch.size(0)

        print(f"Epoch {epoch+1}/{num_epochs}, "
              f"Loss: {epoch_loss/len(train_loader):.4f}, "
              f"Accuracy: {correct/total:.4f}")

    # Evaluate the RNN on the test set.
    model_rnn.eval()
    test_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            predictions = model_rnn(X_batch)
            loss = criterion(predictions, y_batch)
            test_loss += loss.item()

            _, predicted = torch.max(predictions, 1)
            correct += (predicted == y_batch).sum().item()
            total += y_batch.size(0)

    print(f"Test Loss: {test_loss/len(test_loader):.4f}, "
          f"Test Accuracy: {correct/total:.4f}")


    # ========== CNN-LSTM (Keras) ==========

    from keras.models import Model
    from keras.layers import (
        Input, Embedding, Convolution1D, GlobalMaxPooling1D, LSTM, Dense, Dropout, concatenate
    )
    from keras.optimizers import Adam
    from keras.callbacks import ModelCheckpoint

    print("Training CNN-LSTM...")

    X_train, X_test, y_train, y_test, label_mapping = data_manager.split_and_convert(
        df.copy(), label_column="Emotion", maxlen=maxlen, for_keras=True
    )

    input_layer = Input(shape=(maxlen,), dtype='int32', name='main_input')
    emb_layer = Embedding(len(data_manager.vocabulary), embedding_dim)(input_layer)

    # GlobalMaxPooling1D takes the maximum over the time axis, like a Lambda
    # around tf.reduce_max(axis=1) would, but it is a built-in layer: the saved
    # model can be reloaded with load_model() without custom objects.
    con3_layer = Convolution1D(150, kernel_size=3, activation='relu')(emb_layer)
    pool_con3_layer = GlobalMaxPooling1D()(con3_layer)

    con5_layer = Convolution1D(150, kernel_size=5, activation='relu')(emb_layer)
    pool_con5_layer = GlobalMaxPooling1D()(con5_layer)

    lstm_layer = LSTM(128)(emb_layer)

    cnn_lstm_layer = concatenate([pool_con3_layer, pool_con5_layer, lstm_layer])

    dense_layer   = Dense(100, activation='relu')(cnn_lstm_layer)
    dropout_layer = Dropout(0.2)(dense_layer)
    output_layer  = Dense(len(label_mapping), activation='softmax')(dropout_layer)

    model_cnn_lstm = Model(inputs=input_layer, outputs=output_layer)
    model_cnn_lstm.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])

    # NOTE(review): the test split doubles as validation data, so checkpoint
    # selection sees the test set — consider a separate validation split.
    checkpoint = ModelCheckpoint('cnn_lstm_best.keras', save_best_only=True, monitor='val_accuracy', mode='max')
    model_cnn_lstm.fit(
        X_train, y_train,
        validation_data=(X_test, y_test),
        batch_size=32,
        epochs=20,
        callbacks=[checkpoint]
    )

    model_cnn_lstm.save('cnn_lstm_model.keras')

    loss, accuracy = model_cnn_lstm.evaluate(X_test, y_test)
    print(f"CNN-LSTM Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")

    # Demo: predict one new sentence.
    custom_text = "Tôi rất vui khi sử dụng dịch vụ này!"

    # RNN (PyTorch)
    emotion_rnn = predict_emotion_rnn(model_rnn, custom_text, data_manager, label_mapping, device)
    print(f"Predicted Emotion (RNN): {emotion_rnn}")

    # CNN-LSTM (Keras)
    cnn_lstm_model = tf.keras.models.load_model('cnn_lstm_model.keras')
    emotion_cnn_lstm = predict_emotion_cnn_lstm(cnn_lstm_model, custom_text, data_manager, label_mapping)
    print(f"Predicted Emotion (CNN-LSTM): {emotion_cnn_lstm}")

    # Report the TF version and visible GPUs.
    print("TF version:", tf.__version__)
    print("GPU devices:", tf.config.list_physical_devices("GPU"))
    # CUDA/GPU status can also be checked with the following system command:
    # import os
    # os.system("nvidia-smi")