# thesis.py
# -*- coding: utf-8 -*-
"""Emotion classification for Vietnamese text.

The module provides:

* text preprocessing helpers (emoji mapping, profanity removal, abbreviation
  expansion, tokenization);
* ``DataManager`` / ``Vocabulary`` for loading the dataset and turning
  sentences into padded index sequences;
* a PyTorch LSTM classifier and helpers to predict with either the PyTorch
  model or a Keras CNN-LSTM model;
* a script section that trains and evaluates both models.
"""

import json
import re

import emoji
import numpy as np
import pandas as pd
import tensorflow as tf
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from torch.utils.data import DataLoader, TensorDataset
from torchtext.vocab import Vectors
from tqdm import tqdm
from underthesea import word_tokenize


# ========== PREPROCESSING FUNCTIONS ==========

# Single-token profanity, matched against whitespace-separated words.
_PROFANE_WORDS = frozenset(
    ["loz", "vloz", "vl", "dm", "đm", "clgt", "dmm", "cc", "vc", "vãi"]
)
# Multi-word profanity. These can never match a single whitespace-separated
# token, so they are removed with a regex before the sentence is split.
_PROFANE_PHRASES = ("đù mé",)
_PROFANE_PHRASE_RE = re.compile(
    r"(?<!\S)(?:"
    + "|".join(
        r"\s+".join(re.escape(part) for part in phrase.split())
        for phrase in _PROFANE_PHRASES
    )
    + r")(?!\S)",
    re.IGNORECASE,
)


def preprocess_sentence(sentence, abbreviations, emoji_mapping):
    """Preprocess one sentence.

    Steps, in order: lowercase, replace emojis, remove profanity, remove
    special characters, normalize whitespace, expand abbreviations, collapse
    repeated characters, replace numbers, tokenize.

    Args:
        sentence: Raw input text.
        abbreviations: Mapping from abbreviation to its expansion.
        emoji_mapping: Mapping from a single emoji character to a tag string.

    Returns:
        The processed sentence as a single space-joined string.
    """
    sentence = sentence.lower()
    sentence = replace_emojis(sentence, emoji_mapping)
    sentence = remove_profanity(sentence)
    sentence = remove_special_characters(sentence)
    sentence = normalize_whitespace(sentence)
    sentence = replace_abbreviations(sentence, abbreviations)
    sentence = remove_repeated_characters(sentence)
    sentence = replace_numbers(sentence)
    sentence = tokenize_sentence(sentence)
    return sentence


def replace_emojis(sentence, emoji_mapping):
    """Replace mapped emojis with their tag and drop unmapped emojis.

    The sentence is scanned one character at a time; characters that are
    neither mapped nor recognised by ``emoji.is_emoji`` are kept unchanged.
    """
    processed_sentence = []
    for char in sentence:
        if char in emoji_mapping:
            processed_sentence.append(emoji_mapping[char])
        elif not emoji.is_emoji(char):
            processed_sentence.append(char)
    return ''.join(processed_sentence)


def remove_profanity(sentence):
    """Remove profane words and phrases from ``sentence``.

    Matching is case-insensitive and only applies to whole
    whitespace-delimited words or phrases. The result is re-joined with
    single spaces.
    """
    # Phrases first: once the sentence is split they can no longer be seen.
    sentence = _PROFANE_PHRASE_RE.sub(" ", sentence)
    kept_words = [
        word for word in sentence.split() if word.lower() not in _PROFANE_WORDS
    ]
    return ' '.join(kept_words)


def remove_special_characters(sentence):
    """Delete the characters ``^ * @ # & $ % < > ~ { } | \\`` from ``sentence``."""
    return re.sub(r"[\^\*@#&$%<>~{}|\\]", "", sentence)


def normalize_whitespace(sentence):
    """Collapse whitespace runs to single spaces and strip both ends."""
    return ' '.join(sentence.split())


def replace_abbreviations(sentence, abbreviations):
    """Expand every whitespace-separated word found in ``abbreviations``.

    An expansion may be a plain string or a sequence of strings; a sequence
    is joined with single spaces. Joining a plain string would instead put a
    space between each of its characters, so strings are used as-is.
    """
    def expand(word):
        if word not in abbreviations:
            return word
        expansion = abbreviations[word]
        if isinstance(expansion, str):
            return expansion
        return " ".join(expansion)

    return ' '.join(expand(word) for word in sentence.split())


def remove_repeated_characters(sentence):
    """Collapse any character repeated three or more times to one occurrence."""
    return re.sub(r"(.)\1{2,}", r"\1", sentence)


def replace_numbers(sentence):
    """Replace each run of digits with the literal token ``[number]``."""
    return re.sub(r"\d+", "[number]", sentence)


def tokenize_sentence(sentence):
    """Tokenize with ``underthesea.word_tokenize`` and join tokens with spaces."""
    return ' '.join(word_tokenize(sentence))


# ========== DATA MANAGER ==========
class DataManager:
    """Load resources and turn the dataset into model-ready inputs.

    The constructor eagerly loads the abbreviation dictionary and the
    word2vec vectors, and builds a ``Vocabulary`` from the word2vec words.
    """

    def __init__(self, file_path, abbreviations_path, word2vec_path):
        """Store the paths and load abbreviations and word vectors.

        Args:
            file_path: Path of the Excel dataset read by ``preprocess_data``.
            abbreviations_path: Path of the JSON abbreviation dictionary.
            word2vec_path: Path of the word2vec vectors file.
        """
        self.file_path = file_path
        self.abbreviations_path = abbreviations_path
        self.word2vec_path = word2vec_path

        self.load_abbreviations()
        self.load_word2vec()

    def load_abbreviations(self):
        """Read the abbreviation dictionary from JSON into ``self.abbreviations``."""
        with open(self.abbreviations_path, "r", encoding="utf-8") as file:
            self.abbreviations = json.load(file)

    def load_word2vec(self):
        """Load the word vectors and build the vocabulary from them."""
        # unk_init: out-of-vocabulary words get a random-normal vector.
        self.word_embeddings = Vectors(
            name=self.word2vec_path, unk_init=torch.Tensor.normal_
        )
        self.vocabulary = self.create_vocab_from_word2vec()

    def create_vocab_from_word2vec(self):
        """Return a ``Vocabulary`` containing every word of the loaded vectors."""
        vocab = Vocabulary()
        for word in self.word_embeddings.stoi:
            vocab.add(word)
        return vocab

    def preprocess_data(self):
        """Read the Excel dataset and add a ``processed_sentence`` column.

        Returns:
            The DataFrame restricted to rows whose processed sentence is
            non-empty.

        Raises:
            ValueError: If the dataset has no ``Sentence`` column.
        """
        df = pd.read_excel(self.file_path)

        if "Sentence" not in df.columns:
            raise ValueError("Cột 'Sentence' không tồn tại trong dataset!")

        # Preprocess every sentence.
        df["processed_sentence"] = df["Sentence"].apply(
            lambda x: preprocess_sentence(str(x), self.abbreviations, emoji_mapping)
        )

        # Drop rows that became empty after preprocessing.
        df = df[df["processed_sentence"].str.strip().astype(bool)]
        return df

    def split_and_convert(
        self, df, label_column="Emotion", maxlen=400, test_size=0.2,
        for_keras=False, batch_size=32
    ):
        """Split the data into train/test sets and convert it to model inputs.

        ``df`` is not modified: labels are encoded into a local Series, so
        calling this method repeatedly on the same DataFrame always yields a
        mapping from the original label values to integer ids.

        Args:
            df: DataFrame with a ``processed_sentence`` column and the label
                column.
            label_column: Name of the label column.
            maxlen: Length every index sequence is padded/truncated to.
            test_size: Fraction of the rows used for the test split.
            for_keras: Selects the return format (see Returns).
            batch_size: Batch size of the PyTorch ``DataLoader`` objects.

        Returns:
            If ``for_keras`` is False:
                ``(train_loader, test_loader, label_mapping)``.
            If ``for_keras`` is True:
                ``(X_train, X_test, y_train_onehot, y_test_onehot,
                label_mapping)``.

        Raises:
            ValueError: If ``label_column`` is not a column of ``df``.
        """
        if label_column not in df.columns:
            raise ValueError(
                f"Cột '{label_column}' không tồn tại trong DataFrame. "
                f"Các cột hiện có: {df.columns.tolist()}"
            )

        # Build the label -> integer id mapping in order of first appearance.
        labels = df[label_column]
        label_mapping = {label: idx for idx, label in enumerate(labels.unique())}

        X = df["processed_sentence"].tolist()
        y = labels.map(label_mapping).tolist()

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42
        )

        # Convert text to lists of vocabulary indices.
        X_train_tensors = self.vocabulary.corpus_to_tensor(X_train, is_tokenized=False)
        X_test_tensors = self.vocabulary.corpus_to_tensor(X_test, is_tokenized=False)

        # Pad sequences to a fixed length.
        X_train_padded = pad_sequences(X_train_tensors, maxlen=maxlen)
        X_test_padded = pad_sequences(X_test_tensors, maxlen=maxlen)

        # Debug information.
        print(">>> Debug Split and Convert:")
        print("X_train_padded.shape:", X_train_padded.shape)
        print("X_test_padded.shape: ", X_test_padded.shape)
        print("y_train length:", len(y_train))
        print("y_test length: ", len(y_test))

        # Report the min/max token id of each split (None for an empty split).
        max_token_train = np.max(X_train_padded) if X_train_padded.size > 0 else None
        min_token_train = np.min(X_train_padded) if X_train_padded.size > 0 else None
        max_token_test = np.max(X_test_padded) if X_test_padded.size > 0 else None
        min_token_test = np.min(X_test_padded) if X_test_padded.size > 0 else None

        vocab_size = len(self.vocabulary)
        print(f"vocab_size: {vocab_size}")
        print(f"max_token_train: {max_token_train}, min_token_train: {min_token_train}")
        print(f"max_token_test: {max_token_test}, min_token_test: {min_token_test}")

        if for_keras:
            num_classes = len(label_mapping)
            # One-hot encode the labels.
            y_train_onehot = torch.nn.functional.one_hot(
                torch.tensor(y_train), num_classes=num_classes
            ).numpy()
            y_test_onehot = torch.nn.functional.one_hot(
                torch.tensor(y_test), num_classes=num_classes
            ).numpy()

            # Debug information.
            print("y_train_onehot.shape:", y_train_onehot.shape)
            print("y_test_onehot.shape: ", y_test_onehot.shape)

            return X_train_padded, X_test_padded, y_train_onehot, y_test_onehot, label_mapping

        # Otherwise return DataLoaders for PyTorch.
        X_train_tensor = torch.tensor(X_train_padded, dtype=torch.long)
        X_test_tensor = torch.tensor(X_test_padded, dtype=torch.long)
        y_train_tensor = torch.tensor(y_train, dtype=torch.long)
        y_test_tensor = torch.tensor(y_test, dtype=torch.long)

        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
        test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

        return train_loader, test_loader, label_mapping


# ========== VOCABULARY ==========
class Vocabulary:
    """Bidirectional word <-> integer id mapping.

    Id 0 is the padding token and id 1 the unknown-word token; they are
    distinct entries, so the first word added receives id 2.
    """

    PAD_TOKEN = '<pad>'
    UNK_TOKEN = '<unk>'

    def __init__(self):
        """Create a vocabulary holding only the two special tokens."""
        self.word2id = {self.PAD_TOKEN: 0, self.UNK_TOKEN: 1}
        self.unk_id = self.word2id[self.UNK_TOKEN]
        self.id2word = {idx: token for token, idx in self.word2id.items()}

    def __getitem__(self, word):
        """Return the id of ``word``, or the unknown id if it is absent."""
        return self.word2id.get(word, self.unk_id)

    def __contains__(self, word):
        return word in self.word2id

    def __len__(self):
        return len(self.word2id)

    def lookup_tokens(self, word_indexes: list):
        """Return the words for the given ids (``KeyError`` on an unknown id)."""
        return [self.id2word[word_index] for word_index in word_indexes]

    def add(self, word):
        """Add ``word`` if it is new and return its id."""
        if word in self:
            return self[word]
        word_index = len(self.word2id)
        self.word2id[word] = word_index
        self.id2word[word_index] = word
        return word_index

    @staticmethod
    def tokenize_corpus(corpus):
        """Tokenize each document, joining multi-syllable tokens with ``_``.

        Shows a ``tqdm`` progress bar over the documents.
        """
        tokenized_corpus = []
        for document in tqdm(corpus):
            tokenized_document = [
                word.replace(" ", "_") for word in word_tokenize(document)
            ]
            tokenized_corpus.append(tokenized_document)
        return tokenized_corpus

    def corpus_to_tensor(self, corpus, is_tokenized=False):
        """Convert documents to lists of word ids.

        Args:
            corpus: Raw strings, or lists of tokens when ``is_tokenized``.
            is_tokenized: Skip tokenization when True.

        Returns:
            A list with one list of ids per document (plain Python lists,
            not tensors).
        """
        tokenized_corpus = corpus if is_tokenized else self.tokenize_corpus(corpus)
        return [
            [self[word] for word in document]
            for document in tokenized_corpus
        ]


# ========== EMOJI => TAG MAPPING ==========
emoji_mapping = {
    "😀": "[joy]", "😃": "[joy]", "😄": "[joy]", "😁": "[joy]",
    "😆": "[joy]", "😅": "[joy]", "😂": "[joy]", "🤣": "[joy]",
    "🙂": "[love]", "🙃": "[love]", "😉": "[love]", "😊": "[love]",
    "😇": "[love]", "🥰": "[love]", "😍": "[love]", "🤩": "[love]",
    "😘": "[love]", "😗": "[love]", "☺": "[love]", "😚": "[love]",
    "😙": "[love]",
    "😋": "[satisfaction]", "😛": "[satisfaction]", "😜": "[satisfaction]",
    "🤪": "[satisfaction]", "😝": "[satisfaction]", "🤑": "[satisfaction]",
    "🤐": "[neutral]", "🤨": "[neutral]", "😐": "[neutral]", "😑": "[neutral]",
    "😶": "[neutral]",
    "😏": "[sarcasm]",
    "😒": "[disappointment]", "🙄": "[disappointment]", "😬": "[disappointment]",
    "😔": "[sadness]", "😪": "[sadness]", "😢": "[sadness]", "😭": "[sadness]",
    "😥": "[sadness]", "😓": "[sadness]",
    "😩": "[tiredness]", "😫": "[tiredness]", "🥱": "[tiredness]",
    "🤤": "[discomfort]", "🤢": "[discomfort]", "🤮": "[discomfort]",
    "🤧": "[discomfort]", "🥵": "[discomfort]", "🥶": "[discomfort]",
    "🥴": "[discomfort]", "😵": "[discomfort]", "🤯": "[discomfort]",
    "😕": "[confused]", "😟": "[confused]", "🙁": "[confused]", "☹": "[confused]",
    "😮": "[surprise]", "😯": "[surprise]", "😲": "[surprise]", "😳": "[surprise]",
    "🥺": "[pleading]",
    "😦": "[fear]", "😧": "[fear]", "😨": "[fear]", "😰": "[fear]", "😱": "[fear]",
    "😖": "[confusion]", "😣": "[confusion]", "😞": "[confusion]",
    "😤": "[anger]", "😡": "[anger]", "😠": "[anger]", "🤬": "[anger]",
    "😈": "[mischievous]", "👿": "[mischievous]"
}


# ========== PYTORCH RNN MODEL ==========
class SimpleRNN(nn.Module):
    """Embedding -> single-layer LSTM -> linear classifier."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        """Build the layers.

        Args:
            vocab_size: Number of rows of the embedding table.
            embedding_dim: Size of each embedding vector.
            hidden_dim: Size of the LSTM hidden state.
            output_dim: Number of output classes.
        """
        super(SimpleRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return class logits for a batch of index sequences."""
        embedded = self.embedding(x)
        _, (hidden, _) = self.rnn(embedded)
        # hidden's leading dimension has size 1 for this single-layer LSTM;
        # indexing the last entry yields the (batch, hidden_dim) state.
        return self.fc(hidden[-1])


# ========== PREDICTION WITH THE PYTORCH RNN MODEL ==========
def predict_emotion_rnn(model, text, data_manager, label_mapping, device, maxlen=400):
    """Predict the emotion label of ``text`` with the PyTorch model.

    Args:
        model: Trained PyTorch model returning class logits.
        text: Raw input sentence.
        data_manager: ``DataManager`` providing abbreviations and vocabulary.
        label_mapping: Mapping from label to class id used in training.
        device: Torch device the model lives on.
        maxlen: Padded sequence length; should match the training value.

    Returns:
        The label whose id has the highest logit.
    """
    model.eval()
    with torch.no_grad():
        processed_text = preprocess_sentence(
            text, data_manager.abbreviations, emoji_mapping
        )
        tokenized_text = data_manager.vocabulary.tokenize_corpus([processed_text])
        token_ids = data_manager.vocabulary.corpus_to_tensor(
            tokenized_text, is_tokenized=True
        )
        text_tensor = torch.tensor(
            pad_sequences(token_ids, maxlen=maxlen), dtype=torch.long
        ).to(device)

        output = model(text_tensor)
        _, predicted = torch.max(output, 1)

        reverse_label_mapping = {v: k for k, v in label_mapping.items()}
        return reverse_label_mapping[predicted.item()]


# ========== PREDICTION WITH THE KERAS CNN-LSTM MODEL ==========
def predict_emotion_cnn_lstm(model, text, data_manager, label_mapping, maxlen=400):
    """Predict the emotion label of ``text`` with the Keras model.

    Args:
        model: Trained Keras model returning class probabilities.
        text: Raw input sentence.
        data_manager: ``DataManager`` providing abbreviations and vocabulary.
        label_mapping: Mapping from label to class id used in training.
        maxlen: Padded sequence length; should match the training value.

    Returns:
        The label whose id has the highest predicted score.
    """
    processed_text = preprocess_sentence(
        text, data_manager.abbreviations, emoji_mapping
    )
    tokenized_text = data_manager.vocabulary.tokenize_corpus([processed_text])
    token_ids = data_manager.vocabulary.corpus_to_tensor(
        tokenized_text, is_tokenized=True
    )
    text_tensor = pad_sequences(token_ids, maxlen=maxlen)

    output = model.predict(text_tensor)
    predicted = output.argmax(axis=1)[0]

    reverse_label_mapping = {v: k for k, v in label_mapping.items()}
    return reverse_label_mapping[predicted]


# ========== MAIN (TRIAL RUN) ==========
if __name__ == "__main__":
    # --------------------------
    # Change the paths here:
    # --------------------------
    file_path = "train.xlsx"  # source Excel file (columns "Sentence", "Emotion", ...)
    abbreviations_path = "abbreviations.json"
    word2vec_path = "/home/datpham/datpham/thesis-ngtram/word2vec_vi_syllables_100dims.txt"
    output_path = "processed.xlsx"

    data_manager = DataManager(
        file_path=file_path,
        abbreviations_path=abbreviations_path,
        word2vec_path=word2vec_path
    )

    # 1) Read and preprocess.
    df = data_manager.preprocess_data()

    print("Trước khi undersampling:")
    print(df["Emotion"].value_counts())

    # 2) UNDERSAMPLING (example).
    # Adjust the emotion names to match your dataset. Rows with any other
    # label are dropped. The order below is the concatenation order, which
    # determines the result of the seeded shuffle.
    emotion_names = [
        "Enjoyment", "Other", "Anger", "Sadness", "Disgust", "Fear", "Surprise"
    ]
    max_enjoyment_samples = 2000

    class_frames = []
    for emotion_name in emotion_names:
        class_df = df[df["Emotion"] == emotion_name]
        # Example: keep at most 2000 samples of 'Enjoyment'.
        if emotion_name == "Enjoyment" and len(class_df) > max_enjoyment_samples:
            class_df = class_df.sample(n=max_enjoyment_samples, random_state=42)
        class_frames.append(class_df)

    df_balanced = pd.concat(class_frames, axis=0)
    df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
    df = df_balanced

    print("\nSau khi undersampling:")
    print(df["Emotion"].value_counts())

    df.to_excel(output_path, index=False)

    # 3) Build the PyTorch data loaders.
    maxlen = 400
    train_loader, test_loader, label_mapping = data_manager.split_and_convert(
        df, label_column="Emotion", maxlen=maxlen, for_keras=False
    )

    vocab_size = len(data_manager.vocabulary)
    embedding_dim = 100
    hidden_dim = 128
    output_dim = len(label_mapping)

    model_rnn = SimpleRNN(vocab_size, embedding_dim, hidden_dim, output_dim)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model_rnn.parameters())

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_rnn.to(device)

    num_epochs = 20
    for epoch in range(num_epochs):
        model_rnn.train()
        epoch_loss = 0
        correct = 0
        total = 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()
            predictions = model_rnn(X_batch)
            loss = criterion(predictions, y_batch)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            _, predicted = torch.max(predictions, 1)
            correct += (predicted == y_batch).sum().item()
            total += y_batch.size(0)

        print(f"Epoch {epoch+1}/{num_epochs}, "
              f"Loss: {epoch_loss/len(train_loader):.4f}, "
              f"Accuracy: {correct/total:.4f}")

    # Evaluate the RNN on the test set.
    model_rnn.eval()
    test_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            predictions = model_rnn(X_batch)
            loss = criterion(predictions, y_batch)
            test_loss += loss.item()

            _, predicted = torch.max(predictions, 1)
            correct += (predicted == y_batch).sum().item()
            total += y_batch.size(0)

    print(f"Test Loss: {test_loss/len(test_loader):.4f}, "
          f"Test Accuracy: {correct/total:.4f}")

    # ========== CNN-LSTM (Keras) ==========
    from keras.models import Model
    from keras.layers import (
        Input, Embedding, Convolution1D, GlobalMaxPooling1D, LSTM, Dense,
        Dropout, concatenate
    )
    from keras.optimizers import Adam
    from keras.callbacks import ModelCheckpoint

    print("Training CNN-LSTM...")
    X_train, X_test, y_train, y_test, label_mapping = data_manager.split_and_convert(
        df, label_column="Emotion", maxlen=maxlen, for_keras=True
    )

    input_layer = Input(shape=(maxlen,), dtype='int32', name='main_input')
    emb_layer = Embedding(len(data_manager.vocabulary), embedding_dim)(input_layer)

    # Max over the time axis of each convolution output. The built-in pooling
    # layer is used instead of a Lambda around a local function so the saved
    # model can be reloaded without supplying custom objects.
    con3_layer = Convolution1D(150, kernel_size=3, activation='relu')(emb_layer)
    pool_con3_layer = GlobalMaxPooling1D()(con3_layer)

    con5_layer = Convolution1D(150, kernel_size=5, activation='relu')(emb_layer)
    pool_con5_layer = GlobalMaxPooling1D()(con5_layer)

    lstm_layer = LSTM(128)(emb_layer)

    cnn_lstm_layer = concatenate([pool_con3_layer, pool_con5_layer, lstm_layer])
    dense_layer = Dense(100, activation='relu')(cnn_lstm_layer)
    dropout_layer = Dropout(0.2)(dense_layer)
    output_layer = Dense(len(label_mapping), activation='softmax')(dropout_layer)

    model_cnn_lstm = Model(inputs=input_layer, outputs=output_layer)
    model_cnn_lstm.compile(
        loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy']
    )

    checkpoint = ModelCheckpoint(
        'cnn_lstm_best.keras', save_best_only=True,
        monitor='val_accuracy', mode='max'
    )
    model_cnn_lstm.fit(
        X_train, y_train,
        validation_data=(X_test, y_test),
        batch_size=32,
        epochs=20,
        callbacks=[checkpoint]
    )

    model_cnn_lstm.save('cnn_lstm_model.keras')

    loss, accuracy = model_cnn_lstm.evaluate(X_test, y_test)
    print(f"CNN-LSTM Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")

    # Demo: predict one new sentence.
    custom_text = "Tôi rất vui khi sử dụng dịch vụ này!"

    # RNN (PyTorch)
    emotion_rnn = predict_emotion_rnn(
        model_rnn, custom_text, data_manager, label_mapping, device, maxlen=maxlen
    )
    print(f"Predicted Emotion (RNN): {emotion_rnn}")

    # CNN-LSTM (Keras)
    cnn_lstm_model = tf.keras.models.load_model('cnn_lstm_model.keras')
    emotion_cnn_lstm = predict_emotion_cnn_lstm(
        cnn_lstm_model, custom_text, data_manager, label_mapping, maxlen=maxlen
    )
    print(f"Predicted Emotion (CNN-LSTM): {emotion_cnn_lstm}")

    # Report the TF version and visible GPUs.
    print("TF version:", tf.__version__)
    print("GPU devices:", tf.config.list_physical_devices("GPU"))

    # CUDA/GPU can also be checked with a system command if desired:
    # import os
    # os.system("nvidia-smi")