# thesis.py
# -*- coding: utf-8 -*-
"""Vietnamese emotion classification: preprocessing, vocabulary and BiLSTM training.

The module provides:

* sentence-level preprocessing helpers (emoji mapping, profanity removal,
  abbreviation expansion, number masking, tokenization),
* a small ``Vocabulary`` class mapping tokens to integer ids,
* a ``DataManager`` that loads the dataset, builds the vocabulary, loads
  pretrained word vectors and produces train/validation/test splits,
* a script entry point that trains and evaluates a Keras BiLSTM classifier.
"""
import json
import os
import re
from collections import Counter

import pandas as pd
import emoji
import numpy as np
from underthesea import word_tokenize
from tqdm import tqdm
import torch
from torchtext.vocab import Vectors
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)
from tensorflow.keras.preprocessing.sequence import pad_sequences
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
import tensorflow as tf


# ========== PREPROCESSING HELPERS ==========

def preprocess_sentence(sentence, abbreviations, emoji_mapping):
    """Run the full preprocessing pipeline on a single sentence.

    Steps, in order: lowercase, replace emojis, remove profanity, remove
    special characters, normalize whitespace, expand abbreviations, collapse
    repeated characters, mask numbers, tokenize.

    Args:
        sentence: Raw input text.
        abbreviations: Mapping from abbreviation to its expansion.
        emoji_mapping: Mapping from a single emoji character to a tag string.

    Returns:
        The processed sentence as a single space-joined string.
    """
    sentence = sentence.lower()
    sentence = replace_emojis(sentence, emoji_mapping)
    sentence = remove_profanity(sentence)
    sentence = remove_special_characters(sentence)
    sentence = normalize_whitespace(sentence)
    sentence = replace_abbreviations(sentence, abbreviations)
    sentence = remove_repeated_characters(sentence)
    sentence = replace_numbers(sentence)
    sentence = tokenize_sentence(sentence)
    return sentence


def replace_emojis(sentence, emoji_mapping):
    """Replace mapped emojis with their tag and drop every other emoji.

    Characters that are neither in ``emoji_mapping`` nor recognized by
    ``emoji.is_emoji`` are kept unchanged.
    """
    pieces = []
    for char in sentence:
        if char in emoji_mapping:
            pieces.append(emoji_mapping[char])
        elif not emoji.is_emoji(char):
            pieces.append(char)
    return "".join(pieces)


# Terms removed by ``remove_profanity``. Entries containing a space are
# phrases and cannot be matched token by token, so they are handled by regex.
_PROFANE_TERMS = (
    "loz", "vloz", "vl", "dm", "đm", "clgt", "dmm", "cc", "vc", "đù mé", "vãi",
)
_PROFANE_WORDS = frozenset(term for term in _PROFANE_TERMS if " " not in term)
_PROFANE_PHRASE_RE = re.compile(
    r"(?<!\S)(?:"
    + "|".join(re.escape(term) for term in _PROFANE_TERMS if " " in term)
    + r")(?!\S)",
    re.IGNORECASE,
)


def remove_profanity(sentence):
    """Remove profane words and phrases; return the rest joined by single spaces.

    Matching is case-insensitive and only applies to whole
    whitespace-delimited tokens (or whole token sequences for phrases).
    """
    normalized = " ".join(sentence.split())
    # Phrases first: after splitting on whitespace a multi-word entry such as
    # "đù mé" could never equal a single token.
    normalized = _PROFANE_PHRASE_RE.sub(" ", normalized)
    kept = [
        word for word in normalized.split()
        if word.lower() not in _PROFANE_WORDS
    ]
    return " ".join(kept)


def remove_special_characters(sentence):
    """Delete the characters ``^ * @ # & $ % < > ~ { } | \\`` from the sentence."""
    return re.sub(r"[\^\*@#&$%<>~{}|\\]", "", sentence)


def normalize_whitespace(sentence):
    """Collapse whitespace runs to single spaces and strip both ends."""
    return " ".join(sentence.split())


def replace_abbreviations(sentence, abbreviations):
    """Expand every whitespace-delimited token found in ``abbreviations``.

    Args:
        sentence: Input text.
        abbreviations: Mapping from abbreviation to expansion. The expansion
            may be a sequence of words (joined with spaces) or a plain string
            (used as-is).
            NOTE(review): the JSON schema is not visible here; confirm which
            form the abbreviations file uses.

    Returns:
        The sentence with abbreviations expanded, joined by single spaces.
    """
    expanded = []
    for word in sentence.split():
        if word in abbreviations:
            expansion = abbreviations[word]
            # Joining a plain string would insert a space between every
            # character, so only join real sequences of words.
            if not isinstance(expansion, str):
                expansion = " ".join(expansion)
            expanded.append(expansion)
        else:
            expanded.append(word)
    return " ".join(expanded)


def remove_repeated_characters(sentence):
    """Collapse a character repeated 3+ times in a row to a single occurrence.

    Example: "đẹp quáaaaaaa" -> "đẹp quá".
    """
    return re.sub(r"(.)\1{2,}", r"\1", sentence)


def replace_numbers(sentence):
    """Replace every run of digits with the token ``[number]``."""
    return re.sub(r"\d+", "[number]", sentence)


def tokenize_sentence(sentence):
    """Tokenize with underthesea and join the tokens with single spaces."""
    return " ".join(word_tokenize(sentence))


# ========== VOCABULARY CLASS ==========

class Vocabulary:
    """Bidirectional token <-> integer id mapping.

    Id 0 is the padding token and id 1 is the unknown token; every lookup of
    a token that was never added returns ``unk_id``.
    """

    PAD_TOKEN = "<pad>"
    UNK_TOKEN = "<unk>"

    def __init__(self):
        # The two special tokens must be distinct strings: if they shared a
        # key, the dictionary would hold a single entry and the first real
        # word added would receive id 1, colliding with ``unk_id``.
        self.word2id = {self.PAD_TOKEN: 0, self.UNK_TOKEN: 1}
        self.unk_id = 1
        self.id2word = {0: self.PAD_TOKEN, 1: self.UNK_TOKEN}

    def __getitem__(self, word):
        return self.word2id.get(word, self.unk_id)

    def __contains__(self, word):
        return word in self.word2id

    def __len__(self):
        return len(self.word2id)

    def lookup_tokens(self, indices):
        """Return the tokens for ``indices``; raises ``KeyError`` on unknown ids."""
        return [self.id2word[idx] for idx in indices]

    def add(self, word):
        """Add ``word`` with the next free id; no-op if it is already present."""
        if word not in self.word2id:
            idx = len(self.word2id)
            self.word2id[word] = idx
            self.id2word[idx] = word

    @staticmethod
    def tokenize_corpus(corpus):
        """Tokenize each document with underthesea.

        Spaces inside a token are replaced with underscores.
        """
        tokenized_corpus = []
        for doc in tqdm(corpus, desc="Tokenizing Corpus"):
            tokens = [w.replace(" ", "_") for w in word_tokenize(doc)]
            tokenized_corpus.append(tokens)
        return tokenized_corpus

    def corpus_to_tensor(self, corpus, is_tokenized=False):
        """Convert a corpus to lists of token ids.

        Args:
            corpus: List of sentences (strings), or list of token lists when
                ``is_tokenized`` is True.
            is_tokenized: Whether ``corpus`` is already tokenized.

        Returns:
            ``list[list[int]]`` with one list of token ids per document.
        """
        tokenized_corpus = corpus if is_tokenized else self.tokenize_corpus(corpus)
        return [[self[token] for token in doc] for doc in tokenized_corpus]


# ========== EMOJI MAPPING ==========

emoji_mapping = {
    "😀": "[joy]", "😃": "[joy]", "😄": "[joy]", "😁": "[joy]",
    "😆": "[joy]", "😅": "[joy]", "😂": "[joy]", "🤣": "[joy]",
    "🙂": "[love]", "🙃": "[love]", "😉": "[love]", "😊": "[love]",
    "😇": "[love]", "🥰": "[love]", "😍": "[love]", "🤩": "[love]",
    "😘": "[love]", "😗": "[love]", "☺": "[love]", "😚": "[love]",
    "😙": "[love]",
    "😋": "[satisfaction]", "😛": "[satisfaction]", "😜": "[satisfaction]",
    "🤪": "[satisfaction]", "😝": "[satisfaction]", "🤑": "[satisfaction]",
    "🤐": "[neutral]", "🤨": "[neutral]", "😐": "[neutral]",
    "😑": "[neutral]", "😶": "[neutral]",
    "😏": "[sarcasm]",
    "😒": "[disappointment]", "🙄": "[disappointment]", "😬": "[disappointment]",
    "😔": "[sadness]", "😪": "[sadness]", "😢": "[sadness]",
    "😭": "[sadness]", "😥": "[sadness]", "😓": "[sadness]",
    "😩": "[tiredness]", "😫": "[tiredness]", "🥱": "[tiredness]",
    "🤤": "[discomfort]", "🤢": "[discomfort]", "🤮": "[discomfort]",
    "🤧": "[discomfort]", "🥵": "[discomfort]", "🥶": "[discomfort]",
    "🥴": "[discomfort]", "😵": "[discomfort]", "🤯": "[discomfort]",
    "😕": "[confused]", "😟": "[confused]", "🙁": "[confused]",
    "☹": "[confused]",
    "😮": "[surprise]", "😯": "[surprise]", "😲": "[surprise]",
    "😳": "[surprise]",
    "🥺": "[pleading]",
    "😦": "[fear]", "😧": "[fear]", "😨": "[fear]", "😰": "[fear]",
    "😱": "[fear]",
    "😖": "[confusion]", "😣": "[confusion]", "😞": "[confusion]",
    "😤": "[anger]", "😡": "[anger]", "😠": "[anger]", "🤬": "[anger]",
    "😈": "[mischievous]", "👿": "[mischievous]",
}


# ========== DATA MANAGER ==========

class DataManager:
    """Load the dataset, build the vocabulary/embeddings and create data splits."""

    def __init__(self, file_path, abbreviations_path, word2vec_path):
        """Store the paths and eagerly load the abbreviations JSON file.

        Args:
            file_path: Path of the Excel dataset read by ``preprocess_data``.
            abbreviations_path: Path of the abbreviations JSON file.
            word2vec_path: Path of the pretrained word-vector file.
        """
        self.file_path = file_path
        self.abbreviations_path = abbreviations_path
        self.word2vec_path = word2vec_path
        self.vocabulary = None
        self.word_embeddings = None
        self.abbreviations = None
        self.load_abbreviations()

    def load_abbreviations(self):
        """Read the abbreviations JSON file into ``self.abbreviations``."""
        with open(self.abbreviations_path, "r", encoding="utf-8") as f:
            self.abbreviations = json.load(f)

    def load_word2vec(self):
        """Load pretrained vectors from ``word2vec_path`` via ``torchtext`` ``Vectors``."""
        self.word_embeddings = Vectors(
            name=self.word2vec_path,
            unk_init=torch.Tensor.normal_,
        )

    def create_vocab_from_corpus(self, corpus, max_vocab_size=30000):
        """Build a vocabulary from the ``max_vocab_size`` most frequent tokens.

        Tokens are obtained by splitting each sentence on whitespace.

        Args:
            corpus: Iterable of preprocessed sentences (strings).
            max_vocab_size: Maximum number of corpus tokens to keep (the two
                special tokens are added on top of this).

        Returns:
            A populated ``Vocabulary``.
        """
        vocab = Vocabulary()
        counter = Counter()
        for sent in corpus:
            counter.update(sent.split())
        for word, _freq in counter.most_common(max_vocab_size):
            vocab.add(word)
        return vocab

    def preprocess_data(self):
        """Read the dataset, preprocess sentences, build the vocabulary, load vectors.

        Returns:
            The DataFrame with an added ``processed_sentence`` column; rows
            whose processed sentence is empty are dropped.

        Raises:
            ValueError: If the dataset has no ``Sentence`` column.
        """
        df = pd.read_excel(self.file_path)
        if "Sentence" not in df.columns:
            raise ValueError("Cột 'Sentence' không tồn tại trong dataset!")

        # Preprocess every sentence.
        df["processed_sentence"] = df["Sentence"].apply(
            lambda x: preprocess_sentence(str(x), self.abbreviations, emoji_mapping)
        )

        # Drop rows that became empty; copy so later column assignments on
        # the result do not operate on a view of the original frame.
        df = df[df["processed_sentence"].str.strip().astype(bool)].copy()

        # Build the vocabulary from the data itself.
        all_sentences = df["processed_sentence"].tolist()
        self.vocabulary = self.create_vocab_from_corpus(
            all_sentences, max_vocab_size=30000
        )

        # Load the pretrained vectors.
        self.load_word2vec()
        return df

    def build_pretrained_embedding_matrix(self, embedding_dim=100):
        """Create a ``(vocab_size, embedding_dim)`` float32 weight matrix.

        Rows are initialized from a normal distribution (scale 0.1) and then
        overwritten with the pretrained vector for every vocabulary word
        present in the loaded embeddings.

        Raises:
            RuntimeError: If the vocabulary or embeddings are not loaded yet.
            ValueError: If the pretrained vectors do not have ``embedding_dim``
                columns.
        """
        if self.vocabulary is None or self.word_embeddings is None:
            raise RuntimeError(
                "Vocabulary and word embeddings must be loaded first "
                "(call preprocess_data())."
            )

        vectors = self.word_embeddings.vectors
        if vectors.shape[1] != embedding_dim:
            raise ValueError(
                f"embedding_dim={embedding_dim} does not match pretrained "
                f"vector size {vectors.shape[1]}"
            )

        vocab_size = len(self.vocabulary)
        weight_matrix = np.random.normal(
            scale=0.1, size=(vocab_size, embedding_dim)
        ).astype(np.float32)

        # Copy the pretrained vectors where available.
        stoi = self.word_embeddings.stoi
        for word, idx in self.vocabulary.word2id.items():
            if word in stoi:
                weight_matrix[idx] = vectors[stoi[word]].numpy()
        return weight_matrix

    def _encode_sentences(self, sentences, maxlen):
        """Map preprocessed sentences to a padded/truncated array of token ids."""
        # Split on whitespace exactly like create_vocab_from_corpus does.
        # Re-tokenizing through Vocabulary.tokenize_corpus would join
        # multi-syllable tokens with "_", and such tokens are never in the
        # vocabulary, so they would all map to the unknown id.
        tokenized = [sentence.split() for sentence in sentences]
        ids = self.vocabulary.corpus_to_tensor(tokenized, is_tokenized=True)
        return pad_sequences(ids, maxlen=maxlen, padding="post", truncating="post")

    def split_and_convert(
        self,
        df,
        label_column="Emotion",
        maxlen=400,
        test_size=0.2,
        for_keras=False,
        batch_size=32,
    ):
        """Split the data and convert it to model inputs.

        The split is stratified on the label. ``df`` is not modified.

        Args:
            df: DataFrame with ``processed_sentence`` and ``label_column``.
            label_column: Name of the label column.
            maxlen: Length every sequence is padded/truncated to.
            test_size: Fraction of the data held out as the test set.
            for_keras: Selects the return format (see below).
            batch_size: Batch size of the PyTorch ``DataLoader`` objects.

        Returns:
            ``for_keras=True``: ``(X_train, X_test, y_train_onehot,
            y_test_onehot, label_mapping)`` as padded arrays and one-hot labels.

            ``for_keras=False``: ``(train_loader, val_loader, test_loader,
            label_mapping)``; 10% of the training portion becomes the
            validation set.

        Raises:
            ValueError: If ``label_column`` is missing or a label cannot be
                mapped to an id.
        """
        if label_column not in df.columns:
            raise ValueError(
                f"Cột '{label_column}' không tồn tại. Hiện có: {df.columns.tolist()}"
            )

        # Label -> integer id, in order of first appearance.
        label_mapping = {
            label: idx for idx, label in enumerate(df[label_column].unique())
        }
        # Encode into a separate Series: overwriting df[label_column] would
        # corrupt the caller's frame and break a second call on the same data.
        encoded_labels = df[label_column].map(label_mapping)
        if encoded_labels.isnull().any():
            missing = df.loc[encoded_labels.isnull(), label_column].unique()
            raise ValueError(
                f"Những nhãn cảm xúc sau không có trong label_mapping: {missing}"
            )

        X = df["processed_sentence"].tolist()
        y = encoded_labels.tolist()

        # Stratify to keep the class distribution in both parts.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42, stratify=y
        )

        X_val, y_val = None, None
        if not for_keras:
            # Carve a validation set out of the training portion.
            X_train, X_val, y_train, y_val = train_test_split(
                X_train, y_train, test_size=0.1, random_state=42, stratify=y_train
            )

        # Text -> padded id arrays.
        X_train_padded = self._encode_sentences(X_train, maxlen)
        X_test_padded = self._encode_sentences(X_test, maxlen)
        X_val_padded = None
        if not for_keras:
            X_val_padded = self._encode_sentences(X_val, maxlen)

        print(">>> Debug Split and Convert:")
        print("X_train_padded.shape:", X_train_padded.shape)
        print("X_val_padded.shape: ", X_val_padded.shape if not for_keras else "N/A")
        print("X_test_padded.shape: ", X_test_padded.shape)
        print("y_train length:", len(y_train))
        print("y_val length: ", len(y_val) if not for_keras else "N/A")
        print("y_test length: ", len(y_test))
        print("vocab_size:", len(self.vocabulary))

        if for_keras:
            num_classes = len(label_mapping)
            y_train_onehot = tf.keras.utils.to_categorical(
                y_train, num_classes=num_classes
            )
            y_test_onehot = tf.keras.utils.to_categorical(
                y_test, num_classes=num_classes
            )
            print("y_train_onehot.shape:", y_train_onehot.shape)
            print("y_test_onehot.shape: ", y_test_onehot.shape)
            return (
                X_train_padded,
                X_test_padded,
                y_train_onehot,
                y_test_onehot,
                label_mapping,
            )

        X_train_t = torch.tensor(X_train_padded, dtype=torch.long)
        X_val_t = torch.tensor(X_val_padded, dtype=torch.long)
        X_test_t = torch.tensor(X_test_padded, dtype=torch.long)
        y_train_t = torch.tensor(y_train, dtype=torch.long)
        y_val_t = torch.tensor(y_val, dtype=torch.long)
        y_test_t = torch.tensor(y_test, dtype=torch.long)

        train_ds = TensorDataset(X_train_t, y_train_t)
        val_ds = TensorDataset(X_val_t, y_val_t)
        test_ds = TensorDataset(X_test_t, y_test_t)

        train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)
        test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)
        return train_loader, val_loader, test_loader, label_mapping


# ========== KERAS BI-LSTM MODEL ==========

def predict_emotion_bilstm(model, text, data_manager, label_mapping, maxlen=400):
    """Predict the emotion label of a single raw sentence.

    Args:
        model: Trained model exposing ``predict``.
        text: Raw input sentence.
        data_manager: ``DataManager`` holding the abbreviations and vocabulary.
        label_mapping: Mapping label -> class id used during training.
        maxlen: Sequence length; must match the length the model was trained on.

    Returns:
        The label whose class id has the highest predicted score.
    """
    processed_text = preprocess_sentence(
        text, data_manager.abbreviations, emoji_mapping
    )
    # Same whitespace tokenization as used to build the vocabulary and to
    # encode the training data.
    text_ids = data_manager.vocabulary.corpus_to_tensor(
        [processed_text.split()], is_tokenized=True
    )
    text_padded = pad_sequences(
        text_ids, maxlen=maxlen, padding="post", truncating="post"
    )

    output = model.predict(text_padded)
    pred = int(output.argmax(axis=1)[0])

    rev_map = {v: k for k, v in label_mapping.items()}
    return rev_map[pred]


# ========== MAIN ==========

def main():
    """Preprocess the data, train and evaluate the Keras BiLSTM, save all artifacts."""
    from keras.models import Model
    from keras.layers import (
        Input, Embedding, Dense, Dropout, Bidirectional, LSTM
    )
    from keras.optimizers import Adam
    from keras.callbacks import ModelCheckpoint, EarlyStopping

    # -------- PATHS ----------
    file_path = "train.xlsx"
    abbreviations_path = "abbreviations.json"
    word2vec_path = "word2vec_vi_syllables_100dims.txt"
    output_path = "processed.xlsx"

    data_manager = DataManager(
        file_path=file_path,
        abbreviations_path=abbreviations_path,
        word2vec_path=word2vec_path,
    )

    # 1) Preprocess, build the vocabulary, load word2vec.
    df = data_manager.preprocess_data()

    print("Trước khi cân bằng lớp (undersampling/oversampling):")
    print(df["Emotion"].value_counts())

    # 2) Class balancing (demo: oversample 'Other' up to 3000 rows).
    # Only the labels listed here are kept; the concat order is preserved
    # because it influences the seeded shuffle below.
    # NOTE(review): oversampling happens before the train/test split, so
    # duplicated 'Other' rows can land in both sets. Consider oversampling
    # only the training portion.
    class_labels = (
        "Enjoyment", "Other", "Anger", "Sadness", "Disgust", "Fear", "Surprise",
    )
    class_frames = {label: df[df["Emotion"] == label] for label in class_labels}

    df_other = class_frames["Other"]
    if 0 < len(df_other) < 3000:
        class_frames["Other"] = resample(
            df_other, replace=True, n_samples=3000, random_state=42
        )

    df_balanced = pd.concat(
        [class_frames[label] for label in class_labels], axis=0
    )
    df = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

    print("\nSau khi cân bằng lớp (demo oversample):")
    print(df["Emotion"].value_counts())

    # Export the processed data.
    df.to_excel(output_path, index=False)

    # ========== TRAIN KERAS BI-LSTM ==========
    print("\n========== Training Keras BiLSTM ==========")

    maxlen = 400

    # Pretrained embedding matrix (already float32).
    pretrained_matrix = data_manager.build_pretrained_embedding_matrix(
        embedding_dim=100
    )

    X_train, X_test, y_train, y_test, label_mapping = data_manager.split_and_convert(
        df,
        label_column="Emotion",
        maxlen=maxlen,
        test_size=0.2,
        for_keras=True,
    )

    num_classes = len(label_mapping)
    input_dim = len(data_manager.vocabulary)
    embedding_dim = pretrained_matrix.shape[1]

    def create_bilstm_model():
        """Build and compile the BiLSTM classifier."""
        input_layer = Input(shape=(maxlen,), dtype="int32", name="main_input")
        emb_layer = Embedding(
            input_dim=input_dim,
            output_dim=embedding_dim,
            weights=[pretrained_matrix],
            input_length=maxlen,
            trainable=True,  # set to False to keep the embeddings frozen
        )(input_layer)

        bilstm = Bidirectional(
            LSTM(128, dropout=0.5, recurrent_dropout=0.5)
        )(emb_layer)
        dense1 = Dense(64, activation="relu")(bilstm)
        dropout1 = Dropout(0.5)(dense1)
        dense2 = Dense(32, activation="relu")(dropout1)
        dropout2 = Dropout(0.5)(dense2)
        output_layer = Dense(num_classes, activation="softmax")(dropout2)

        model = Model(inputs=input_layer, outputs=output_layer)
        model.compile(
            loss="categorical_crossentropy",
            # `lr` is the deprecated spelling; `learning_rate` is the
            # supported argument name.
            optimizer=Adam(learning_rate=1e-3),
            metrics=["accuracy"],
        )
        return model

    model_bilstm = create_bilstm_model()
    model_bilstm.summary()

    checkpoint = ModelCheckpoint(
        "bilstm_best.keras",
        save_best_only=True,
        monitor="val_accuracy",
        mode="max",
    )
    early_stopping = EarlyStopping(
        monitor="val_accuracy",
        patience=5,
        restore_best_weights=True,
    )

    # Validate on a slice of the training data. Using the test set for early
    # stopping / checkpoint selection would make the test metrics optimistic.
    model_bilstm.fit(
        X_train,
        y_train,
        validation_split=0.1,
        epochs=100,
        batch_size=32,
        callbacks=[checkpoint, early_stopping],
    )

    # Evaluate on the held-out test set.
    loss, acc = model_bilstm.evaluate(X_test, y_test)
    print(f"BiLSTM Test Loss: {loss:.4f}, Test Accuracy: {acc:.4f}")

    # Collect predictions and compute detailed metrics.
    y_pred_bilstm = np.argmax(model_bilstm.predict(X_test), axis=1)
    y_true_bilstm = np.argmax(y_test, axis=1)

    test_accuracy_bilstm = accuracy_score(y_true_bilstm, y_pred_bilstm)
    precision_macro_bilstm = precision_score(
        y_true_bilstm, y_pred_bilstm, average="macro", zero_division=0
    )
    precision_weighted_bilstm = precision_score(
        y_true_bilstm, y_pred_bilstm, average="weighted", zero_division=0
    )
    recall_macro_bilstm = recall_score(
        y_true_bilstm, y_pred_bilstm, average="macro", zero_division=0
    )
    recall_weighted_bilstm = recall_score(
        y_true_bilstm, y_pred_bilstm, average="weighted", zero_division=0
    )
    f1_macro_bilstm = f1_score(
        y_true_bilstm, y_pred_bilstm, average="macro", zero_division=0
    )
    f1_weighted_bilstm = f1_score(
        y_true_bilstm, y_pred_bilstm, average="weighted", zero_division=0
    )

    # Pass the class ids explicitly so the report rows always line up with
    # the label names, even if some class never occurs in y_true/y_pred.
    class_ids = list(range(num_classes))
    class_names = [str(label) for label in label_mapping]
    report_bilstm = classification_report(
        y_true_bilstm,
        y_pred_bilstm,
        labels=class_ids,
        target_names=class_names,
        digits=4,
        zero_division=0,
    )
    conf_matrix_bilstm = confusion_matrix(
        y_true_bilstm, y_pred_bilstm, labels=class_ids
    )

    print(f"\nBiLSTM Test Accuracy: {test_accuracy_bilstm:.4f}")
    print(f"Precision (Macro): {precision_macro_bilstm:.4f}")
    print(f"Precision (Weighted): {precision_weighted_bilstm:.4f}")
    print(f"Recall (Macro): {recall_macro_bilstm:.4f}")
    print(f"Recall (Weighted): {recall_weighted_bilstm:.4f}")
    print(f"F1-Score (Macro): {f1_macro_bilstm:.4f}")
    print(f"F1-Score (Weighted): {f1_weighted_bilstm:.4f}")
    print("\n========== BiLSTM Classification Report ==========")
    print(report_bilstm)
    print("\n========== BiLSTM Confusion Matrix ==========")
    print(conf_matrix_bilstm)

    # Save the report to a file.
    bilstm_report_dir = "bilstm_emotion_model"
    os.makedirs(bilstm_report_dir, exist_ok=True)
    report_path = os.path.join(bilstm_report_dir, "classification_report.txt")
    with open(report_path, "w", encoding="utf-8") as f:
        f.write("========== BiLSTM Classification Report ==========\n")
        f.write(report_bilstm)
        f.write("\n========== Additional Metrics ==========\n")
        f.write(f"Test Loss: {loss:.4f}\n")
        f.write(f"Test Accuracy: {test_accuracy_bilstm:.4f}\n")
        f.write(f"Precision (Macro): {precision_macro_bilstm:.4f}\n")
        f.write(f"Precision (Weighted): {precision_weighted_bilstm:.4f}\n")
        f.write(f"Recall (Macro): {recall_macro_bilstm:.4f}\n")
        f.write(f"Recall (Weighted): {recall_weighted_bilstm:.4f}\n")
        f.write(f"F1-Score (Macro): {f1_macro_bilstm:.4f}\n")
        f.write(f"F1-Score (Weighted): {f1_weighted_bilstm:.4f}\n")
        f.write("\n========== Confusion Matrix ==========\n")
        f.write(np.array2string(conf_matrix_bilstm))
    print("\n========== BiLSTM Classification Report saved to 'bilstm_emotion_model/classification_report.txt' ==========")

    # Save the BiLSTM model.
    model_bilstm.save(os.path.join(bilstm_report_dir, "bilstm_model.keras"))
    print(f"========== BiLSTM Model saved to '{bilstm_report_dir}/bilstm_model.keras' ==========")

    # ========== DEMO: PREDICT ONE NEW SENTENCE ==========
    custom_text = "Tôi rất vui khi sử dụng dịch vụ này!"

    emotion_bilstm = predict_emotion_bilstm(
        model_bilstm, custom_text, data_manager, label_mapping, maxlen=maxlen
    )
    print(f"Predicted Emotion (BiLSTM): {emotion_bilstm}")

    # Report the TensorFlow version and the visible GPUs.
    print("TF version:", tf.__version__)
    print("GPU devices:", tf.config.list_physical_devices("GPU"))

    # ========== SAVE LABEL MAPPING AND VOCABULARY ==========
    with open(
        os.path.join(bilstm_report_dir, "label_mapping.json"), "w", encoding="utf-8"
    ) as f:
        json.dump(label_mapping, f, ensure_ascii=False, indent=4)

    with open(
        os.path.join(bilstm_report_dir, "vocabulary.json"), "w", encoding="utf-8"
    ) as f:
        json.dump(data_manager.vocabulary.word2id, f, ensure_ascii=False, indent=4)

    print("========== Label Mapping and Vocabulary saved ==========")


if __name__ == "__main__":
    main()