|
|
|
|
|
|
|
import pandas as pd |
|
import emoji |
|
import json |
|
import re |
|
import numpy as np |
|
from underthesea import word_tokenize |
|
from tqdm import tqdm |
|
import torch |
|
from torchtext.vocab import Vectors |
|
from sklearn.model_selection import train_test_split |
|
from sklearn.utils import resample |
|
from sklearn.metrics import ( |
|
accuracy_score, |
|
classification_report, |
|
precision_score, |
|
recall_score, |
|
f1_score, |
|
confusion_matrix |
|
) |
|
from tensorflow.keras.preprocessing.sequence import pad_sequences |
|
from torch.utils.data import DataLoader, TensorDataset |
|
import torch.nn as nn |
|
import torch.optim as optim |
|
import tensorflow as tf |
|
import os |
|
import joblib |
|
|
|
|
|
|
|
def preprocess_sentence(sentence, abbreviations, emoji_mapping):
    """
    Run the complete cleaning pipeline on one sentence and return the result.

    The steps are applied in this fixed order: lowercase, emoji
    replacement, profanity removal, special-character removal, whitespace
    normalisation, abbreviation expansion, collapsing of repeated
    characters, number masking and finally word tokenisation.

    Args:
        sentence: Raw input text.
        abbreviations: Lookup table passed to ``replace_abbreviations``.
        emoji_mapping: Lookup table passed to ``replace_emojis``.

    Returns:
        The cleaned sentence as a single string.
    """
    cleaned = replace_emojis(sentence.lower(), emoji_mapping)

    # Steps that only need the text itself, run before abbreviation expansion.
    for step in (remove_profanity, remove_special_characters, normalize_whitespace):
        cleaned = step(cleaned)

    cleaned = replace_abbreviations(cleaned, abbreviations)

    # Remaining text-only steps, run after abbreviation expansion.
    for step in (remove_repeated_characters, replace_numbers, tokenize_sentence):
        cleaned = step(cleaned)

    return cleaned
|
|
|
def replace_emojis(sentence, emoji_mapping):
    """
    Replace mapped emoji with their tag and drop every other emoji.

    The sentence is scanned one character at a time. A character found in
    ``emoji_mapping`` is replaced by its mapped value; any other character
    is kept only when ``emoji.is_emoji`` reports that it is not an emoji.

    Args:
        sentence: Text to process.
        emoji_mapping: Mapping from a single character to its replacement.

    Returns:
        The rewritten text.
    """
    pieces = (
        emoji_mapping[symbol] if symbol in emoji_mapping else symbol
        for symbol in sentence
        # The mapping is consulted first, so mapped characters never reach
        # the emoji library check.
        if symbol in emoji_mapping or not emoji.is_emoji(symbol)
    )
    return ''.join(pieces)
|
|
|
def remove_profanity(sentence):
    """
    Remove blocklisted profane words and phrases from a sentence.

    The sentence is split on whitespace and compared case-insensitively
    against a fixed blocklist. Blocklist entries made of several words
    (e.g. "đù mé") are matched against consecutive tokens; a per-token
    comparison alone could never match them because a single token never
    contains a space.

    Args:
        sentence: Text to filter.

    Returns:
        The surviving tokens joined by single spaces.
    """
    profane_phrases = ["loz", "vloz", "vl", "dm", "đm", "clgt", "dmm", "cc", "vc", "đù mé", "vãi"]

    # Store every entry as a tuple of tokens so that one-word and multi-word
    # entries share the same O(1) membership test.
    blocked = {tuple(phrase.split()) for phrase in profane_phrases}
    longest = max(len(entry) for entry in blocked)

    words = sentence.split()
    lowered = [word.lower() for word in words]

    kept = []
    position = 0
    while position < len(words):
        # Try the longest candidate window first so a phrase wins over any
        # shorter entry starting at the same token.
        for size in range(longest, 0, -1):
            window_end = position + size
            if window_end <= len(words) and tuple(lowered[position:window_end]) in blocked:
                position = window_end
                break
        else:
            kept.append(words[position])
            position += 1

    return ' '.join(kept)
|
|
|
def remove_special_characters(sentence):
    """
    Delete every caret, asterisk, at sign, hash, ampersand, dollar, percent,
    angle bracket, tilde, curly brace, pipe and backslash from the sentence.
    """
    special_chars = re.compile(r"[\^\*@#&$%<>~{}|\\]")
    return special_chars.sub("", sentence)
|
|
|
def normalize_whitespace(sentence):
    """
    Collapse every run of whitespace into one space and trim both ends.
    """
    # str.split() without arguments discards leading/trailing whitespace and
    # treats any whitespace run as a single separator.
    chunks = sentence.split()
    return ' '.join(chunks)
|
|
|
def replace_abbreviations(sentence, abbreviations):
    """
    Expand every whitespace-separated word that has an abbreviation entry.

    Args:
        sentence: Text to process.
        abbreviations: Mapping from an abbreviation to its expansion. The
            expansion may be a list of words (joined with spaces) or a plain
            string (used as is).

    Returns:
        The sentence with known abbreviations expanded, words joined by
        single spaces.
    """
    expanded = []
    for word in sentence.split():
        if word not in abbreviations:
            expanded.append(word)
            continue

        expansion = abbreviations[word]
        if isinstance(expansion, str):
            # Joining a string would insert a space between each of its
            # characters, so a string expansion is inserted unchanged.
            expanded.append(expansion)
        else:
            expanded.append(" ".join(expansion))

    return ' '.join(expanded)
|
|
|
def remove_repeated_characters(sentence):
    """
    Collapse any character repeated three or more times in a row to a
    single occurrence; runs of one or two characters are left untouched.
    """
    repeated_run = re.compile(r"(.)\1{2,}")
    return repeated_run.sub(r"\1", sentence)
|
|
|
def replace_numbers(sentence):
    """
    Replace each run of consecutive digits with the literal token "[number]".
    """
    digit_run = re.compile(r"\d+")
    return digit_run.sub("[number]", sentence)
|
|
|
def tokenize_sentence(sentence):
    """
    Tokenise the sentence with ``word_tokenize`` and return the tokens
    joined by single spaces.
    """
    tokens = word_tokenize(sentence)
    return ' '.join(tokens)
|
|
|
|
|
|
|
|
|
class Vocabulary:
    """
    Two-way mapping between tokens and integer ids.

    Ids 0 and 1 are reserved for the ``<pad>`` and ``<unk>`` tokens. Looking
    up a token that was never added yields the ``<unk>`` id.
    """

    def __init__(self):
        self.unk_id = 1
        self.word2id = {'<pad>': 0, '<unk>': self.unk_id}
        # Mirror of word2id for id -> token lookups.
        self.id2word = {idx: token for token, idx in self.word2id.items()}

    def __getitem__(self, word):
        """Return the id of ``word``, or the ``<unk>`` id when it is unknown."""
        if word in self.word2id:
            return self.word2id[word]
        return self.unk_id

    def __contains__(self, word):
        """Tell whether ``word`` has an id of its own."""
        return word in self.word2id

    def __len__(self):
        """Return the number of tokens, the two reserved ones included."""
        return len(self.word2id)

    def lookup_tokens(self, indices):
        """Translate a sequence of ids back into their tokens."""
        tokens = []
        for idx in indices:
            tokens.append(self.id2word[idx])
        return tokens

    def add(self, word):
        """Register ``word`` under the next free id; known words are ignored."""
        if word in self.word2id:
            return
        next_id = len(self.word2id)
        self.word2id[word] = next_id
        self.id2word[next_id] = word

    @staticmethod
    def tokenize_corpus(corpus):
        """
        Tokenise every document with ``word_tokenize``.

        Spaces inside a returned token are replaced by underscores. Progress
        is reported through ``tqdm``.

        Returns:
            One list of tokens per document.
        """
        return [
            [piece.replace(" ", "_") for piece in word_tokenize(document)]
            for document in tqdm(corpus, desc="Tokenizing Corpus")
        ]

    def corpus_to_tensor(self, corpus, is_tokenized=False):
        """
        Convert a corpus into lists of token ids.

        Args:
            corpus: A list of sentences (strings), or a list of token lists
                when ``is_tokenized`` is True.
            is_tokenized: Skip tokenisation when the corpus already consists
                of token lists.

        Returns:
            list[list[int]]: one list of token ids per document.
        """
        if is_tokenized:
            documents = corpus
        else:
            documents = self.tokenize_corpus(corpus)

        encoded = []
        for tokens in documents:
            encoded.append([self[token] for token in tokens])
        return encoded
|
|
|
|
|
|
|
|
|
# Emoji grouped by the tag they are replaced with. The group order and the
# order inside each group define the insertion order of ``emoji_mapping``.
_EMOJI_GROUPS = (
    ("[joy]", ("😀", "😃", "😄", "😁", "😆", "😅", "😂", "🤣")),
    ("[love]", (
        "🙂", "🙃", "😉", "😊", "😇", "🥰", "😍",
        "🤩", "😘", "😗", "☺", "😚", "😙",
    )),
    ("[satisfaction]", ("😋", "😛", "😜", "🤪", "😝", "🤑")),
    ("[neutral]", ("🤐", "🤨", "😐", "😑", "😶")),
    ("[sarcasm]", ("😏",)),
    ("[disappointment]", ("😒", "🙄", "😬")),
    ("[sadness]", ("😔", "😪", "😢", "😭", "😥", "😓")),
    ("[tiredness]", ("😩", "😫", "🥱")),
    ("[discomfort]", (
        "🤤", "🤢", "🤮", "🤧", "🥵",
        "🥶", "🥴", "😵", "🤯",
    )),
    ("[confused]", ("😕", "😟", "🙁", "☹")),
    ("[surprise]", ("😮", "😯", "😲", "😳")),
    ("[pleading]", ("🥺",)),
    ("[fear]", ("😦", "😧", "😨", "😰", "😱")),
    ("[confusion]", ("😖", "😣", "😞")),
    ("[anger]", ("😤", "😡", "😠", "🤬")),
    ("[mischievous]", ("😈", "👿")),
)

# Flat emoji -> tag lookup table.
emoji_mapping = {
    symbol: tag
    for tag, symbols in _EMOJI_GROUPS
    for symbol in symbols
}
|
|
|
def load_abbreviations(path):
    """
    Read the UTF-8 encoded JSON file at ``path`` and return its parsed content.
    """
    with open(path, "r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
|
|
|
|
|
|
|
|
|
class DataManager:
    """
    Load the dataset, clean it and turn it into model-ready inputs.

    The instance keeps the three input paths together with the artefacts
    derived from them: the abbreviation table (loaded in ``__init__``), the
    vocabulary and the pretrained word vectors (both populated by
    ``preprocess_data``).
    """

    def __init__(self, file_path, abbreviations_path, word2vec_path):
        """
        Args:
            file_path: Excel file read by ``preprocess_data``.
            abbreviations_path: JSON file with the abbreviation table.
            word2vec_path: Pretrained vector file handed to torchtext.
        """
        self.file_path = file_path
        self.abbreviations_path = abbreviations_path
        self.word2vec_path = word2vec_path
        self.vocabulary = None
        self.word_embeddings = None
        self.abbreviations = None
        self.load_abbreviations()

    def load_abbreviations(self):
        """Load the abbreviation table from ``abbreviations_path`` (UTF-8 JSON)."""
        with open(self.abbreviations_path, "r", encoding="utf-8") as f:
            self.abbreviations = json.load(f)

    def load_word2vec(self):
        """
        Load the pretrained vectors from ``word2vec_path`` with
        ``torchtext.vocab.Vectors`` and store them in ``word_embeddings``.
        """
        self.word_embeddings = Vectors(
            name=self.word2vec_path,
            unk_init=torch.Tensor.normal_
        )

    def create_vocab_from_corpus(self, corpus, max_vocab_size=30000):
        """
        Build a vocabulary from the ``max_vocab_size`` most frequent tokens.

        The corpus is tokenised with ``Vocabulary.tokenize_corpus``, the same
        routine ``Vocabulary.corpus_to_tensor`` uses when sentences are
        encoded. Counting plain whitespace-separated tokens instead would
        register different strings than the ones looked up later, and those
        lookups would all fall back to ``<unk>``.

        Args:
            corpus: List of preprocessed sentences (strings).
            max_vocab_size: Upper bound on the number of tokens added on top
                of the reserved ``<pad>`` and ``<unk>`` entries.

        Returns:
            The populated ``Vocabulary``.
        """
        from collections import Counter

        counter = Counter()
        for tokens in Vocabulary.tokenize_corpus(corpus):
            counter.update(tokens)

        vocab = Vocabulary()
        for word, _freq in counter.most_common(max_vocab_size):
            vocab.add(word)

        return vocab

    def preprocess_data(self):
        """
        Read the Excel dataset and prepare it for training.

        Every value of the "Sentence" column is cleaned with
        ``preprocess_sentence`` into a new "processed_sentence" column, rows
        whose cleaned sentence is empty are dropped, the vocabulary is built
        from the remaining sentences and the pretrained vectors are loaded.

        Returns:
            The filtered DataFrame.

        Raises:
            ValueError: If the dataset has no "Sentence" column.
        """
        df = pd.read_excel(self.file_path)
        if "Sentence" not in df.columns:
            raise ValueError("Cột 'Sentence' không tồn tại trong dataset!")

        df["processed_sentence"] = df["Sentence"].apply(
            lambda x: preprocess_sentence(str(x), self.abbreviations, emoji_mapping)
        )

        # Keep only rows that still contain text after cleaning.
        df = df[df["processed_sentence"].str.strip().astype(bool)]

        all_sentences = df["processed_sentence"].tolist()
        self.vocabulary = self.create_vocab_from_corpus(all_sentences, max_vocab_size=30000)

        self.load_word2vec()

        return df

    def build_pretrained_embedding_matrix(self, embedding_dim=100):
        """
        Build the initial embedding weights for the current vocabulary.

        Every row starts as a normal(0, 0.1) random vector; rows whose token
        exists in the loaded pretrained vectors are overwritten with the
        pretrained vector.

        Args:
            embedding_dim: Width of the matrix; it has to match the width of
                the pretrained vectors.

        Returns:
            A float32 numpy array of shape (vocab_size, embedding_dim).
        """
        vocab_size = len(self.vocabulary)
        weight_matrix = np.random.normal(
            scale=0.1, size=(vocab_size, embedding_dim)
        ).astype(np.float32)

        pretrained_index = self.word_embeddings.stoi
        for word, idx in self.vocabulary.word2id.items():
            if word in pretrained_index:
                weight_matrix[idx] = self.word_embeddings.vectors[pretrained_index[word]]

        return weight_matrix

    def split_and_convert(
        self, df, label_column="Emotion", maxlen=400, test_size=0.2,
        for_keras=False, batch_size=32
    ):
        """
        Encode labels and sentences and split them into train and test sets.

        The input DataFrame is left untouched: the encoded labels live in a
        separate Series, so the method can be called repeatedly on the same
        frame and always derives the same label mapping.

        Args:
            df: DataFrame with a "processed_sentence" column and the label
                column.
            label_column: Name of the label column.
            maxlen: Length every id sequence is padded/truncated to (at the
                end of the sequence).
            test_size: Fraction of the rows used for the test set.
            for_keras: Selects the return format, see below.
            batch_size: Batch size of the PyTorch data loaders.

        Returns:
            for_keras=False: ``(train_loader, test_loader, label_mapping)``.
            for_keras=True: ``(X_train, X_test, y_train_onehot,
            y_test_onehot, label_mapping)`` as numpy arrays.
            ``label_mapping`` maps each original label to its integer id.

        Raises:
            ValueError: If ``label_column`` is missing or a label could not
                be encoded.
        """
        if label_column not in df.columns:
            raise ValueError(
                f"Cột '{label_column}' không tồn tại. Hiện có: {df.columns.tolist()}"
            )

        labels = df[label_column]
        label_mapping = {label: idx for idx, label in enumerate(labels.unique())}
        # Encode into a new Series instead of overwriting the caller's column.
        encoded_labels = labels.map(label_mapping)
        unmapped = encoded_labels.isnull()
        if unmapped.any():
            missing = labels[unmapped].unique()
            raise ValueError(f"Những nhãn cảm xúc sau không có trong label_mapping: {missing}")

        X = df["processed_sentence"].tolist()
        y = encoded_labels.tolist()

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42, stratify=y
        )

        X_train_ids = self.vocabulary.corpus_to_tensor(X_train, is_tokenized=False)
        X_test_ids = self.vocabulary.corpus_to_tensor(X_test, is_tokenized=False)

        X_train_padded = pad_sequences(X_train_ids, maxlen=maxlen, padding='post', truncating='post')
        X_test_padded = pad_sequences(X_test_ids, maxlen=maxlen, padding='post', truncating='post')

        print(">>> Debug Split and Convert:")
        print("X_train_padded.shape:", X_train_padded.shape)
        print("X_test_padded.shape: ", X_test_padded.shape)
        print("y_train length:", len(y_train))
        print("y_test length: ", len(y_test))
        print("vocab_size:", len(self.vocabulary))

        if for_keras:
            num_classes = len(label_mapping)
            y_train_onehot = torch.nn.functional.one_hot(
                torch.tensor(y_train),
                num_classes=num_classes
            ).numpy()
            y_test_onehot = torch.nn.functional.one_hot(
                torch.tensor(y_test),
                num_classes=num_classes
            ).numpy()

            print("y_train_onehot.shape:", y_train_onehot.shape)
            print("y_test_onehot.shape: ", y_test_onehot.shape)

            return X_train_padded, X_test_padded, y_train_onehot, y_test_onehot, label_mapping

        X_train_t = torch.tensor(X_train_padded, dtype=torch.long)
        X_test_t = torch.tensor(X_test_padded, dtype=torch.long)
        y_train_t = torch.tensor(y_train, dtype=torch.long)
        y_test_t = torch.tensor(y_test, dtype=torch.long)

        train_ds = TensorDataset(X_train_t, y_train_t)
        test_ds = TensorDataset(X_test_t, y_test_t)

        train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
        test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

        return train_loader, test_loader, label_mapping
|
|
|
|
|
|
|
|
|
class SimpleRNN(nn.Module):
    """
    Single-layer LSTM classifier on top of a trainable pretrained embedding.

    Args:
        pretrained_weight: numpy array of shape (vocab_size, embedding_dim)
            used to initialise the embedding layer.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of output classes (logits).
        dropout: Dropout probability applied to the embeddings and to the
            final hidden state.
        pad_idx: Token id used for padding; padded positions are not fed to
            the LSTM.
    """

    def __init__(self, pretrained_weight, hidden_dim, output_dim, dropout=0.3, pad_idx=0):
        super().__init__()

        # torch.from_numpy shares memory with the numpy array. The embedding
        # is trainable, so without a copy every optimiser step on CPU would
        # silently rewrite the caller's matrix.
        weights = torch.from_numpy(pretrained_weight).float().clone()
        embedding_dim = weights.shape[1]

        self.pad_idx = pad_idx
        self.embedding = nn.Embedding.from_pretrained(weights, freeze=False)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """
        Args:
            x: LongTensor of token ids, shape (batch, seq_len).

        Returns:
            Logits of shape (batch, output_dim).
        """
        embedded = self.dropout(self.embedding(x))

        # Length of each sequence = position of its last non-pad token. An
        # all-pad row is treated as length 1 because packing rejects empty
        # sequences.
        positions = torch.arange(1, x.size(1) + 1, device=x.device)
        lengths = ((x != self.pad_idx).long() * positions).max(dim=1).values.clamp(min=1)

        # Packing stops the LSTM at the last real token, so the final hidden
        # state is not diluted by the trailing padding steps.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        _, (hidden, _) = self.rnn(packed)

        # hidden is (num_layers, batch, hidden_dim); take the last layer.
        hidden = self.dropout(hidden[-1])
        return self.fc(hidden)
|
|
|
|
|
def predict_emotion_rnn(model, text, data_manager, label_mapping, device, maxlen=400):
    """
    Predict the emotion label of one raw text with the PyTorch model.

    The text goes through ``preprocess_sentence``, is tokenised and encoded
    with the data manager's vocabulary, and is padded/truncated at the end
    to ``maxlen`` ids before it is fed to the model.

    Args:
        model: Trained PyTorch model returning one logit per class; it is
            switched to evaluation mode.
        text: Raw input text.
        data_manager: Object providing ``abbreviations`` and ``vocabulary``.
        label_mapping: Mapping from label to class id, as returned by
            ``DataManager.split_and_convert``.
        device: Device the input tensor is moved to.
        maxlen: Sequence length fed to the model; use the value the model
            was trained with.

    Returns:
        The label whose class id has the highest score.
    """
    model.eval()
    with torch.no_grad():
        processed_text = preprocess_sentence(text, data_manager.abbreviations, emoji_mapping)
        tokenized_text = data_manager.vocabulary.tokenize_corpus([processed_text])
        text_ids = data_manager.vocabulary.corpus_to_tensor(tokenized_text, is_tokenized=True)
        text_padded = pad_sequences(text_ids, maxlen=maxlen, padding='post', truncating='post')
        text_tensor = torch.tensor(text_padded, dtype=torch.long).to(device)

        logits = model(text_tensor)

    predicted_id = int(logits.argmax(dim=1).item())
    id_to_label = {idx: label for label, idx in label_mapping.items()}
    return id_to_label[predicted_id]
|
|
|
|
|
|
|
|
|
def predict_emotion_cnn_lstm(model, text, data_manager, label_mapping, maxlen=400):
    """
    Predict the emotion label of one raw text with the Keras model.

    The text goes through ``preprocess_sentence``, is tokenised and encoded
    with the data manager's vocabulary, and is padded/truncated at the end
    to ``maxlen`` ids before ``model.predict`` is called.

    Args:
        model: Trained Keras model returning one score per class.
        text: Raw input text.
        data_manager: Object providing ``abbreviations`` and ``vocabulary``.
        label_mapping: Mapping from label to class id, as returned by
            ``DataManager.split_and_convert``.
        maxlen: Sequence length fed to the model; use the value the model
            was built with.

    Returns:
        The label whose class id has the highest score.
    """
    processed_text = preprocess_sentence(text, data_manager.abbreviations, emoji_mapping)
    tokenized_text = data_manager.vocabulary.tokenize_corpus([processed_text])
    text_ids = data_manager.vocabulary.corpus_to_tensor(tokenized_text, is_tokenized=True)
    text_padded = pad_sequences(text_ids, maxlen=maxlen, padding='post', truncating='post')

    scores = model.predict(text_padded)
    # Convert the numpy integer to a plain int before using it as a dict key.
    predicted_id = int(scores.argmax(axis=1)[0])

    id_to_label = {idx: label for label, idx in label_mapping.items()}
    return id_to_label[predicted_id]
|
|
|
|
|
|
|
|
|
def _compute_classification_metrics(y_true, y_pred, label_mapping):
    """
    Compute the evaluation metrics reported for both models.

    Args:
        y_true: True class ids.
        y_pred: Predicted class ids.
        label_mapping: Mapping from label name to class id.

    Returns:
        dict with the keys "accuracy", "precision_macro",
        "precision_weighted", "recall_macro", "recall_weighted", "f1_macro",
        "f1_weighted", "report" (text) and "confusion_matrix" (numpy array).
    """
    # Order the names by class id so that they line up with `labels`.
    ordered = sorted(label_mapping.items(), key=lambda item: item[1])
    class_ids = [idx for _, idx in ordered]
    class_names = [str(name) for name, _ in ordered]

    metrics = {"accuracy": accuracy_score(y_true, y_pred)}
    for average in ("macro", "weighted"):
        metrics[f"precision_{average}"] = precision_score(
            y_true, y_pred, average=average, zero_division=0
        )
        metrics[f"recall_{average}"] = recall_score(
            y_true, y_pred, average=average, zero_division=0
        )
        metrics[f"f1_{average}"] = f1_score(
            y_true, y_pred, average=average, zero_division=0
        )

    # Passing `labels` keeps the report and the matrix aligned with the class
    # names even when a class is absent from this particular test set.
    metrics["report"] = classification_report(
        y_true, y_pred, labels=class_ids, target_names=class_names,
        digits=4, zero_division=0
    )
    metrics["confusion_matrix"] = confusion_matrix(y_true, y_pred, labels=class_ids)
    return metrics


def _metric_lines(metrics):
    """Format the six precision/recall/F1 values, one line per metric."""
    return [
        f"Precision (Macro): {metrics['precision_macro']:.4f}",
        f"Precision (Weighted): {metrics['precision_weighted']:.4f}",
        f"Recall (Macro): {metrics['recall_macro']:.4f}",
        f"Recall (Weighted): {metrics['recall_weighted']:.4f}",
        f"F1-Score (Macro): {metrics['f1_macro']:.4f}",
        f"F1-Score (Weighted): {metrics['f1_weighted']:.4f}",
    ]


def _print_evaluation(metrics, headline, title_prefix=""):
    """Print the headline, the metric lines, the report and the confusion matrix."""
    print(headline)
    for line in _metric_lines(metrics):
        print(line)

    print(f"\n========== {title_prefix}Classification Report ==========")
    print(metrics["report"])

    print(f"\n========== {title_prefix}Confusion Matrix ==========")
    print(metrics["confusion_matrix"])


def _save_evaluation(path, metrics, test_loss, title_prefix=""):
    """Write the report, the scalar metrics and the confusion matrix to ``path``."""
    with open(path, "w", encoding="utf-8") as f:
        f.write(f"========== {title_prefix}Classification Report ==========\n")
        f.write(metrics["report"])
        f.write("\n========== Additional Metrics ==========\n")
        f.write(f"Test Loss: {test_loss:.4f}\n")
        f.write(f"Test Accuracy: {metrics['accuracy']:.4f}\n")
        for line in _metric_lines(metrics):
            f.write(line + "\n")
        f.write("\n========== Confusion Matrix ==========\n")
        f.write(np.array2string(metrics["confusion_matrix"]))


if __name__ == "__main__":
    # These Keras names are only needed by the training script below.
    from keras.models import Model
    from keras.layers import (
        Input, Embedding, Convolution1D, LSTM, Dense, Dropout,
        GlobalMaxPooling1D, concatenate
    )
    from keras.optimizers import Adam
    from keras.callbacks import ModelCheckpoint, EarlyStopping

    # ---------- Paths and shared settings ----------
    file_path = "train.xlsx"
    abbreviations_path = "abbreviations.json"
    word2vec_path = "word2vec_vi_syllables_100dims.txt"
    output_path = "processed.xlsx"
    maxlen = 400

    data_manager = DataManager(
        file_path=file_path,
        abbreviations_path=abbreviations_path,
        word2vec_path=word2vec_path
    )

    # ---------- Preprocessing ----------
    df = data_manager.preprocess_data()
    print("Trước khi cân bằng lớp (undersampling/oversampling):")
    print(df["Emotion"].value_counts())

    # ---------- Class balancing ----------
    # Only rows carrying one of these labels are kept.
    emotion_order = ["Enjoyment", "Other", "Anger", "Sadness", "Disgust", "Fear", "Surprise"]
    per_class = {name: df[df["Emotion"] == name] for name in emotion_order}

    # Oversample "Other" up to 3000 rows. An empty class is left alone
    # because sampling with replacement from nothing fails.
    # NOTE(review): oversampling happens before the train/test split, so
    # duplicated rows can land on both sides of the split and inflate the
    # test metrics - consider oversampling the training part only.
    if 0 < len(per_class["Other"]) < 3000:
        per_class["Other"] = resample(
            per_class["Other"],
            replace=True,
            n_samples=3000,
            random_state=42
        )

    df_balanced = pd.concat([per_class[name] for name in emotion_order], axis=0)
    df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
    df = df_balanced

    print("\nSau khi cân bằng lớp (demo oversample):")
    print(df["Emotion"].value_counts())

    df.to_excel(output_path, index=False)

    # ---------- PyTorch model ----------
    print("\n========== Training PyTorch SimpleRNN ==========")

    pretrained_matrix = data_manager.build_pretrained_embedding_matrix(embedding_dim=100)

    # A copy is passed so the label encoding cannot alter `df`, which is
    # split a second time for the Keras model.
    train_loader, test_loader, label_mapping = data_manager.split_and_convert(
        df.copy(), label_column="Emotion", maxlen=maxlen, test_size=0.2,
        for_keras=False, batch_size=32
    )

    hidden_dim = 128
    output_dim = len(label_mapping)

    # The model receives its own copy of the matrix so that fine-tuning the
    # embedding cannot change the weights the Keras model starts from.
    model_rnn = SimpleRNN(pretrained_weight=pretrained_matrix.copy(),
                          hidden_dim=hidden_dim,
                          output_dim=output_dim,
                          dropout=0.3)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model_rnn.parameters(), lr=1e-3)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_rnn.to(device)

    num_epochs = 20
    for epoch in range(num_epochs):
        model_rnn.train()
        epoch_loss = 0
        correct = 0
        total = 0

        for X_batch, y_batch in train_loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            optimizer.zero_grad()
            preds = model_rnn(X_batch)
            loss = criterion(preds, y_batch)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            _, pred_label = torch.max(preds, 1)
            correct += (pred_label == y_batch).sum().item()
            total += y_batch.size(0)

        epoch_accuracy = correct / total
        epoch_loss_avg = epoch_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, "
              f"Loss: {epoch_loss_avg:.4f}, "
              f"Accuracy: {epoch_accuracy:.4f}")

    # ---------- PyTorch evaluation ----------
    model_rnn.eval()
    test_loss = 0
    y_true = []
    y_pred = []
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)
            preds = model_rnn(X_batch)
            test_loss += criterion(preds, y_batch).item()

            _, predicted = torch.max(preds, 1)
            y_true.extend(y_batch.cpu().tolist())
            y_pred.extend(predicted.cpu().tolist())

    test_loss_avg = test_loss / len(test_loader)
    rnn_metrics = _compute_classification_metrics(y_true, y_pred, label_mapping)

    _print_evaluation(
        rnn_metrics,
        f"\nTest Loss: {test_loss_avg:.4f}, Test Accuracy: {rnn_metrics['accuracy']:.4f}"
    )

    rnn_report_dir = "rnn_emotion_model"
    os.makedirs(rnn_report_dir, exist_ok=True)
    _save_evaluation(
        os.path.join(rnn_report_dir, "classification_report.txt"),
        rnn_metrics,
        test_loss_avg
    )
    print("\n========== Classification Report saved to 'rnn_emotion_model/classification_report.txt' ==========")

    torch.save(model_rnn.state_dict(), os.path.join(rnn_report_dir, "simple_rnn.pth"))
    print("========== RNN Model saved to 'rnn_emotion_model/simple_rnn.pth' ==========")

    # ---------- Keras model ----------
    print("\n========== Training CNN-LSTM (Keras) ==========")

    X_train_keras, X_test_keras, y_train_keras, y_test_keras, label_mapping_keras = data_manager.split_and_convert(
        df.copy(), label_column="Emotion", maxlen=maxlen, test_size=0.2,
        for_keras=True
    )

    vocab_size, embedding_dim = pretrained_matrix.shape
    pretrained_matrix_keras = pretrained_matrix.astype(np.float32)
    num_classes_keras = len(label_mapping_keras)

    input_layer = Input(shape=(maxlen,), dtype='int32', name='main_input')
    emb_layer = Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[pretrained_matrix_keras],
        trainable=True
    )(input_layer)

    # GlobalMaxPooling1D takes the maximum over the time axis, which is what
    # the previous Lambda(tf.reduce_max(axis=1)) did. Unlike a Lambda around
    # a Python function, it is a built-in layer and can be restored by
    # load_model without extra arguments.
    con3 = Convolution1D(150, kernel_size=3, activation='relu')(emb_layer)
    pool_con3 = GlobalMaxPooling1D()(con3)

    con5 = Convolution1D(150, kernel_size=5, activation='relu')(emb_layer)
    pool_con5 = GlobalMaxPooling1D()(con5)

    lstm_out = LSTM(128, dropout=0.3)(emb_layer)

    merged = concatenate([pool_con3, pool_con5, lstm_out])
    dense = Dense(100, activation='relu')(merged)
    drop = Dropout(0.3)(dense)
    output = Dense(num_classes_keras, activation='softmax')(drop)

    model_cnn_lstm = Model(inputs=input_layer, outputs=output)
    model_cnn_lstm.compile(
        loss='categorical_crossentropy',
        # `learning_rate` is the supported keyword; the old `lr` alias is
        # rejected by current Keras releases.
        optimizer=Adam(learning_rate=1e-3),
        metrics=['accuracy']
    )

    checkpoint = ModelCheckpoint(
        'cnn_lstm_best.keras',
        save_best_only=True,
        monitor='val_accuracy',
        mode='max'
    )
    early_stopping = EarlyStopping(
        monitor='val_accuracy',
        patience=5,
        restore_best_weights=True
    )

    # NOTE(review): the test set doubles as validation set, so checkpointing
    # and early stopping select the model on the data that is later used to
    # report test metrics - consider a separate validation split.
    history = model_cnn_lstm.fit(
        X_train_keras, y_train_keras,
        validation_data=(X_test_keras, y_test_keras),
        epochs=30,
        batch_size=32,
        callbacks=[checkpoint, early_stopping]
    )

    # ---------- Keras evaluation ----------
    loss, acc = model_cnn_lstm.evaluate(X_test_keras, y_test_keras)
    print(f"CNN-LSTM Test Loss: {loss:.4f}, Test Accuracy: {acc:.4f}")

    y_pred_cnn_lstm = np.argmax(model_cnn_lstm.predict(X_test_keras), axis=1)
    y_true_cnn_lstm = np.argmax(y_test_keras, axis=1)

    cnn_lstm_metrics = _compute_classification_metrics(
        y_true_cnn_lstm, y_pred_cnn_lstm, label_mapping_keras
    )

    _print_evaluation(
        cnn_lstm_metrics,
        f"\nCNN-LSTM Test Accuracy: {cnn_lstm_metrics['accuracy']:.4f}",
        title_prefix="CNN-LSTM "
    )

    cnn_lstm_report_dir = "cnn_lstm_emotion_model"
    os.makedirs(cnn_lstm_report_dir, exist_ok=True)
    _save_evaluation(
        os.path.join(cnn_lstm_report_dir, "classification_report.txt"),
        cnn_lstm_metrics,
        loss,
        title_prefix="CNN-LSTM "
    )
    print("\n========== CNN-LSTM Classification Report saved to 'cnn_lstm_emotion_model/classification_report.txt' ==========")

    model_cnn_lstm.save(os.path.join(cnn_lstm_report_dir, 'cnn_lstm_model.keras'))
    print(f"========== CNN-LSTM Model saved to '{cnn_lstm_report_dir}/cnn_lstm_model.keras' ==========")

    # ---------- Persist label mapping and vocabulary ----------
    with open(os.path.join(rnn_report_dir, "label_mapping.json"), "w", encoding="utf-8") as f:
        json.dump(label_mapping, f, ensure_ascii=False, indent=4)

    with open(os.path.join(rnn_report_dir, "vocabulary.json"), "w", encoding="utf-8") as f:
        json.dump(data_manager.vocabulary.word2id, f, ensure_ascii=False, indent=4)

    print("========== Label Mapping and Vocabulary saved ==========")

    # ---------- Demo prediction ----------
    custom_text = "Tôi rất vui khi sử dụng dịch vụ này!"

    emotion_rnn = predict_emotion_rnn(
        model_rnn, custom_text, data_manager, label_mapping, device
    )
    print(f"Predicted Emotion (RNN): {emotion_rnn}")

    cnn_lstm_loaded = tf.keras.models.load_model(os.path.join(cnn_lstm_report_dir, 'cnn_lstm_model.keras'))
    emotion_cnn_lstm = predict_emotion_cnn_lstm(
        cnn_lstm_loaded, custom_text, data_manager, label_mapping_keras
    )
    print(f"Predicted Emotion (CNN-LSTM): {emotion_cnn_lstm}")

    print("TF version:", tf.__version__)
    print("GPU devices:", tf.config.list_physical_devices("GPU"))
|
|
|
|