emotion-classification-v1 / main_RNN_CNN-LSTM.py

ducdatit2002

Upload folder using huggingface_hub

e09333c verified 5 months ago

29.2 kB

	# thesis.py
	# -*- coding: utf-8 -*-

	import pandas as pd
	import emoji
	import json
	import re
	import numpy as np
	from underthesea import word_tokenize
	from tqdm import tqdm
	import torch
	from torchtext.vocab import Vectors
	from sklearn.model_selection import train_test_split
	from sklearn.utils import resample
	from sklearn.metrics import (
	accuracy_score,
	classification_report,
	precision_score,
	recall_score,
	f1_score,
	confusion_matrix
	)
	from tensorflow.keras.preprocessing.sequence import pad_sequences
	from torch.utils.data import DataLoader, TensorDataset
	import torch.nn as nn
	import torch.optim as optim
	import tensorflow as tf
	import os
	import joblib

	# ========== CÁC HÀM TIỀN XỬ LÝ ==========

	def preprocess_sentence(sentence, abbreviations, emoji_mapping):
	    """Normalize one raw sentence into the cleaned form used by the models.

	    The steps run in a fixed order: lower-casing, emoji replacement,
	    profanity removal, special-character stripping, whitespace
	    normalization, abbreviation expansion, repeated-character collapsing,
	    number masking and finally word tokenization.

	    Args:
	        sentence: Raw input text.
	        abbreviations: Mapping handed to ``replace_abbreviations``.
	        emoji_mapping: Mapping handed to ``replace_emojis``.

	    Returns:
	        The processed sentence as a single string.
	    """
	    text = replace_emojis(sentence.lower(), emoji_mapping)
	    # Cleaning steps that only need the text itself.
	    for step in (remove_profanity, remove_special_characters, normalize_whitespace):
	        text = step(text)
	    text = replace_abbreviations(text, abbreviations)
	    for step in (remove_repeated_characters, replace_numbers, tokenize_sentence):
	        text = step(text)
	    return text

	def replace_emojis(sentence, emoji_mapping):
	    """Replace mapped emoji with their tag and drop every other emoji.

	    The sentence is scanned one character at a time: a character found in
	    ``emoji_mapping`` is replaced by its mapped string, any other character
	    that ``emoji.is_emoji`` recognizes is removed, and everything else is
	    kept unchanged.
	    """
	    pieces = (
	        emoji_mapping[symbol] if symbol in emoji_mapping else symbol
	        for symbol in sentence
	        # The mapping is checked first, so mapped characters never reach is_emoji.
	        if symbol in emoji_mapping or not emoji.is_emoji(symbol)
	    )
	    return ''.join(pieces)

	def remove_profanity(sentence):
	    """Remove profane words and phrases from a sentence.

	    The sentence is split on whitespace and compared case-insensitively
	    against a fixed list. Multi-word entries (e.g. "đù mé") are matched as a
	    run of consecutive tokens; a plain per-token membership test could never
	    match them because a single token never contains a space.

	    Args:
	        sentence: Input text.

	    Returns:
	        The remaining tokens joined by single spaces.
	    """
	    profane_terms = ["loz", "vloz", "vl", "dm", "đm", "clgt", "dmm", "cc", "vc", "đù mé", "vãi"]
	    single_words = {term for term in profane_terms if " " not in term}
	    phrases = [tuple(term.split()) for term in profane_terms if " " in term]

	    words = sentence.split()
	    lowered = [word.lower() for word in words]
	    kept = []
	    pos = 0
	    while pos < len(words):
	        # Length of the profane phrase starting at this token, or 0 if none.
	        span = next(
	            (len(phrase) for phrase in phrases
	             if tuple(lowered[pos:pos + len(phrase)]) == phrase),
	            0,
	        )
	        if span:
	            pos += span
	            continue
	        if lowered[pos] not in single_words:
	            kept.append(words[pos])
	        pos += 1
	    return ' '.join(kept)

	def remove_special_characters(sentence):
	    """Delete the characters ^ * @ # & $ % < > ~ { } | and backslash."""
	    strip_table = str.maketrans("", "", "^*@#&$%<>~{}|\\")
	    return sentence.translate(strip_table)

	def normalize_whitespace(sentence):
	    """Collapse every run of whitespace to one space and trim both ends."""
	    chunks = sentence.split()
	    return ' '.join(chunks)

	def replace_abbreviations(sentence, abbreviations):
	    """Expand abbreviations found as whole whitespace-separated words.

	    Args:
	        sentence: Input text.
	        abbreviations: Mapping from an abbreviation to its expansion. The
	            expansion may be a sequence of words (joined with spaces) or a
	            plain string (used as-is). Joining a plain string would
	            otherwise insert a space between every character.

	    Returns:
	        The sentence with known abbreviations expanded, joined by single
	        spaces.
	    """
	    def expand(word):
	        if word not in abbreviations:
	            return word
	        expansion = abbreviations[word]
	        if isinstance(expansion, str):
	            return expansion
	        return " ".join(expansion)

	    return ' '.join(expand(word) for word in sentence.split())

	def remove_repeated_characters(sentence):
	# Ví dụ: "đẹp quáaaaaaa" -> "đẹp quá"
	return re.sub(r"(.)\1{2,}", r"\1", sentence)

	def replace_numbers(sentence):
	    """Replace every run of digits with the literal token ``[number]``."""
	    digit_run = re.compile(r"\d+")
	    return digit_run.sub("[number]", sentence)

	def tokenize_sentence(sentence):
	    """Tokenize with underthesea's ``word_tokenize`` and re-join with spaces."""
	    tokens = word_tokenize(sentence)
	    return ' '.join(tokens)


	# ========== VOCABULARY CLASS ==========

	class Vocabulary:
	    """Two-way token/id mapping with '<pad>' reserved as 0 and '<unk>' as 1."""

	    def __init__(self):
	        self.unk_id = 1
	        self.word2id = {'<pad>': 0, '<unk>': self.unk_id}
	        self.id2word = {idx: word for word, idx in self.word2id.items()}

	    def __getitem__(self, word):
	        """Return the id of ``word``, or the '<unk>' id when it is unknown."""
	        try:
	            return self.word2id[word]
	        except KeyError:
	            return self.unk_id

	    def __contains__(self, word):
	        return word in self.word2id

	    def __len__(self):
	        return len(self.word2id)

	    def lookup_tokens(self, indices):
	        """Map ids back to tokens; an unknown id raises ``KeyError``."""
	        return list(map(self.id2word.__getitem__, indices))

	    def add(self, word):
	        """Register ``word`` under the next free id; known words are ignored."""
	        if word in self.word2id:
	            return
	        new_id = len(self.word2id)
	        self.word2id[word] = new_id
	        self.id2word[new_id] = word

	    @staticmethod
	    def tokenize_corpus(corpus):
	        """Tokenize each document; spaces inside a token become underscores."""
	        return [
	            [piece.replace(" ", "_") for piece in word_tokenize(doc)]
	            for doc in tqdm(corpus, desc="Tokenizing Corpus")
	        ]

	    def corpus_to_tensor(self, corpus, is_tokenized=False):
	        """Convert documents to lists of token ids.

	        Args:
	            corpus: List of sentences (strings), or list of token lists when
	                ``is_tokenized`` is True.
	            is_tokenized: Skip tokenization when the corpus is already
	                split into tokens.

	        Returns:
	            list[list[int]]: one list of token ids per document.
	        """
	        docs = corpus if is_tokenized else self.tokenize_corpus(corpus)
	        return [[self[token] for token in doc] for doc in docs]


	# ========== EMOJI MAPPING ==========

	# Emoji grouped by the tag that replaces them during preprocessing.
	# Group order and in-group order match the insertion order of the mapping.
	_EMOJI_GROUPS = (
	    ("[joy]", ("😀", "😃", "😄", "😁", "😆", "😅", "😂", "🤣")),
	    ("[love]", ("🙂", "🙃", "😉", "😊", "😇", "🥰", "😍", "🤩", "😘", "😗", "☺", "😚", "😙")),
	    ("[satisfaction]", ("😋", "😛", "😜", "🤪", "😝", "🤑")),
	    ("[neutral]", ("🤐", "🤨", "😐", "😑", "😶")),
	    ("[sarcasm]", ("😏",)),
	    ("[disappointment]", ("😒", "🙄", "😬")),
	    ("[sadness]", ("😔", "😪", "😢", "😭", "😥", "😓")),
	    ("[tiredness]", ("😩", "😫", "🥱")),
	    ("[discomfort]", ("🤤", "🤢", "🤮", "🤧", "🥵", "🥶", "🥴", "😵", "🤯")),
	    ("[confused]", ("😕", "😟", "🙁", "☹")),
	    ("[surprise]", ("😮", "😯", "😲", "😳")),
	    ("[pleading]", ("🥺",)),
	    ("[fear]", ("😦", "😧", "😨", "😰", "😱")),
	    ("[confusion]", ("😖", "😣", "😞")),
	    ("[anger]", ("😤", "😡", "😠", "🤬")),
	    ("[mischievous]", ("😈", "👿")),
	)

	# Flat lookup: single emoji character -> tag string.
	emoji_mapping = {
	    symbol: tag
	    for tag, symbols in _EMOJI_GROUPS
	    for symbol in symbols
	}

	def load_abbreviations(path):
	    """Read the UTF-8 JSON file at ``path`` and return the parsed content."""
	    with open(path, "r", encoding="utf-8") as handle:
	        payload = json.load(handle)
	    return payload


	# ========== DATA MANAGER ==========

	class DataManager:
	    """Load the dataset and its resources and turn them into model inputs.

	    Attributes set by the constructor and later calls:
	        vocabulary: ``Vocabulary`` built by ``preprocess_data``.
	        word_embeddings: torchtext ``Vectors`` loaded by ``load_word2vec``.
	        abbreviations: Parsed content of the abbreviations JSON file.
	    """

	    def __init__(self, file_path, abbreviations_path, word2vec_path):
	        self.file_path = file_path
	        self.abbreviations_path = abbreviations_path
	        self.word2vec_path = word2vec_path
	        self.vocabulary = None
	        self.word_embeddings = None
	        self.abbreviations = None
	        self.load_abbreviations()

	    def load_abbreviations(self):
	        """Read the abbreviations JSON file into ``self.abbreviations``."""
	        with open(self.abbreviations_path, "r", encoding="utf-8") as f:
	            self.abbreviations = json.load(f)

	    def load_word2vec(self):
	        """Load pretrained word vectors from ``word2vec_path`` via torchtext."""
	        self.word_embeddings = Vectors(
	            name=self.word2vec_path,
	            unk_init=torch.Tensor.normal_
	        )

	    def create_vocab_from_corpus(self, corpus, max_vocab_size=30000):
	        """Build a vocabulary from the ``max_vocab_size`` most frequent tokens.

	        Tokens are produced with ``Vocabulary.tokenize_corpus``, the same
	        tokenizer ``Vocabulary.corpus_to_tensor`` uses when encoding text.
	        Counting plain whitespace-separated pieces instead would leave every
	        multi-syllable token (e.g. "sử_dụng") out of the vocabulary, so it
	        would always be encoded as '<unk>'.

	        Args:
	            corpus: List of preprocessed sentences (strings).
	            max_vocab_size: Maximum number of distinct tokens to keep.

	        Returns:
	            A populated ``Vocabulary``.
	        """
	        from collections import Counter

	        counter = Counter()
	        for tokens in Vocabulary.tokenize_corpus(corpus):
	            counter.update(tokens)

	        vocab = Vocabulary()
	        for word, _freq in counter.most_common(max_vocab_size):
	            vocab.add(word)
	        return vocab

	    def preprocess_data(self):
	        """Read the Excel dataset, clean every sentence and prepare resources.

	        Side effects: sets ``self.vocabulary`` and loads the word vectors.

	        Returns:
	            The DataFrame with an added ``processed_sentence`` column and
	            rows with an empty processed sentence removed.

	        Raises:
	            ValueError: If the ``Sentence`` column is missing.
	        """
	        df = pd.read_excel(self.file_path)
	        if "Sentence" not in df.columns:
	            raise ValueError("Cột 'Sentence' không tồn tại trong dataset!")

	        # Preprocess each sentence.
	        df["processed_sentence"] = df["Sentence"].apply(
	            lambda x: preprocess_sentence(str(x), self.abbreviations, emoji_mapping)
	        )

	        # Drop empty rows; copy so later column writes do not hit a view.
	        df = df[df["processed_sentence"].str.strip().astype(bool)].copy()

	        # Build the vocabulary from the data itself.
	        all_sentences = df["processed_sentence"].tolist()
	        self.vocabulary = self.create_vocab_from_corpus(all_sentences, max_vocab_size=30000)

	        self.load_word2vec()

	        return df

	    def build_pretrained_embedding_matrix(self, embedding_dim=100):
	        """Create a float32 ``(vocab_size, embedding_dim)`` embedding matrix.

	        Rows start as random normal values (scale 0.1). Rows of tokens
	        present in the pretrained vectors are overwritten with those vectors.

	        Raises:
	            ValueError: If the pretrained vectors do not have
	                ``embedding_dim`` columns.
	        """
	        vocab_size = len(self.vocabulary)
	        weight_matrix = np.random.normal(
	            scale=0.1, size=(vocab_size, embedding_dim)
	        ).astype(np.float32)

	        vectors = self.word_embeddings.vectors
	        stoi = self.word_embeddings.stoi
	        if vectors.shape[1] != embedding_dim:
	            raise ValueError(
	                f"embedding_dim={embedding_dim} does not match pretrained "
	                f"vector size {vectors.shape[1]}"
	            )

	        # Copy the pretrained vector for every known token.
	        for word, idx in self.vocabulary.word2id.items():
	            source_idx = stoi.get(word)
	            if source_idx is not None:
	                weight_matrix[idx] = vectors[source_idx]

	        return weight_matrix

	    def split_and_convert(
	        self, df, label_column="Emotion", maxlen=400, test_size=0.2,
	        for_keras=False, batch_size=32
	    ):
	        """Split the data into train/test sets and encode it for a model.

	        The input DataFrame is not modified: labels are encoded into a
	        separate series, so the method can be called repeatedly on the same
	        frame and always returns a mapping keyed by the original labels.

	        Args:
	            df: DataFrame with ``processed_sentence`` and ``label_column``.
	            label_column: Name of the label column.
	            maxlen: Length every sequence is padded/truncated to.
	            test_size: Fraction of rows used for the test set.
	            for_keras: Select the return format (see Returns).
	            batch_size: Batch size of the PyTorch loaders.

	        Returns:
	            ``for_keras=False``: ``(train_loader, test_loader, label_mapping)``.
	            ``for_keras=True``: ``(X_train, X_test, y_train_onehot,
	            y_test_onehot, label_mapping)``.

	        Raises:
	            ValueError: If ``label_column`` is missing or a label cannot be
	                mapped.
	        """
	        if label_column not in df.columns:
	            raise ValueError(
	                f"Cột '{label_column}' không tồn tại. Hiện có: {df.columns.tolist()}"
	            )

	        # Map label -> integer id, in order of first appearance.
	        labels = df[label_column]
	        label_mapping = {label: idx for idx, label in enumerate(labels.unique())}
	        encoded = labels.map(label_mapping)
	        if encoded.isnull().any():
	            # Report the original label values, not the NaN mapping results.
	            missing = labels[encoded.isnull()].unique()
	            raise ValueError(f"Những nhãn cảm xúc sau không có trong label_mapping: {missing}")

	        X = df["processed_sentence"].tolist()
	        y = encoded.tolist()

	        # Stratify to maintain class distribution.
	        X_train, X_test, y_train, y_test = train_test_split(
	            X, y, test_size=test_size, random_state=42, stratify=y
	        )

	        # Convert text -> token ids.
	        X_train_ids = self.vocabulary.corpus_to_tensor(X_train, is_tokenized=False)
	        X_test_ids = self.vocabulary.corpus_to_tensor(X_test, is_tokenized=False)

	        # Pad/truncate at the end so real tokens stay at the front.
	        X_train_padded = pad_sequences(X_train_ids, maxlen=maxlen, padding='post', truncating='post')
	        X_test_padded = pad_sequences(X_test_ids, maxlen=maxlen, padding='post', truncating='post')

	        print(">>> Debug Split and Convert:")
	        print("X_train_padded.shape:", X_train_padded.shape)
	        print("X_test_padded.shape: ", X_test_padded.shape)
	        print("y_train length:", len(y_train))
	        print("y_test length: ", len(y_test))
	        print("vocab_size:", len(self.vocabulary))

	        if for_keras:
	            num_classes = len(label_mapping)
	            y_train_onehot = torch.nn.functional.one_hot(
	                torch.tensor(y_train),
	                num_classes=num_classes
	            ).numpy()
	            y_test_onehot = torch.nn.functional.one_hot(
	                torch.tensor(y_test),
	                num_classes=num_classes
	            ).numpy()

	            print("y_train_onehot.shape:", y_train_onehot.shape)
	            print("y_test_onehot.shape: ", y_test_onehot.shape)

	            return X_train_padded, X_test_padded, y_train_onehot, y_test_onehot, label_mapping

	        # PyTorch path: wrap the tensors in DataLoaders.
	        X_train_t = torch.tensor(X_train_padded, dtype=torch.long)
	        X_test_t = torch.tensor(X_test_padded, dtype=torch.long)
	        y_train_t = torch.tensor(y_train, dtype=torch.long)
	        y_test_t = torch.tensor(y_test, dtype=torch.long)

	        train_ds = TensorDataset(X_train_t, y_train_t)
	        test_ds = TensorDataset(X_test_t, y_test_t)

	        train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
	        test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

	        return train_loader, test_loader, label_mapping


	# ========== MÔ HÌNH PYTORCH RNN ==========

	class SimpleRNN(nn.Module):
	    """Single-layer LSTM classifier on top of a pretrained embedding table.

	    Token id 0 is treated as padding. Inputs are expected to be padded at
	    the end of each sequence (post-padding).

	    Args:
	        pretrained_weight: numpy array of shape (vocab_size, embedding_dim).
	        hidden_dim: Size of the LSTM hidden state.
	        output_dim: Number of output classes.
	        dropout: Dropout probability applied to the embeddings and to the
	            final hidden state.
	    """

	    PAD_ID = 0

	    def __init__(self, pretrained_weight, hidden_dim, output_dim, dropout=0.3):
	        super().__init__()
	        embedding_dim = pretrained_weight.shape[1]
	        # Copy the weights: torch.from_numpy shares memory with the numpy
	        # array, so training on CPU would silently modify the caller's
	        # matrix (which may be reused to initialize another model).
	        weights = torch.from_numpy(pretrained_weight).clone().float()
	        self.embedding = nn.Embedding.from_pretrained(
	            weights,
	            freeze=False  # set to True to keep the embeddings fixed
	        )
	        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
	        self.dropout = nn.Dropout(dropout)
	        self.fc = nn.Linear(hidden_dim, output_dim)

	    def forward(self, x):
	        """Return class logits of shape (batch, output_dim) for token ids ``x``."""
	        embedded = self.dropout(self.embedding(x))
	        # Pack by true length so the final hidden state is taken at the last
	        # real token rather than after the trailing padding. Empty rows are
	        # clamped to length 1 because packing rejects zero-length sequences.
	        lengths = (x != self.PAD_ID).sum(dim=1).clamp(min=1).cpu()
	        packed = nn.utils.rnn.pack_padded_sequence(
	            embedded, lengths, batch_first=True, enforce_sorted=False
	        )
	        _, (hidden, _) = self.rnn(packed)
	        # hidden is (num_layers, batch, hidden_dim); use the last layer.
	        return self.fc(self.dropout(hidden[-1]))


	def predict_emotion_rnn(model, text, data_manager, label_mapping, device, maxlen=400):
	    """Predict the emotion label of one raw sentence with the PyTorch model.

	    Args:
	        model: Trained PyTorch model; it is switched to eval mode.
	        text: Raw input sentence.
	        data_manager: Provides ``abbreviations`` and ``vocabulary``.
	        label_mapping: Mapping label -> class index used during training.
	        device: Torch device the model lives on.
	        maxlen: Padded sequence length; must match the value used for
	            training (default 400).

	    Returns:
	        The label whose index has the highest score.
	    """
	    model.eval()
	    with torch.no_grad():
	        processed_text = preprocess_sentence(text, data_manager.abbreviations, emoji_mapping)
	        tokenized_text = data_manager.vocabulary.tokenize_corpus([processed_text])
	        text_ids = data_manager.vocabulary.corpus_to_tensor(tokenized_text, is_tokenized=True)
	        text_padded = pad_sequences(text_ids, maxlen=maxlen, padding='post', truncating='post')
	        text_tensor = torch.tensor(text_padded, dtype=torch.long).to(device)

	        output = model(text_tensor)
	        predicted = torch.argmax(output, dim=1)

	    # Invert label -> index so the predicted index can be turned into a label.
	    rev_map = {v: k for k, v in label_mapping.items()}
	    return rev_map[predicted.item()]


	# ========== MÔ HÌNH KERAS CNN-LSTM ==========

	def predict_emotion_cnn_lstm(model, text, data_manager, label_mapping, maxlen=400):
	    """Predict the emotion label of one raw sentence with the Keras model.

	    Args:
	        model: Trained Keras model exposing ``predict``.
	        text: Raw input sentence.
	        data_manager: Provides ``abbreviations`` and ``vocabulary``.
	        label_mapping: Mapping label -> class index used during training.
	        maxlen: Padded sequence length; must match the model's input length
	            (default 400).

	    Returns:
	        The label whose index has the highest predicted probability.
	    """
	    processed_text = preprocess_sentence(text, data_manager.abbreviations, emoji_mapping)
	    tokenized_text = data_manager.vocabulary.tokenize_corpus([processed_text])
	    text_ids = data_manager.vocabulary.corpus_to_tensor(tokenized_text, is_tokenized=True)
	    text_padded = pad_sequences(text_ids, maxlen=maxlen, padding='post', truncating='post')
	    output = model.predict(text_padded)
	    # Plain int so the lookup does not depend on numpy scalar hashing.
	    pred = int(output.argmax(axis=1)[0])
	    rev_map = {v: k for k, v in label_mapping.items()}
	    return rev_map[pred]


	# ========== MAIN ==========

	def _evaluate_and_report(y_true, y_pred, target_names, test_loss, report_dir, headline, tag=""):
	    """Compute test metrics, print them and write them to a report file.

	    Args:
	        y_true: True class indices.
	        y_pred: Predicted class indices.
	        target_names: Class names ordered by class index.
	        test_loss: Average test loss to include in the report.
	        report_dir: Directory receiving ``classification_report.txt``.
	        headline: Format string for the first printed line; may use the
	            fields ``{loss}`` and ``{accuracy}``.
	        tag: Optional prefix (e.g. "CNN-LSTM ") used in section titles.
	    """
	    names = list(target_names)
	    label_ids = list(range(len(names)))
	    accuracy = accuracy_score(y_true, y_pred)

	    score_lines = []
	    for metric_name, metric_fn in (
	        ("Precision", precision_score),
	        ("Recall", recall_score),
	        ("F1-Score", f1_score),
	    ):
	        for avg_name, avg in (("Macro", "macro"), ("Weighted", "weighted")):
	            value = metric_fn(y_true, y_pred, average=avg, zero_division=0)
	            score_lines.append(f"{metric_name} ({avg_name}): {value:.4f}")

	    # Explicit labels keep the report aligned with target_names even if a
	    # class happens to be absent from both y_true and y_pred.
	    report = classification_report(
	        y_true, y_pred, labels=label_ids, target_names=names, digits=4, zero_division=0
	    )
	    conf_matrix = confusion_matrix(y_true, y_pred, labels=label_ids)

	    print(headline.format(loss=test_loss, accuracy=accuracy))
	    for line in score_lines:
	        print(line)

	    print(f"\n========== {tag}Classification Report ==========")
	    print(report)

	    print(f"\n========== {tag}Confusion Matrix ==========")
	    print(conf_matrix)

	    os.makedirs(report_dir, exist_ok=True)
	    with open(os.path.join(report_dir, "classification_report.txt"), "w", encoding="utf-8") as f:
	        f.write(f"========== {tag}Classification Report ==========\n")
	        f.write(report)
	        f.write("\n========== Additional Metrics ==========\n")
	        f.write(f"Test Loss: {test_loss:.4f}\n")
	        f.write(f"Test Accuracy: {accuracy:.4f}\n")
	        f.writelines(line + "\n" for line in score_lines)
	        f.write("\n========== Confusion Matrix ==========\n")
	        f.write(np.array2string(conf_matrix))

	    print(f"\n========== {tag}Classification Report saved to '{report_dir}/classification_report.txt' ==========")


	if __name__ == "__main__":
	    from keras.models import Model
	    from keras.layers import (
	        Input, Embedding, Convolution1D, GlobalMaxPooling1D, LSTM, Dense, Dropout, concatenate
	    )
	    from keras.optimizers import Adam
	    from keras.callbacks import ModelCheckpoint, EarlyStopping

	    MAXLEN = 400
	    BATCH_SIZE = 32

	    # -------- Paths ----------
	    file_path = "train.xlsx"
	    abbreviations_path = "abbreviations.json"
	    word2vec_path = "word2vec_vi_syllables_100dims.txt"
	    output_path = "processed.xlsx"

	    data_manager = DataManager(
	        file_path=file_path,
	        abbreviations_path=abbreviations_path,
	        word2vec_path=word2vec_path
	    )

	    # 1) Preprocess, build the vocabulary, load word2vec.
	    df = data_manager.preprocess_data()
	    print("Trước khi cân bằng lớp (undersampling/oversampling):")
	    print(df["Emotion"].value_counts())

	    # 2) Class balancing demo: oversample 'Other' up to 3000 rows and keep
	    #    the other listed classes as they are. Rows whose label is not in
	    #    class_order are dropped.
	    # NOTE(review): oversampling happens before the train/test split, so
	    # duplicated 'Other' rows can land in both sets and inflate test scores.
	    class_order = ["Enjoyment", "Other", "Anger", "Sadness", "Disgust", "Fear", "Surprise"]
	    frames = {name: df[df["Emotion"] == name] for name in class_order}
	    # resample() cannot draw from an empty frame, hence the lower bound.
	    if 0 < len(frames["Other"]) < 3000:
	        frames["Other"] = resample(
	            frames["Other"],
	            replace=True,
	            n_samples=3000,
	            random_state=42
	        )

	    df_balanced = pd.concat([frames[name] for name in class_order], axis=0)
	    df = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

	    print("\nSau khi cân bằng lớp (demo oversample):")
	    print(df["Emotion"].value_counts())

	    # Export the processed data.
	    df.to_excel(output_path, index=False)

	    # ========== TRAIN RNN PYTORCH ==========

	    print("\n========== Training PyTorch SimpleRNN ==========")

	    pretrained_matrix = data_manager.build_pretrained_embedding_matrix(embedding_dim=100)

	    # Pass a copy so the label column of df stays untouched for the second
	    # split below, whatever split_and_convert does to its input.
	    train_loader, test_loader, label_mapping = data_manager.split_and_convert(
	        df.copy(), label_column="Emotion", maxlen=MAXLEN, test_size=0.2,
	        for_keras=False, batch_size=BATCH_SIZE
	    )

	    hidden_dim = 128
	    output_dim = len(label_mapping)

	    # Give the RNN its own copy of the matrix so that fine-tuning it cannot
	    # alter the weights used to initialize the Keras model later.
	    model_rnn = SimpleRNN(pretrained_weight=pretrained_matrix.copy(),
	                          hidden_dim=hidden_dim,
	                          output_dim=output_dim,
	                          dropout=0.3)
	    criterion = nn.CrossEntropyLoss()
	    optimizer = optim.Adam(model_rnn.parameters(), lr=1e-3)

	    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	    model_rnn.to(device)

	    num_epochs = 20
	    for epoch in range(num_epochs):
	        model_rnn.train()
	        epoch_loss = 0
	        correct = 0
	        total = 0

	        for X_batch, y_batch in train_loader:
	            X_batch = X_batch.to(device)
	            y_batch = y_batch.to(device)

	            optimizer.zero_grad()
	            preds = model_rnn(X_batch)
	            loss = criterion(preds, y_batch)
	            loss.backward()
	            optimizer.step()

	            epoch_loss += loss.item()
	            _, pred_label = torch.max(preds, 1)
	            correct += (pred_label == y_batch).sum().item()
	            total += y_batch.size(0)

	        epoch_accuracy = correct / total
	        epoch_loss_avg = epoch_loss / len(train_loader)
	        print(f"Epoch {epoch+1}/{num_epochs}, "
	              f"Loss: {epoch_loss_avg:.4f}, "
	              f"Accuracy: {epoch_accuracy:.4f}")

	    # Evaluate on the test set.
	    model_rnn.eval()
	    test_loss = 0
	    y_true = []
	    y_pred = []
	    with torch.no_grad():
	        for X_batch, y_batch in test_loader:
	            X_batch = X_batch.to(device)
	            y_batch = y_batch.to(device)
	            preds = model_rnn(X_batch)
	            test_loss += criterion(preds, y_batch).item()

	            _, predicted = torch.max(preds, 1)
	            y_true.extend(y_batch.cpu().numpy())
	            y_pred.extend(predicted.cpu().numpy())

	    test_loss_avg = test_loss / len(test_loader)

	    rnn_report_dir = "rnn_emotion_model"
	    _evaluate_and_report(
	        y_true, y_pred, label_mapping, test_loss_avg, rnn_report_dir,
	        headline="\nTest Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}",
	    )

	    # Save the RNN model.
	    torch.save(model_rnn.state_dict(), os.path.join(rnn_report_dir, "simple_rnn.pth"))
	    print("========== RNN Model saved to 'rnn_emotion_model/simple_rnn.pth' ==========")

	    # ========== TRAIN CNN-LSTM KERAS ==========

	    print("\n========== Training CNN-LSTM (Keras) ==========")

	    X_train_keras, X_test_keras, y_train_keras, y_test_keras, label_mapping_keras = data_manager.split_and_convert(
	        df.copy(), label_column="Emotion", maxlen=MAXLEN, test_size=0.2,
	        for_keras=True
	    )

	    vocab_size, embedding_dim = pretrained_matrix.shape

	    # Make sure Keras receives float32 weights.
	    pretrained_matrix_keras = pretrained_matrix.astype(np.float32)

	    input_layer = Input(shape=(MAXLEN,), dtype='int32', name='main_input')
	    emb_layer = Embedding(
	        input_dim=vocab_size,
	        output_dim=embedding_dim,
	        weights=[pretrained_matrix_keras],
	        trainable=True  # set to False to keep the embeddings fixed
	    )(input_layer)

	    # Max over the time axis. GlobalMaxPooling1D computes the same value as
	    # a Lambda around tf.reduce_max(axis=1) but is a built-in layer, so the
	    # saved model can be reloaded without supplying custom objects.
	    con3 = Convolution1D(150, kernel_size=3, activation='relu')(emb_layer)
	    pool_con3 = GlobalMaxPooling1D()(con3)

	    con5 = Convolution1D(150, kernel_size=5, activation='relu')(emb_layer)
	    pool_con5 = GlobalMaxPooling1D()(con5)

	    lstm_out = LSTM(128, dropout=0.3)(emb_layer)

	    merged = concatenate([pool_con3, pool_con5, lstm_out])
	    dense = Dense(100, activation='relu')(merged)
	    drop = Dropout(0.3)(dense)
	    output = Dense(len(label_mapping_keras), activation='softmax')(drop)

	    model_cnn_lstm = Model(inputs=input_layer, outputs=output)
	    model_cnn_lstm.compile(
	        loss='categorical_crossentropy',
	        # `lr` is the legacy argument name; current Keras expects `learning_rate`.
	        optimizer=Adam(learning_rate=1e-3),
	        metrics=['accuracy']
	    )

	    checkpoint = ModelCheckpoint(
	        'cnn_lstm_best.keras',
	        save_best_only=True,
	        monitor='val_accuracy',
	        mode='max'
	    )
	    early_stopping = EarlyStopping(
	        monitor='val_accuracy',
	        patience=5,
	        restore_best_weights=True
	    )

	    # NOTE(review): the test set doubles as the validation set for early
	    # stopping and checkpointing, so the reported test scores are optimistic.
	    model_cnn_lstm.fit(
	        X_train_keras, y_train_keras,
	        validation_data=(X_test_keras, y_test_keras),
	        epochs=30,
	        batch_size=BATCH_SIZE,
	        callbacks=[checkpoint, early_stopping]
	    )

	    # Evaluate on the test set.
	    loss, acc = model_cnn_lstm.evaluate(X_test_keras, y_test_keras)
	    print(f"CNN-LSTM Test Loss: {loss:.4f}, Test Accuracy: {acc:.4f}")

	    # Collect predictions and compute the detailed metrics.
	    y_pred_cnn_lstm = np.argmax(model_cnn_lstm.predict(X_test_keras), axis=1)
	    y_true_cnn_lstm = np.argmax(y_test_keras, axis=1)

	    cnn_lstm_report_dir = "cnn_lstm_emotion_model"
	    _evaluate_and_report(
	        y_true_cnn_lstm, y_pred_cnn_lstm, label_mapping_keras, loss, cnn_lstm_report_dir,
	        headline="\nCNN-LSTM Test Accuracy: {accuracy:.4f}",
	        tag="CNN-LSTM ",
	    )

	    # Save the CNN-LSTM model.
	    model_cnn_lstm.save(os.path.join(cnn_lstm_report_dir, 'cnn_lstm_model.keras'))
	    print(f"========== CNN-LSTM Model saved to '{cnn_lstm_report_dir}/cnn_lstm_model.keras' ==========")

	    # ========== SAVE LABEL MAPPING AND VOCABULARY ==========
	    # Both models share the same vocabulary and label mapping, so they are
	    # stored once, next to the RNN model.
	    with open(os.path.join(rnn_report_dir, "label_mapping.json"), "w", encoding="utf-8") as f:
	        json.dump(label_mapping, f, ensure_ascii=False, indent=4)

	    with open(os.path.join(rnn_report_dir, "vocabulary.json"), "w", encoding="utf-8") as f:
	        json.dump(data_manager.vocabulary.word2id, f, ensure_ascii=False, indent=4)

	    print("========== Label Mapping and Vocabulary saved ==========")

	    # ========== DEMO: PREDICT ONE NEW SENTENCE ==========

	    custom_text = "Tôi rất vui khi sử dụng dịch vụ này!"

	    # RNN (PyTorch)
	    emotion_rnn = predict_emotion_rnn(
	        model_rnn, custom_text, data_manager, label_mapping, device
	    )
	    print(f"Predicted Emotion (RNN): {emotion_rnn}")

	    # CNN-LSTM (Keras), reloaded from disk to check the saved file works.
	    cnn_lstm_loaded = tf.keras.models.load_model(os.path.join(cnn_lstm_report_dir, 'cnn_lstm_model.keras'))
	    emotion_cnn_lstm = predict_emotion_cnn_lstm(
	        cnn_lstm_loaded, custom_text, data_manager, label_mapping_keras
	    )
	    print(f"Predicted Emotion (CNN-LSTM): {emotion_cnn_lstm}")

	    # Report the TensorFlow version and visible GPUs.
	    print("TF version:", tf.__version__)
	    print("GPU devices:", tf.config.list_physical_devices("GPU"))