|
|
|
|
|
|
|
import re |
|
import emoji |
|
import json |
|
import pandas as pd |
|
import torch |
|
import numpy as np |
|
import os |
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
|
|
from transformers import ( |
|
AutoTokenizer, |
|
AutoConfig, |
|
AutoModelForSequenceClassification, |
|
Trainer, |
|
TrainingArguments |
|
) |
|
|
|
from sklearn.model_selection import train_test_split |
|
from sklearn.utils import resample |
|
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report, confusion_matrix |
|
|
|
|
|
|
|
|
|
|
|
def replace_emojis(sentence, emoji_mapping):
    """Swap mapped emojis for their text tags and drop all other emojis.

    Args:
        sentence: Text to process; it is scanned one character at a time.
        emoji_mapping: Dict from a single character to its replacement text.

    Returns:
        A new string in which every character found in ``emoji_mapping`` is
        replaced by its mapped value, every remaining character that
        ``emoji.is_emoji`` reports as an emoji is removed, and all other
        characters are kept as they are.
    """

    def render(symbol):
        # The mapping wins over the generic emoji check.
        if symbol in emoji_mapping:
            return emoji_mapping[symbol]
        # Unmapped emojis contribute nothing to the output.
        if emoji.is_emoji(symbol):
            return ""
        return symbol

    return "".join(render(symbol) for symbol in sentence)
|
|
|
def remove_profanity(sentence):
    """Remove profane words and phrases from a sentence.

    Matching is case-insensitive and works on whitespace-separated words.
    Multi-word entries (such as "đù mé") are matched against consecutive
    words; a plain per-word lookup could never match them because a single
    word never contains a space.

    Args:
        sentence: The text to filter.

    Returns:
        The remaining words joined by single spaces (so runs of whitespace
        are collapsed as a side effect). Returns "" for empty input.
    """
    profane_entries = ["loz", "vloz", "vl", "dm", "đm", "clgt", "dmm", "cc", "vc", "đù mé", "vãi"]
    # Turn every entry into a tuple of words and try longer phrases first so a
    # phrase is removed as a whole before any of its parts is considered.
    patterns = sorted(
        (tuple(entry.split()) for entry in profane_entries),
        key=len,
        reverse=True,
    )

    words = sentence.split()
    lowered = [word.lower() for word in words]

    kept = []
    position = 0
    while position < len(words):
        for pattern in patterns:
            window = tuple(lowered[position:position + len(pattern)])
            if window == pattern:
                # Skip the whole matched word/phrase.
                position += len(pattern)
                break
        else:
            # No pattern starts here: keep the word with its original casing.
            kept.append(words[position])
            position += 1
    return " ".join(kept)
|
|
|
def remove_special_characters(sentence):
    """Delete a fixed set of symbol characters from the sentence.

    The removed characters are: ^ * @ # & $ % < > ~ { } | and backslash.
    Everything else (including square brackets and punctuation such as
    ``!`` or ``.``) is left in place.

    Args:
        sentence: The text to clean.

    Returns:
        The text with every listed character removed.
    """
    special_chars = re.compile(r"[\^\*@#&$%<>~{}|\\]")
    cleaned = special_chars.sub("", sentence)
    return cleaned
|
|
|
def normalize_whitespace(sentence):
    """Collapse every run of whitespace into one space and trim both ends.

    Args:
        sentence: The text to normalize.

    Returns:
        The whitespace-separated chunks of ``sentence`` joined by single
        spaces; "" when the input contains no non-whitespace characters.
    """
    chunks = sentence.split()
    return " ".join(chunks)
|
|
|
def remove_repeated_characters(sentence):
    """Shrink any character repeated three or more times in a row to one.

    Runs of exactly two identical characters are left untouched.

    Args:
        sentence: The text to process.

    Returns:
        The text with each run of 3+ identical characters replaced by a
        single occurrence of that character.
    """
    long_run = re.compile(r"(.)\1{2,}")
    return long_run.sub(r"\1", sentence)
|
|
|
def replace_numbers(sentence):
    """Replace every run of digits with the literal token ``[number]``.

    Args:
        sentence: The text to process.

    Returns:
        The text in which each maximal sequence of digits has been
        substituted by ``[number]``.
    """
    digit_run = re.compile(r"\d+")
    return digit_run.sub("[number]", sentence)
|
|
|
def tokenize_underthesea(sentence):
    """Word-segment a Vietnamese sentence with underthesea.

    ``word_tokenize`` returns a list of tokens; a multi-syllable word comes
    back as one token that contains spaces. Joining those tokens with plain
    spaces would erase the word boundaries again, so the syllables inside
    each token are joined with ``_`` first. This is the word-segmented form
    PhoBERT expects as input.

    Args:
        sentence: The text to segment.

    Returns:
        The segmented sentence: tokens separated by single spaces, with the
        syllables of each multi-syllable token joined by underscores.
    """
    # Imported lazily so the module can be loaded without underthesea.
    from underthesea import word_tokenize

    tokens = word_tokenize(sentence)
    return " ".join(token.replace(" ", "_") for token in tokens)
|
|
|
def preprocess_sentence(sentence, abbreviations, emoji_mapping):
    """Run the full text-cleaning pipeline on one sentence.

    Steps, in order: lowercase, emoji replacement, profanity removal,
    special-character removal, whitespace normalization, abbreviation
    expansion, collapsing of repeated characters, number replacement and
    finally word segmentation.

    Args:
        sentence: Raw input text.
        abbreviations: Dict from an abbreviation (matched against whole,
            lowercased words) to its expansion. The expansion may be a
            string or an iterable of strings; an iterable is joined with
            spaces.
        emoji_mapping: Dict from emoji character to replacement text,
            forwarded to ``replace_emojis``.

    Returns:
        The cleaned, word-segmented sentence.
    """
    sentence = sentence.lower()
    sentence = replace_emojis(sentence, emoji_mapping)
    sentence = remove_profanity(sentence)
    sentence = remove_special_characters(sentence)
    sentence = normalize_whitespace(sentence)

    expanded = []
    for word in sentence.split():
        if word not in abbreviations:
            expanded.append(word)
            continue
        expansion = abbreviations[word]
        if isinstance(expansion, str):
            # Joining a plain string would insert a space between every
            # character, so a string expansion is used as-is.
            expanded.append(expansion)
        else:
            expanded.append(" ".join(expansion))
    sentence = " ".join(expanded)

    sentence = remove_repeated_characters(sentence)
    sentence = replace_numbers(sentence)

    sentence = tokenize_underthesea(sentence)
    return sentence
|
|
|
# Lookup table from a single emoji character to a bracketed emotion tag.
# The script passes it to preprocess_sentence so that emojis are turned into
# text tags instead of being dropped.
emoji_mapping = {
    "😀": "[joy]", "😃": "[joy]", "😄": "[joy]", "😁": "[joy]", "😆": "[joy]", "😅": "[joy]", "😂": "[joy]", "🤣": "[joy]",
    "🙂": "[love]", "🙃": "[love]", "😉": "[love]", "😊": "[love]", "😇": "[love]", "🥰": "[love]", "😍": "[love]",
    "🤩": "[love]", "😘": "[love]", "😗": "[love]", "☺": "[love]", "😚": "[love]", "😙": "[love]",
    "😋": "[satisfaction]", "😛": "[satisfaction]", "😜": "[satisfaction]", "🤪": "[satisfaction]", "😝": "[satisfaction]",
    "🤑": "[satisfaction]",
    "🤐": "[neutral]", "🤨": "[neutral]", "😐": "[neutral]", "😑": "[neutral]", "😶": "[neutral]",
    "😏": "[sarcasm]",
    "😒": "[disappointment]", "🙄": "[disappointment]", "😬": "[disappointment]",
    "😔": "[sadness]", "😪": "[sadness]", "😢": "[sadness]", "😭": "[sadness]", "😥": "[sadness]", "😓": "[sadness]",
    "😩": "[tiredness]", "😫": "[tiredness]", "🥱": "[tiredness]",
    "🤤": "[discomfort]", "🤢": "[discomfort]", "🤮": "[discomfort]", "🤧": "[discomfort]", "🥵": "[discomfort]",
    "🥶": "[discomfort]", "🥴": "[discomfort]", "😵": "[discomfort]", "🤯": "[discomfort]",
    "😕": "[confused]", "😟": "[confused]", "🙁": "[confused]", "☹": "[confused]",
    "😮": "[surprise]", "😯": "[surprise]", "😲": "[surprise]", "😳": "[surprise]", "🥺": "[pleading]",
    "😦": "[fear]", "😧": "[fear]", "😨": "[fear]", "😰": "[fear]", "😱": "[fear]",
    "😖": "[confusion]", "😣": "[confusion]", "😞": "[confusion]",
    "😤": "[anger]", "😡": "[anger]", "😠": "[anger]", "🤬": "[anger]", "😈": "[mischievous]", "👿": "[mischievous]"
}
|
|
|
def load_abbreviations(path):
    """Read the abbreviation table from a UTF-8 encoded JSON file.

    Args:
        path: Location of the JSON file.

    Returns:
        The object decoded from the file's JSON content.
    """
    with open(path, "r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
|
|
|
|
|
class PhoBertEmotionDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with label values.

    ``encodings`` is a mapping whose values are indexable per sample
    (the script passes the tokenizer output); ``labels`` is an indexable
    sequence with one entry per sample.
    """

    def __init__(self, encodings, labels):
        """Store the encodings and labels without copying them."""
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Build one sample: each encoding field plus ``labels`` as tensors."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # End-to-end script: preprocess the data, split it, balance the training
    # split, fine-tune PhoBERT, evaluate on the test split, save the
    # artefacts and run one demo prediction.

    # ---- Input / output locations ----
    file_path = "train.xlsx"
    abbreviations_path = "abbreviations.json"
    output_path = "processed_phobert.xlsx"

    abbreviations = load_abbreviations(abbreviations_path)

    # ---- Load and preprocess ----
    df = pd.read_excel(file_path)
    if "Sentence" not in df.columns or "Emotion" not in df.columns:
        raise ValueError("Dataset phải chứa cột 'Sentence' và 'Emotion'!")

    df["processed_sentence"] = df["Sentence"].apply(
        lambda x: preprocess_sentence(str(x), abbreviations, emoji_mapping)
    )

    # Drop rows whose text became empty after preprocessing.
    df = df[df["processed_sentence"].str.strip().astype(bool)].reset_index(drop=True)

    # ---- Label encoding (built from the full data so ids are stable) ----
    unique_labels = sorted(df["Emotion"].unique())
    label2id = {label: i for i, label in enumerate(unique_labels)}
    id2label = {v: k for k, v in label2id.items()}
    label_ids = list(range(len(unique_labels)))

    df["label_id"] = df["Emotion"].map(label2id)

    # ---- Split BEFORE balancing ----
    # Oversampling duplicates rows. If it ran before the split, copies of the
    # same sentence would end up in both train and test and inflate every
    # test metric. Splitting first keeps the test set free of training rows.
    # Note: a stratified split needs at least two samples per class.
    train_df, test_df = train_test_split(
        df, test_size=0.2, random_state=42, stratify=df["label_id"]
    )

    print("Trước khi cân bằng:")
    print(train_df["Emotion"].value_counts())

    # ---- Balance the training split only (random oversampling) ----
    max_count = train_df["Emotion"].value_counts().max()

    balanced_parts = []
    for emo in train_df["Emotion"].unique():
        df_emo = train_df[train_df["Emotion"] == emo]
        if len(df_emo) < max_count:
            # Sample with replacement until the class reaches max_count rows.
            df_emo = resample(
                df_emo,
                replace=True,
                n_samples=max_count,
                random_state=42
            )
        balanced_parts.append(df_emo)

    train_df = pd.concat(balanced_parts, axis=0)
    train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)

    print("\nSau khi cân bằng tất cả lớp:")
    print(train_df["Emotion"].value_counts())

    # Persist the balanced training data with the same columns as before
    # (the helper label_id column is not written).
    train_df.drop(columns=["label_id"]).to_excel(output_path, index=False)

    print(f"Train size = {len(train_df)}, Test size = {len(test_df)}")

    # ---- Tokenization ----
    checkpoint = "vinai/phobert-base"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    def tokenize_texts(texts):
        """Tokenize a list of texts, padded and truncated to 256 tokens."""
        return tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=256
        )

    train_texts = train_df["processed_sentence"].tolist()
    train_labels = train_df["label_id"].tolist()
    test_texts = test_df["processed_sentence"].tolist()
    test_labels = test_df["label_id"].tolist()

    train_encodings = tokenize_texts(train_texts)
    test_encodings = tokenize_texts(test_texts)

    train_dataset = PhoBertEmotionDataset(train_encodings, train_labels)
    test_dataset = PhoBertEmotionDataset(test_encodings, test_labels)

    # ---- Model ----
    config = AutoConfig.from_pretrained(checkpoint)
    config.num_labels = len(label2id)
    # Store the label names in the config so the saved model is
    # self-describing when it is loaded again.
    config.id2label = {i: str(label) for i, label in id2label.items()}
    config.label2id = {str(label): i for label, i in label2id.items()}
    model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint,
        config=config
    )

    # ---- Training configuration ----
    # NOTE(review): newer transformers releases appear to rename
    # `evaluation_strategy` to `eval_strategy` and Trainer's `tokenizer`
    # to `processing_class` - confirm against the installed version.
    training_args = TrainingArguments(
        output_dir="./phobert_results_v2",
        overwrite_output_dir=True,
        do_train=True,
        do_eval=True,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=10,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        learning_rate=2e-5,
        logging_dir="./logs",
        logging_steps=50,
        load_best_model_at_end=True,
        metric_for_best_model="f1_weighted",
        greater_is_better=True,
        seed=42
    )

    def compute_metrics(eval_pred):
        """Compute accuracy plus weighted and macro precision/recall/F1."""
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=-1)
        precision_weighted = precision_score(labels, preds, average='weighted', zero_division=0)
        recall_weighted = recall_score(labels, preds, average='weighted', zero_division=0)
        f1_weighted = f1_score(labels, preds, average='weighted', zero_division=0)
        precision_macro = precision_score(labels, preds, average='macro', zero_division=0)
        recall_macro = recall_score(labels, preds, average='macro', zero_division=0)
        f1_macro = f1_score(labels, preds, average='macro', zero_division=0)
        accuracy = accuracy_score(labels, preds)
        return {
            "accuracy": accuracy,
            "precision_weighted": precision_weighted,
            "recall_weighted": recall_weighted,
            "f1_weighted": f1_weighted,
            "precision_macro": precision_macro,
            "recall_macro": recall_macro,
            "f1_macro": f1_macro
        }

    # NOTE(review): the test split is also used as eval_dataset, so the best
    # checkpoint is selected on the same data that is reported below. A
    # separate validation split would give an unbiased test score.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    print("\n========== Training PhoBERT (balanced, more epochs) ==========")
    trainer.train()

    print("\n========== Evaluate on Test set ==========")
    results = trainer.evaluate(test_dataset)
    print("Test results:", results)

    print("\n========== Additional Metrics ==========")
    print(f"Test Loss: {results.get('eval_loss'):.4f}")
    print(f"Test Accuracy: {results.get('eval_accuracy'):.4f}")
    print(f"Precision (Macro): {results.get('eval_precision_macro'):.4f}")
    print(f"Precision (Weighted): {results.get('eval_precision_weighted'):.4f}")
    print(f"Recall (Macro): {results.get('eval_recall_macro'):.4f}")
    print(f"Recall (Weighted): {results.get('eval_recall_weighted'):.4f}")
    print(f"F1-Score (Macro): {results.get('eval_f1_macro'):.4f}")
    print(f"F1-Score (Weighted): {results.get('eval_f1_weighted'):.4f}")

    print("\n========== Detailed Classification Report ==========")
    predictions, true_labels, _ = trainer.predict(test_dataset)
    preds = np.argmax(predictions, axis=1)
    # Pass the full label list explicitly: the test split keeps the natural
    # class distribution, so a rare class may be absent from it, and the
    # report/matrix must still line up with unique_labels.
    report = classification_report(
        true_labels,
        preds,
        labels=label_ids,
        target_names=unique_labels,
        digits=4,
        zero_division=0
    )
    print(report)

    conf_matrix = confusion_matrix(true_labels, preds, labels=label_ids)
    print("\nConfusion Matrix:")
    print(conf_matrix)

    # ---- Save the confusion-matrix plot ----
    os.makedirs("phobert_emotion_model", exist_ok=True)
    confusion_matrix_path = os.path.join("phobert_emotion_model", "confusion_matrix.png")

    plt.figure(figsize=(10, 8))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=unique_labels,
                yticklabels=unique_labels)
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.title('Confusion Matrix')
    plt.tight_layout()
    plt.savefig(confusion_matrix_path)
    plt.close()
    print(f"\nConfusion Matrix plot saved to '{confusion_matrix_path}'")

    # ---- Save the text report ----
    report_path = os.path.join("phobert_emotion_model", "classification_report.txt")
    with open(report_path, "w", encoding="utf-8") as f:
        f.write("========== Classification Report ==========\n")
        f.write(report)
        f.write("\n========== Confusion Matrix ==========\n")
        f.write(np.array2string(conf_matrix))

    print(f"\nClassification Report saved to '{report_path}'")

    # ---- Save model, tokenizer and label map ----
    model_output_dir = "./phobert_emotion_model"
    os.makedirs(model_output_dir, exist_ok=True)
    model.save_pretrained(os.path.join(model_output_dir, "phobert_emotion_model"))
    tokenizer.save_pretrained(os.path.join(model_output_dir, "phobert_emotion_model"))
    with open(os.path.join(model_output_dir, "id2label.json"), "w", encoding="utf-8") as f:
        json.dump(id2label, f, ensure_ascii=False, indent=4)

    print("\n========== Model and Tokenizer saved ==========")

    # ---- Demo inference ----
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    # Make sure dropout is disabled for inference regardless of the state the
    # trainer left the model in.
    model.eval()

    def predict_text(text):
        """Preprocess one raw sentence and return its predicted label."""
        text_proc = preprocess_sentence(text, abbreviations, emoji_mapping)
        enc = tokenizer(text_proc, padding=True, truncation=True, max_length=256, return_tensors="pt")
        enc = {k: v.to(device) for k, v in enc.items()}
        with torch.no_grad():
            out = model(**enc)
        pred_id = out.logits.argmax(dim=-1).item()
        return id2label[pred_id]

    custom_text = "Tôi rất vui khi sử dụng dịch vụ này!"
    emotion_pred = predict_text(custom_text)
    print("\nCâu ví dụ:", custom_text)
    print("Dự đoán cảm xúc:", emotion_pred)

    print("\nHoàn thành demo PhoBERT với cân bằng dữ liệu & nhiều epoch hơn!")
|
|