ChatBot-MentalHealth / train_model.py
import json
import os
import random

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    BertForSequenceClassification,
    BertTokenizerFast,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
)

# Set the random seed to guarantee reproducibility
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)

# Load the dataset (simplified for UTF-8)
def load_data(file_path):
    data = pd.read_csv(file_path, encoding='utf-8')
    return data

# Normalize text while preserving special characters (accents, Ñ)
def normalize_text(text):
    if isinstance(text, str):
        return text.strip().upper()
    return text
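# Example: normalize_text('  ansiedad ') -> 'ANSIEDAD', so casing differences
# and stray whitespace in the CSV do not create spurious label variants.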

# Clean and prepare the data
def clean_and_prepare_data(data):
    data = data.copy()
    # Drop rows with null values
    data = data.dropna(subset=['text', 'label'])
    # Normalize the labels
    data['label'] = data['label'].apply(normalize_text)
    # Define the expected labels
    emotion_labels = ['FELICIDAD', 'NEUTRAL', 'DEPRESIÓN', 'ANSIEDAD', 'ESTRÉS',
                      'EMERGENCIA', 'CONFUSIÓN', 'IRA', 'MIEDO', 'SORPRESA', 'DISGUSTO']
    # Keep only the known labels
    data = data[data['label'].isin(emotion_labels)]
    # Build the label mapping
    label_to_id = {label: idx for idx, label in enumerate(emotion_labels)}
    data['label'] = data['label'].map(label_to_id)
    # Make sure no NaN values remain
    if data['label'].isna().any():
        data = data.dropna(subset=['label'])
    data['label'] = data['label'].astype(int)
    return data, emotion_labels, label_to_id
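# For illustration: with the list above, label_to_id is
#   {'FELICIDAD': 0, 'NEUTRAL': 1, 'DEPRESIÓN': 2, ..., 'DISGUSTO': 10}
# and id_to_label (built in __main__) is simply its inverse.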

# Split the data into train and validation sets
def split_data(data):
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        data['text'], data['label'],
        test_size=0.2,
        stratify=data['label'],
        random_state=42
    )
    return train_texts, val_texts, train_labels, val_labels
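# Note: stratify=data['label'] keeps per-class proportions identical across the
# 80/20 split, so an emotion that is 5% of the data stays ~5% of each side.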

# Compute class weights to counter class imbalance
def get_class_weights(labels):
    class_weights = compute_class_weight(
        class_weight='balanced',
        classes=np.unique(labels),
        y=labels
    )
    return torch.tensor(class_weights, dtype=torch.float)
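# Worked example: sklearn's 'balanced' heuristic is n_samples / (n_classes * bincount),
# so for labels [0, 0, 0, 1] it yields [4 / (2 * 3), 4 / (2 * 1)] = [0.67, 2.0];
# the minority class counts three times as much in the weighted loss below.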

# Tokenize the data (no padding here; the data collator handles it per batch)
def tokenize_data(tokenizer, texts, labels):
    dataset = Dataset.from_dict({'text': texts.tolist(), 'label': labels.tolist()})
    dataset = dataset.map(lambda batch: tokenizer(batch['text'], truncation=True, max_length=128), batched=True)
    return dataset
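# map() adds 'input_ids', 'token_type_ids' and 'attention_mask' columns;
# DataCollatorWithPadding then pads each batch only to its longest sequence
# (dynamic padding), which is cheaper than padding everything to 128 up front.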

# Custom loss function that incorporates the class weights
# (relies on the module-level class_weights tensor set in __main__ before training)
def custom_loss(labels, logits):
    loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)
    return loss_fct(logits, labels)
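# CrossEntropyLoss(weight=w) scales each sample's loss by w[true_class], so
# mistakes on under-represented emotions are penalized more heavily.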

# CustomTrainer subclass that swaps in the weighted loss
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # Pop the labels so the forward pass does not also compute the default loss
        labels = inputs.pop("labels").to(model.device)
        # Forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Compute the weighted loss
        loss = custom_loss(labels, logits)
        return (loss, outputs) if return_outputs else loss
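# Note: BertForSequenceClassification computes its own unweighted cross-entropy
# whenever 'labels' is passed in; overriding Trainer.compute_loss is the standard
# hook for substituting the weighted version. **kwargs absorbs extra arguments
# (e.g. num_items_in_batch) passed by newer transformers releases.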

# Compute the evaluation metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = labels.astype(int)
    predictions = predictions.astype(int)
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='weighted')
    precision = precision_score(labels, predictions, average='weighted')
    recall = recall_score(labels, predictions, average='weighted')
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }
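# average='weighted' weights each class's score by its support, so frequent
# emotions dominate the aggregate; average='macro' would treat all 11 equally.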

# Predict the label for a given text
def predict(text):
    # Tokenize the text
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    # Run inference
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(probs, dim=-1).item()
    label = id_to_label.get(predicted_class, "Unknown label")
    return label
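# predict() returns one of the strings in emotion_labels (e.g. 'FELICIDAD');
# it relies on the module-level tokenizer, model, device and id_to_label that
# the __main__ block below creates before calling it.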

if __name__ == '__main__':
    # Set up the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"\nUsing device: {device}")

    # Path to the CSV file
    current_dir = os.path.dirname(os.path.abspath(__file__))
    input_file = os.path.join(current_dir, 'data', 'emotion_dataset.csv')

    # Step 1: Load and prepare the data
    data = load_data(input_file)
    data, emotion_labels, label_to_id = clean_and_prepare_data(data)
    id_to_label = {v: k for k, v in label_to_id.items()}

    # Step 2: Split the data
    train_texts, val_texts, train_labels, val_labels = split_data(data)

    # Step 3: Compute the class weights
    class_weights = get_class_weights(train_labels).to(device)

    # Step 4: Set up the tokenizer
    tokenizer = BertTokenizerFast.from_pretrained('dccuchile/bert-base-spanish-wwm-cased')

    # Step 5: Tokenize the data
    train_dataset = tokenize_data(tokenizer, train_texts, train_labels)
    val_dataset = tokenize_data(tokenizer, val_texts, val_labels)

    # Step 6: Set up the data collator
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Step 7: Set up the model
    model = BertForSequenceClassification.from_pretrained(
        'dccuchile/bert-base-spanish-wwm-cased',
        num_labels=len(emotion_labels)
    )
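    # 'dccuchile/bert-base-spanish-wwm-cased' is BETO, a Spanish BERT; since the
    # checkpoint has no classification head, from_pretrained adds a randomly
    # initialized 11-way head (hence the usual "newly initialized" warning),
    # which is what the fine-tuning below trains.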

    # Step 8: Configure the training
    training_args = TrainingArguments(
        output_dir='./models/bert_emotion_model',
        num_train_epochs=5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        learning_rate=2e-5,
        lr_scheduler_type='linear',
        warmup_steps=500,
        eval_steps=500,
        save_steps=500,
        save_total_limit=1,
        evaluation_strategy="steps",
        save_strategy="steps",
        logging_dir='./logs',
        logging_steps=100,
        load_best_model_at_end=True,
        metric_for_best_model='eval_loss',
        report_to="none"
    )
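    # With evaluation_strategy="steps" and eval_steps=500, eval_loss is measured
    # every 500 steps; EarlyStoppingCallback(early_stopping_patience=2) below then
    # stops training after 2 consecutive evaluations (1000 steps) without
    # improvement, and load_best_model_at_end restores the best checkpoint.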

    # Step 9: Create the custom trainer
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        data_collator=data_collator,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    # Step 10: Train the model
    trainer.train()

    # Step 11: Save the model and the tokenizer
    trainer.save_model('./models/bert_emotion_model')
    tokenizer.save_pretrained('./models/bert_emotion_model')

    # Step 12: Save the label mappings
    with open('./models/bert_emotion_model/label_to_id.json', 'w') as f:
        json.dump(label_to_id, f)
    with open('./models/bert_emotion_model/id_to_label.json', 'w') as f:
        json.dump(id_to_label, f)
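    # Caveat: JSON object keys are always strings, so json.dump turns the int keys
    # of id_to_label into strings ("0", "1", ...); any consumer of the file must
    # cast them back, e.g. {int(k): v for k, v in json.load(f).items()}.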
print("\nModelo entrenado y guardado exitosamente.")
# Paso 13: Probar el modelo con un ejemplo
sample_text = "Me siento muy feliz hoy"
print(f"Texto: {sample_text}")
print(f"Predicci贸n: {predict(sample_text)}")