# ChatBot-MentalHealth / train_model.py
import os
import json
import random

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    BertTokenizerFast,
    BertForSequenceClassification,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
)

# Set the seed to guarantee reproducibility
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)
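
# For stricter determinism on GPU you could additionally set the following
# (optional, and it may slow training down):
#   torch.backends.cudnn.deterministic = True
#   torch.backends.cudnn.benchmark = False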

# Load the dataset (simplified for UTF-8)
def load_data(file_path):
    data = pd.read_csv(file_path, encoding='utf-8')
    return data

# Normalize text while preserving special characters
def normalize_text(text):
    if isinstance(text, str):
        return text.strip().upper()
    return text

# Clean and prepare the data
def clean_and_prepare_data(data):
    data = data.copy()
    # Drop rows with null values
    data = data.dropna(subset=['text', 'label'])
    # Normalize the labels
    data['label'] = data['label'].apply(normalize_text)
    # Define the expected labels
    emotion_labels = ['FELICIDAD', 'NEUTRAL', 'DEPRESIÓN', 'ANSIEDAD', 'ESTRÉS',
                      'EMERGENCIA', 'CONFUSIÓN', 'IRA', 'MIEDO', 'SORPRESA', 'DISGUSTO']
    # Keep only the known labels
    data = data[data['label'].isin(emotion_labels)]
    # Build the label-to-id mapping
    label_to_id = {label: idx for idx, label in enumerate(emotion_labels)}
    data['label'] = data['label'].map(label_to_id)
    # Make sure no NaN values remain
    if data['label'].isna().any():
        data = data.dropna(subset=['label'])
    data['label'] = data['label'].astype(int)
    return data, emotion_labels, label_to_id

# Split the data into train and validation sets
def split_data(data):
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        data['text'], data['label'],
        test_size=0.2,
        stratify=data['label'],
        random_state=42
    )
    return train_texts, val_texts, train_labels, val_labels

# Compute balanced class weights from the training labels
def get_class_weights(labels):
    class_weights = compute_class_weight(
        class_weight='balanced',
        classes=np.unique(labels),
        y=labels
    )
    return torch.tensor(class_weights, dtype=torch.float)
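
# With class_weight='balanced', scikit-learn assigns each class c the weight
# n_samples / (n_classes * count_c), so under-represented emotions weigh
# proportionally more in the loss.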

# Tokenize the data (no padding here, since the data collator handles it)
def tokenize_data(tokenizer, texts, labels):
    dataset = Dataset.from_dict({'text': texts.tolist(), 'label': labels.tolist()})
    dataset = dataset.map(lambda batch: tokenizer(batch['text'], truncation=True, max_length=128), batched=True)
    return dataset

# Custom loss function that incorporates the class weights
# (class_weights is defined in the __main__ block below before training starts)
def custom_loss(labels, logits):
    loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)
    return loss_fct(logits, labels)

# CustomTrainer subclass that applies the custom weighted loss
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels").to(model.device)
        # Forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Compute the weighted loss
        loss = custom_loss(labels, logits)
        return (loss, outputs) if return_outputs else loss
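
# Because "labels" stays inside `inputs`, the forward pass above also computes
# the model's built-in (unweighted) loss; it is simply discarded in favor of
# the weighted one.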

# Compute evaluation metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = labels.astype(int)
    predictions = predictions.astype(int)
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='weighted')
    precision = precision_score(labels, predictions, average='weighted')
    recall = recall_score(labels, predictions, average='weighted')
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Predict the label for a given text
def predict(text):
    # Tokenize the text
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    # Run inference
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    predicted_class = torch.argmax(probs, dim=-1).item()
    label = id_to_label.get(predicted_class, "Etiqueta desconocida")
    return label
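
# Example usage (hypothetical input; the actual output depends on the trained
# weights):
#   predict("Me siento muy nervioso por el examen")  # might return 'ANSIEDAD'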

if __name__ == '__main__':
    # Configure the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"\nUsing device: {device}")

    # Path to the CSV file
    current_dir = os.path.dirname(os.path.abspath(__file__))
    input_file = os.path.join(current_dir, 'data', 'emotion_dataset.csv')

    # Step 1: Load and prepare the data
    data = load_data(input_file)
    data, emotion_labels, label_to_id = clean_and_prepare_data(data)
    id_to_label = {v: k for k, v in label_to_id.items()}

    # Step 2: Split the data
    train_texts, val_texts, train_labels, val_labels = split_data(data)

    # Step 3: Compute the class weights
    class_weights = get_class_weights(train_labels).to(device)

    # Step 4: Set up the tokenizer
    tokenizer = BertTokenizerFast.from_pretrained('dccuchile/bert-base-spanish-wwm-cased')

    # Step 5: Tokenize the data
    train_dataset = tokenize_data(tokenizer, train_texts, train_labels)
    val_dataset = tokenize_data(tokenizer, val_texts, val_labels)

    # Step 6: Set up the data collator (pads each batch dynamically to its longest sequence)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Step 7: Set up the model
    model = BertForSequenceClassification.from_pretrained(
        'dccuchile/bert-base-spanish-wwm-cased',
        num_labels=len(emotion_labels)
    )
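    # Note: the classification head on top of BETO is newly initialized here,
    # so transformers will warn about randomly initialized weights; that is
    # expected before fine-tuning.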

    # Step 8: Configure the training
    training_args = TrainingArguments(
        output_dir='./models/bert_emotion_model',
        num_train_epochs=5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        learning_rate=2e-5,
        lr_scheduler_type='linear',
        warmup_steps=500,
        eval_steps=500,
        save_steps=500,
        save_total_limit=1,
        evaluation_strategy="steps",
        save_strategy="steps",
        logging_dir='./logs',
        logging_steps=100,
        load_best_model_at_end=True,
        metric_for_best_model='eval_loss',
        report_to="none"
    )
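    # Note: load_best_model_at_end requires matching evaluation and save
    # strategies (both "steps" every 500 here); the checkpoint with the lowest
    # eval_loss is restored once training finishes.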

    # Step 9: Create the custom trainer
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        data_collator=data_collator,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    # Step 10: Train the model
    trainer.train()

    # Step 11: Save the model and the tokenizer
    trainer.save_model('./models/bert_emotion_model')
    tokenizer.save_pretrained('./models/bert_emotion_model')

    # Step 12: Save the label mappings
    with open('./models/bert_emotion_model/label_to_id.json', 'w') as f:
        json.dump(label_to_id, f)
    with open('./models/bert_emotion_model/id_to_label.json', 'w') as f:
        json.dump(id_to_label, f)
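    # JSON object keys are always strings, so id_to_label's integer keys are
    # saved as strings; convert them back when reloading, e.g.
    #   id_to_label = {int(k): v for k, v in json.load(f).items()}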
print("\nModelo entrenado y guardado exitosamente.")
# Paso 13: Probar el modelo con un ejemplo
sample_text = "Me siento muy feliz hoy"
print(f"Texto: {sample_text}")
print(f"Predicci贸n: {predict(sample_text)}")