import streamlit as st
import threading
import random
import time
from datetime import datetime

from utils import add_log, timestamp

# Handle missing dependencies: fall back to simulation mode when the
# transformers stack is not installed
try:
    import torch
    import pandas as pd
    from transformers import TrainingArguments as HFTrainingArguments
    from transformers import (
        Trainer,
        TrainerCallback,
        AutoModelForCausalLM,
        AutoTokenizer,
        DataCollatorForLanguageModeling,
    )
    from datasets import Dataset, DatasetDict
    TRANSFORMERS_AVAILABLE = True
except ImportError:
    TRANSFORMERS_AVAILABLE = False
    # Placeholders so the module-level names exist; only the simulation
    # path (which never touches them) runs in this mode
    HFTrainingArguments = None
    Trainer = TrainerCallback = None
    AutoModelForCausalLM = AutoTokenizer = None
    DataCollatorForLanguageModeling = None

def initialize_training_progress(model_id):
    """
    Initialize training progress tracking for a model.

    Args:
        model_id: Identifier for the model
    """
    if 'training_progress' not in st.session_state:
        st.session_state.training_progress = {}

    st.session_state.training_progress[model_id] = {
        'status': 'initialized',
        'current_epoch': 0,
        'total_epochs': 0,
        'loss_history': [],
        'started_at': timestamp(),
        'completed_at': None,
        'progress': 0.0
    }
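
# Example usage (illustrative): progress records live in st.session_state and
# can be read on any Streamlit rerun:
#
#     initialize_training_progress("my-model")
#     st.session_state.training_progress["my-model"]["status"]  # 'initialized'
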
def update_training_progress(model_id, epoch=None, loss=None, status=None, progress=None, total_epochs=None):
    """
    Update training progress for a model.

    Args:
        model_id: Identifier for the model
        epoch: Current epoch
        loss: Current loss value
        status: Training status
        progress: Progress percentage (0-100)
        total_epochs: Total number of epochs
    """
    if 'training_progress' not in st.session_state or model_id not in st.session_state.training_progress:
        initialize_training_progress(model_id)

    progress_data = st.session_state.training_progress[model_id]

    if epoch is not None:
        progress_data['current_epoch'] = epoch
    if loss is not None:
        progress_data['loss_history'].append(loss)
    if status is not None:
        progress_data['status'] = status
        if status == 'completed':
            progress_data['completed_at'] = timestamp()
            progress_data['progress'] = 100.0
    if progress is not None:
        progress_data['progress'] = progress
    if total_epochs is not None:
        progress_data['total_epochs'] = total_epochs
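
# Example usage (illustrative): partial updates are fine; fields that are not
# passed keep their previous values:
#
#     update_training_progress("my-model", epoch=2, loss=0.42, progress=40.0)
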
def tokenize_dataset(dataset, tokenizer, max_length=512):
    """
    Tokenize a dataset for model training.

    Args:
        dataset: The dataset to tokenize; expected to have a 'code' text column
        tokenizer: The tokenizer to use
        max_length: Maximum sequence length

    Returns:
        Dataset: Tokenized dataset
    """
    def tokenize_function(examples):
        return tokenizer(examples['code'], padding='max_length', truncation=True, max_length=max_length)

    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    return tokenized_dataset
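
# Example usage (illustrative sketch; the "distilgpt2" checkpoint and the tiny
# in-memory dataset are assumptions, not part of this app):
#
#     from datasets import Dataset
#     tok = AutoTokenizer.from_pretrained("distilgpt2")
#     tok.pad_token = tok.eos_token  # GPT-style tokenizers have no pad token
#     ds = Dataset.from_dict({"code": ["print('hi')", "x = 1 + 2"]})
#     tokenized = tokenize_dataset(ds, tok, max_length=32)
#     # tokenized gains 'input_ids' and 'attention_mask' columns
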
def train_model_thread(model_id, dataset_name, base_model_name, training_args, device, stop_event):
    """
    Thread function for training a model.

    Args:
        model_id: Identifier for the model
        dataset_name: Name of the dataset to use
        base_model_name: Base model from Hugging Face
        training_args: Training arguments
        device: Device to use for training (cpu/cuda); informational, since
            the Trainer handles actual device placement itself
        stop_event: Threading event to signal stopping

    Note: writes to st.session_state from a worker thread assume the default
    single-session Streamlit setup; session state is not thread-safe.
    """
    try:
        # Get dataset
        dataset = st.session_state.datasets[dataset_name]['data']

        # Initialize model and tokenizer
        add_log(f"Initializing model {base_model_name} for training")
        tokenizer = AutoTokenizer.from_pretrained(base_model_name)
        model = AutoModelForCausalLM.from_pretrained(base_model_name)

        # Ensure the tokenizer has a padding token (GPT-style models lack one)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
            model.config.pad_token_id = model.config.eos_token_id

        # Tokenize dataset
        add_log(f"Tokenizing dataset {dataset_name}")
        train_dataset = tokenize_dataset(dataset['train'], tokenizer)
        val_dataset = tokenize_dataset(dataset['validation'], tokenizer)

        # Update training progress
        update_training_progress(
            model_id,
            status='running',
            total_epochs=training_args.num_train_epochs
        )

        # Define a custom callback to track progress
        class CustomCallback(TrainerCallback):
            def on_epoch_end(self, args, state, control, **kwargs):
                current_epoch = int(state.epoch)
                epoch_loss = state.log_history[-1].get('loss', 0) if state.log_history else 0
                update_training_progress(
                    model_id,
                    epoch=current_epoch,
                    loss=epoch_loss,
                    progress=(current_epoch / training_args.num_train_epochs) * 100
                )
                add_log(f"Epoch {current_epoch}/{training_args.num_train_epochs} completed. Loss: {epoch_loss:.4f}")

                # Check if training should be stopped
                if stop_event.is_set():
                    add_log(f"Training for model {model_id} was manually stopped")
                    control.should_training_stop = True

        # Configure training arguments
        args = HFTrainingArguments(
            output_dir=f"./results/{model_id}",
            evaluation_strategy="epoch",  # renamed to eval_strategy in newer transformers releases
            learning_rate=training_args.learning_rate,
            per_device_train_batch_size=training_args.batch_size,
            per_device_eval_batch_size=training_args.batch_size,
            num_train_epochs=training_args.num_train_epochs,
            weight_decay=0.01,
            save_total_limit=1,
        )

        # Collator that copies input_ids into labels for causal LM training;
        # without it the Trainer has no loss to optimize
        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

        # Initialize trainer
        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
            data_collator=data_collator,
            callbacks=[CustomCallback()]
        )

        # Train the model
        add_log(f"Starting training for model {model_id}")
        trainer.train()

        # Save the model
        if not stop_event.is_set():
            add_log(f"Training completed for model {model_id}")
            update_training_progress(model_id, status='completed')

            # Save to session state
            if 'trained_models' not in st.session_state:
                st.session_state.trained_models = {}
            st.session_state.trained_models[model_id] = {
                'model': model,
                'tokenizer': tokenizer,
                'info': {
                    'id': model_id,
                    'base_model': base_model_name,
                    'dataset': dataset_name,
                    'created_at': timestamp(),
                    'epochs': training_args.num_train_epochs,
                    'learning_rate': training_args.learning_rate,
                    'batch_size': training_args.batch_size
                }
            }
    except Exception as e:
        add_log(f"Error during training model {model_id}: {str(e)}", "ERROR")
        update_training_progress(model_id, status='failed')
class TrainingArguments:
    """Lightweight hyperparameter container; distinct from the Hugging Face
    TrainingArguments, which is imported above as HFTrainingArguments."""

    def __init__(self, learning_rate, batch_size, num_train_epochs):
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.num_train_epochs = num_train_epochs
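
# Example usage (illustrative):
#
#     hp = TrainingArguments(learning_rate=5e-5, batch_size=8, num_train_epochs=3)
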
def start_model_training(model_id, dataset_name, base_model_name, learning_rate, batch_size, epochs):
    """
    Start model training in a separate thread.

    Args:
        model_id: Identifier for the model
        dataset_name: Name of the dataset to use
        base_model_name: Base model from Hugging Face
        learning_rate: Learning rate for training
        batch_size: Batch size for training
        epochs: Number of training epochs

    Returns:
        threading.Event: Event to signal stopping the training
    """
    # Fall back to simulate_training if transformers isn't available
    if not TRANSFORMERS_AVAILABLE:
        add_log("Transformers library not available, using simulation mode")
        return simulate_training(model_id, dataset_name, base_model_name, epochs)

    # Create training arguments
    training_args = TrainingArguments(
        learning_rate=learning_rate,
        batch_size=batch_size,
        num_train_epochs=epochs
    )

    # Determine device
    device = "cuda" if torch.cuda.is_available() else "cpu"
    add_log(f"Using device: {device}")

    # Initialize training progress
    initialize_training_progress(model_id)

    # Create stop event
    stop_event = threading.Event()

    # Start training thread
    training_thread = threading.Thread(
        target=train_model_thread,
        args=(model_id, dataset_name, base_model_name, training_args, device, stop_event)
    )
    training_thread.start()

    return stop_event
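
# Example usage (illustrative, e.g. from a Streamlit button handler; the model,
# dataset, and checkpoint names are assumptions):
#
#     if 'stop_events' not in st.session_state:
#         st.session_state.stop_events = {}
#     if st.button("Train"):
#         st.session_state.stop_events["my-model"] = start_model_training(
#             "my-model", "my-dataset", "distilgpt2",
#             learning_rate=5e-5, batch_size=8, epochs=3,
#         )
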
def stop_model_training(model_id, stop_event):
    """
    Stop model training.

    Args:
        model_id: Identifier for the model
        stop_event: Threading event to signal stopping
    """
    if stop_event.is_set():
        return

    add_log(f"Stopping training for model {model_id}")
    stop_event.set()

    # Update training progress
    if 'training_progress' in st.session_state and model_id in st.session_state.training_progress:
        progress_data = st.session_state.training_progress[model_id]
        if progress_data['status'] == 'running':
            progress_data['status'] = 'stopped'
            progress_data['completed_at'] = timestamp()
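
# Example usage (illustrative, assuming the stop event was kept in session
# state as sketched above):
#
#     stop_model_training("my-model", st.session_state.stop_events["my-model"])
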
def get_running_training_jobs():
    """
    Get list of currently running training jobs.

    Returns:
        list: List of model IDs with running training jobs
    """
    running_jobs = []
    if 'training_progress' in st.session_state:
        for model_id, progress in st.session_state.training_progress.items():
            if progress['status'] == 'running':
                running_jobs.append(model_id)
    return running_jobs
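
# Example usage (illustrative): poll running jobs on each rerun to drive the UI:
#
#     for mid in get_running_training_jobs():
#         st.progress(st.session_state.training_progress[mid]['progress'] / 100)
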
# For demo purposes - simulate training progress without actual model training
def simulate_training_thread(model_id, dataset_name, base_model_name, epochs, stop_event):
    """
    Simulate training progress for demonstration purposes.

    Args:
        model_id: Identifier for the model
        dataset_name: Name of the dataset to use
        base_model_name: Base model from Hugging Face
        epochs: Number of training epochs
        stop_event: Threading event to signal stopping
    """
    add_log(f"Starting simulated training for model {model_id}")
    update_training_progress(model_id, status='running', total_epochs=epochs)

    for epoch in range(1, epochs + 1):
        if stop_event.is_set():
            add_log(f"Simulated training for model {model_id} was manually stopped")
            update_training_progress(model_id, status='stopped')
            return

        # Simulate epoch time
        time.sleep(2)

        # Generate a random loss that decreases over time
        loss = max(0.1, 2.0 - (epoch / epochs) * 1.5 + random.uniform(-0.1, 0.1))

        # Update progress
        update_training_progress(
            model_id,
            epoch=epoch,
            loss=loss,
            progress=(epoch / epochs) * 100
        )
        add_log(f"Epoch {epoch}/{epochs} completed. Loss: {loss:.4f}")

    # Training completed
    add_log(f"Simulated training completed for model {model_id}")
    update_training_progress(model_id, status='completed')

    # Load a real model/tokenizer only when transformers is installed;
    # simulation mode must also work without it
    if TRANSFORMERS_AVAILABLE:
        tokenizer = AutoTokenizer.from_pretrained(base_model_name)
        model = AutoModelForCausalLM.from_pretrained(base_model_name)
    else:
        tokenizer = None
        model = None

    # Save to session state
    if 'trained_models' not in st.session_state:
        st.session_state.trained_models = {}
    st.session_state.trained_models[model_id] = {
        'model': model,
        'tokenizer': tokenizer,
        'info': {
            'id': model_id,
            'base_model': base_model_name,
            'dataset': dataset_name,
            'created_at': timestamp(),
            'epochs': epochs,
            'simulated': True
        }
    }
def simulate_training(model_id, dataset_name, base_model_name, epochs):
    """
    Start simulated training in a separate thread.

    Args:
        model_id: Identifier for the model
        dataset_name: Name of the dataset to use
        base_model_name: Base model from Hugging Face
        epochs: Number of training epochs

    Returns:
        threading.Event: Event to signal stopping the training
    """
    # Initialize training progress
    initialize_training_progress(model_id)

    # Create stop event
    stop_event = threading.Event()

    # Start training thread
    training_thread = threading.Thread(
        target=simulate_training_thread,
        args=(model_id, dataset_name, base_model_name, epochs, stop_event)
    )
    training_thread.start()

    return stop_event
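
# Example usage (illustrative): the simulation path needs no torch/transformers
# install, so it is also handy for quick UI testing:
#
#     stop = simulate_training("demo-model", "demo-dataset", "distilgpt2", epochs=3)
#     # ... later, to cancel early:
#     stop.set()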