import streamlit as st
import threading
import random
import time
from datetime import datetime

from utils import add_log, timestamp

# Handle missing dependencies gracefully: when the ML stack is not
# installed, the app falls back to simulation mode (see simulate_training).
try:
    import torch
    import pandas as pd
    from transformers import TrainingArguments as HFTrainingArguments
    from transformers import (
        Trainer,
        TrainerCallback,
        AutoModelForCausalLM,
        AutoTokenizer,
    )
    from datasets import Dataset, DatasetDict
    TRANSFORMERS_AVAILABLE = True
except ImportError:
    TRANSFORMERS_AVAILABLE = False
    HFTrainingArguments = None


def initialize_training_progress(model_id):
    """
    Initialize training progress tracking for a model.

    Args:
        model_id: Identifier for the model
    """
    if 'training_progress' not in st.session_state:
        st.session_state.training_progress = {}

    st.session_state.training_progress[model_id] = {
        'status': 'initialized',
        'current_epoch': 0,
        'total_epochs': 0,
        'loss_history': [],
        'started_at': timestamp(),
        'completed_at': None,
        'progress': 0.0
    }


def update_training_progress(model_id, epoch=None, loss=None, status=None,
                             progress=None, total_epochs=None):
    """
    Update training progress for a model.

    Args:
        model_id: Identifier for the model
        epoch: Current epoch
        loss: Current loss value
        status: Training status
        progress: Progress percentage (0-100)
        total_epochs: Total number of epochs
    """
    if ('training_progress' not in st.session_state
            or model_id not in st.session_state.training_progress):
        initialize_training_progress(model_id)

    progress_data = st.session_state.training_progress[model_id]

    if epoch is not None:
        progress_data['current_epoch'] = epoch
    if loss is not None:
        progress_data['loss_history'].append(loss)
    if status is not None:
        progress_data['status'] = status
        if status == 'completed':
            progress_data['completed_at'] = timestamp()
            progress_data['progress'] = 100.0
    if progress is not None:
        progress_data['progress'] = progress
    if total_epochs is not None:
        progress_data['total_epochs'] = total_epochs


def tokenize_dataset(dataset, tokenizer, max_length=512):
    """
    Tokenize a dataset for model training.

    Args:
        dataset: The dataset to tokenize
        tokenizer: The tokenizer to use
        max_length: Maximum sequence length

    Returns:
        Dataset: Tokenized dataset
    """
    def tokenize_function(examples):
        return tokenizer(examples['code'], padding='max_length',
                         truncation=True, max_length=max_length)

    return dataset.map(tokenize_function, batched=True)
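
# Illustrative sketch (not called anywhere in the app): how tokenize_dataset
# is meant to be used with a tiny in-memory dataset. The 'code' column name
# matches what tokenize_function expects; "sshleifer/tiny-gpt2" is an assumed
# small model chosen only to keep the example cheap to download.
def _example_tokenize_usage():
    if not TRANSFORMERS_AVAILABLE:
        return None
    sample = Dataset.from_dict({'code': ["print('hello')", "x = 1 + 2"]})
    tok = AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2")
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token  # GPT-2 tokenizers ship without a pad token
    return tokenize_dataset(sample, tok, max_length=32)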

def train_model_thread(model_id, dataset_name, base_model_name, training_args,
                       device, stop_event):
    """
    Thread function for training a model.

    Args:
        model_id: Identifier for the model
        dataset_name: Name of the dataset to use
        base_model_name: Base model from Hugging Face
        training_args: Training arguments
        device: Device to use for training (cpu/cuda)
        stop_event: Threading event to signal stopping
    """
    try:
        # Get dataset
        dataset = st.session_state.datasets[dataset_name]['data']

        # Initialize model and tokenizer
        add_log(f"Initializing model {base_model_name} for training")
        tokenizer = AutoTokenizer.from_pretrained(base_model_name)
        model = AutoModelForCausalLM.from_pretrained(base_model_name)
        model.to(device)

        # Ensure the tokenizer has a padding token (GPT-style models lack one)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
            model.config.pad_token_id = model.config.eos_token_id

        # Tokenize dataset
        add_log(f"Tokenizing dataset {dataset_name}")
        train_dataset = tokenize_dataset(dataset['train'], tokenizer)
        val_dataset = tokenize_dataset(dataset['validation'], tokenizer)

        # Update training progress
        update_training_progress(
            model_id,
            status='running',
            total_epochs=training_args.num_train_epochs
        )

        # Custom callback to track progress and honor stop requests
        class CustomCallback(TrainerCallback):
            def on_epoch_end(self, args, state, control, **kwargs):
                current_epoch = int(state.epoch)
                epoch_loss = (state.log_history[-1].get('loss', 0)
                              if state.log_history else 0)
                update_training_progress(
                    model_id,
                    epoch=current_epoch,
                    loss=epoch_loss,
                    progress=(current_epoch / training_args.num_train_epochs) * 100
                )
                add_log(f"Epoch {current_epoch}/{training_args.num_train_epochs} "
                        f"completed. Loss: {epoch_loss:.4f}")

                # Check if training should be stopped
                if stop_event.is_set():
                    add_log(f"Training for model {model_id} was manually stopped")
                    control.should_training_stop = True

        # Configure training arguments
        args = HFTrainingArguments(
            output_dir=f"./results/{model_id}",
            evaluation_strategy="epoch",
            learning_rate=training_args.learning_rate,
            per_device_train_batch_size=training_args.batch_size,
            per_device_eval_batch_size=training_args.batch_size,
            num_train_epochs=training_args.num_train_epochs,
            weight_decay=0.01,
            save_total_limit=1,
        )

        # Initialize trainer
        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
            callbacks=[CustomCallback]
        )

        # Train the model
        add_log(f"Starting training for model {model_id}")
        trainer.train()

        # Save the model unless training was stopped early
        if not stop_event.is_set():
            add_log(f"Training completed for model {model_id}")
            update_training_progress(model_id, status='completed')

            # Save to session state
            if 'trained_models' not in st.session_state:
                st.session_state.trained_models = {}
            st.session_state.trained_models[model_id] = {
                'model': model,
                'tokenizer': tokenizer,
                'info': {
                    'id': model_id,
                    'base_model': base_model_name,
                    'dataset': dataset_name,
                    'created_at': timestamp(),
                    'epochs': training_args.num_train_epochs,
                    'learning_rate': training_args.learning_rate,
                    'batch_size': training_args.batch_size
                }
            }
    except Exception as e:
        add_log(f"Error during training model {model_id}: {str(e)}", "ERROR")
        update_training_progress(model_id, status='failed')


class TrainingArguments:
    """Lightweight container for the hyperparameters this module uses."""

    def __init__(self, learning_rate, batch_size, num_train_epochs):
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.num_train_epochs = num_train_epochs
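
# Usage sketch: this container mirrors only the fields this module needs and
# is translated into a full transformers.TrainingArguments inside
# train_model_thread. The values below are illustrative, not defaults this
# codebase defines:
#
#     args = TrainingArguments(learning_rate=5e-5, batch_size=8,
#                              num_train_epochs=3)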

def start_model_training(model_id, dataset_name, base_model_name,
                         learning_rate, batch_size, epochs):
    """
    Start model training in a separate thread.

    Args:
        model_id: Identifier for the model
        dataset_name: Name of the dataset to use
        base_model_name: Base model from Hugging Face
        learning_rate: Learning rate for training
        batch_size: Batch size for training
        epochs: Number of training epochs

    Returns:
        threading.Event: Event to signal stopping the training
    """
    # Fall back to simulation mode if transformers isn't available
    if not TRANSFORMERS_AVAILABLE:
        add_log("Transformers library not available, using simulation mode")
        return simulate_training(model_id, dataset_name, base_model_name, epochs)

    # Create training arguments
    training_args = TrainingArguments(
        learning_rate=learning_rate,
        batch_size=batch_size,
        num_train_epochs=epochs
    )

    # Determine device
    device = "cuda" if torch.cuda.is_available() else "cpu"
    add_log(f"Using device: {device}")

    # Initialize training progress
    initialize_training_progress(model_id)

    # Create stop event
    stop_event = threading.Event()

    # Start training thread. Note: a raw thread touching st.session_state may
    # need Streamlit's script-run context attached (see
    # streamlit.runtime.scriptrunner.add_script_run_ctx).
    training_thread = threading.Thread(
        target=train_model_thread,
        args=(model_id, dataset_name, base_model_name, training_args,
              device, stop_event)
    )
    training_thread.start()

    return stop_event


def stop_model_training(model_id, stop_event):
    """
    Stop model training.

    Args:
        model_id: Identifier for the model
        stop_event: Threading event to signal stopping
    """
    if stop_event.is_set():
        return

    add_log(f"Stopping training for model {model_id}")
    stop_event.set()

    # Update training progress
    if ('training_progress' in st.session_state
            and model_id in st.session_state.training_progress):
        progress_data = st.session_state.training_progress[model_id]
        if progress_data['status'] == 'running':
            progress_data['status'] = 'stopped'
            progress_data['completed_at'] = timestamp()


def get_running_training_jobs():
    """
    Get list of currently running training jobs.

    Returns:
        list: List of model IDs with running training jobs
    """
    running_jobs = []
    if 'training_progress' in st.session_state:
        for model_id, progress in st.session_state.training_progress.items():
            if progress['status'] == 'running':
                running_jobs.append(model_id)
    return running_jobs
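
# Illustrative sketch of how a Streamlit page might surface this progress
# data (st.write and st.progress are standard Streamlit APIs; the layout
# itself is an assumption, not part of this module):
def _example_render_progress():
    for model_id in get_running_training_jobs():
        data = st.session_state.training_progress[model_id]
        st.write(f"{model_id}: epoch {data['current_epoch']}/{data['total_epochs']}")
        st.progress(int(data['progress']))  # st.progress accepts 0-100 ints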

# For demo purposes - simulate training progress without actual model training
def simulate_training_thread(model_id, dataset_name, base_model_name, epochs,
                             stop_event):
    """
    Simulate training progress for demonstration purposes.

    Args:
        model_id: Identifier for the model
        dataset_name: Name of the dataset to use
        base_model_name: Base model from Hugging Face
        epochs: Number of training epochs
        stop_event: Threading event to signal stopping
    """
    add_log(f"Starting simulated training for model {model_id}")
    update_training_progress(model_id, status='running', total_epochs=epochs)

    for epoch in range(1, epochs + 1):
        if stop_event.is_set():
            add_log(f"Simulated training for model {model_id} was manually stopped")
            update_training_progress(model_id, status='stopped')
            return

        # Simulate epoch time
        time.sleep(2)

        # Generate a random loss that trends downward over the epochs
        loss = max(0.1, 2.0 - (epoch / epochs) * 1.5 + random.uniform(-0.1, 0.1))

        # Update progress
        update_training_progress(
            model_id,
            epoch=epoch,
            loss=loss,
            progress=(epoch / epochs) * 100
        )
        add_log(f"Epoch {epoch}/{epochs} completed. Loss: {loss:.4f}")

    # Training completed
    add_log(f"Simulated training completed for model {model_id}")
    update_training_progress(model_id, status='completed')

    # Load a real model/tokenizer only when the ML stack is installed;
    # simulation mode is usually entered precisely because it is not.
    model, tokenizer = None, None
    if TRANSFORMERS_AVAILABLE:
        tokenizer = AutoTokenizer.from_pretrained(base_model_name)
        model = AutoModelForCausalLM.from_pretrained(base_model_name)

    # Save to session state
    if 'trained_models' not in st.session_state:
        st.session_state.trained_models = {}
    st.session_state.trained_models[model_id] = {
        'model': model,
        'tokenizer': tokenizer,
        'info': {
            'id': model_id,
            'base_model': base_model_name,
            'dataset': dataset_name,
            'created_at': timestamp(),
            'epochs': epochs,
            'simulated': True
        }
    }


def simulate_training(model_id, dataset_name, base_model_name, epochs):
    """
    Start simulated training in a separate thread.

    Args:
        model_id: Identifier for the model
        dataset_name: Name of the dataset to use
        base_model_name: Base model from Hugging Face
        epochs: Number of training epochs

    Returns:
        threading.Event: Event to signal stopping the training
    """
    # Initialize training progress
    initialize_training_progress(model_id)

    # Create stop event
    stop_event = threading.Event()

    # Start training thread
    training_thread = threading.Thread(
        target=simulate_training_thread,
        args=(model_id, dataset_name, base_model_name, epochs, stop_event)
    )
    training_thread.start()

    return stop_event
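
# End-to-end usage sketch (runs inside a Streamlit script, where
# st.session_state exists; the model ID, dataset name, and "distilgpt2"
# are placeholders, not values this module defines):
#
#     stop_event = start_model_training(
#         model_id="demo-model",
#         dataset_name="my-dataset",
#         base_model_name="distilgpt2",
#         learning_rate=5e-5,
#         batch_size=8,
#         epochs=3,
#     )
#     # ... later, e.g. from a "Stop" button handler:
#     stop_model_training("demo-model", stop_event)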