# Spaces: S-Dreamer / CodeCraftLab -- Runtime error
# File size: 13,087 Bytes -- 5dd070e

import streamlit as st
import threading
import random
import time
from datetime import datetime
from utils import add_log, timestamp

# Handle missing dependencies
# The ML stack is optional: if any import below fails the module still loads
# and start_model_training() switches to the simulated trainer.
try:
    import torch
    import pandas as pd
    from transformers import TrainingArguments as HFTrainingArguments
    from transformers import Trainer, AutoModelForCausalLM, AutoTokenizer
    from datasets import Dataset, DatasetDict
    TRANSFORMERS_AVAILABLE = True
except ImportError:
    TRANSFORMERS_AVAILABLE = False
    HFTrainingArguments = None

    # Give every optional name that did not get imported a None placeholder so
    # later code can test `X is None` instead of dying with NameError.
    # setdefault() keeps whatever was imported before the failing line.
    for _optional_name in (
        'torch',
        'pd',
        'Trainer',
        'AutoModelForCausalLM',
        'AutoTokenizer',
        'Dataset',
        'DatasetDict',
    ):
        globals().setdefault(_optional_name, None)
    del _optional_name

    # For demo purposes
    class DummyTrainer:
        """No-op trainer stand-in that only exists when the ML imports failed."""

        def __init__(self, **kwargs):
            # Accepts and ignores any keyword arguments; `callback` is an
            # empty placeholder class created on the fly.
            self.callback = type('obj', (object,), {'__init__': lambda self: None})

        def train(self):
            """Do nothing and return None."""
            pass

def initialize_training_progress(model_id):
    """
    Create (or reset) the progress record kept for one model.

    The record lives in ``st.session_state.training_progress[model_id]``;
    the containing dict is created on first use. An existing record for the
    same ``model_id`` is overwritten.

    Args:
        model_id: Identifier for the model
    """
    state = st.session_state
    if 'training_progress' not in state:
        state.training_progress = {}

    tracker = state.training_progress
    tracker[model_id] = dict(
        status='initialized',
        current_epoch=0,
        total_epochs=0,
        loss_history=[],
        started_at=timestamp(),
        completed_at=None,
        progress=0.0,
    )

def update_training_progress(model_id, epoch=None, loss=None, status=None, progress=None, total_epochs=None):
    """
    Apply a partial update to a model's progress record.

    Only arguments that are not ``None`` are written. A record is created
    first if the model is not tracked yet.

    Args:
        model_id: Identifier for the model
        epoch: Current epoch (stored as ``current_epoch``)
        loss: Loss value appended to ``loss_history``
        status: Training status; ``'completed'`` also stamps ``completed_at``
            and sets ``progress`` to 100.0
        progress: Progress percentage (0-100); applied after ``status``, so
            it overrides the 100.0 written for ``'completed'``
        total_epochs: Total number of epochs
    """
    state = st.session_state
    already_tracked = 'training_progress' in state and model_id in state.training_progress
    if not already_tracked:
        initialize_training_progress(model_id)

    record = state.training_progress[model_id]

    if epoch is not None:
        record['current_epoch'] = epoch

    if loss is not None:
        record['loss_history'].append(loss)

    if status is not None:
        record['status'] = status
        if status == 'completed':
            record.update(completed_at=timestamp(), progress=100.0)

    # Plain overwrite fields, handled after status on purpose (see docstring).
    for field, value in (('progress', progress), ('total_epochs', total_epochs)):
        if value is not None:
            record[field] = value

def tokenize_dataset(dataset, tokenizer, max_length=512, text_column='code'):
    """
    Tokenize one text column of a dataset for model training.

    Every batch is padded to ``max_length`` and truncated to it, so all
    resulting sequences have the same length.

    Args:
        dataset: Object exposing ``map(function, batched=True)``
        tokenizer: Callable applied to a batch of texts; it receives the
            ``padding``, ``truncation`` and ``max_length`` keyword arguments
        max_length: Maximum sequence length
        text_column: Name of the column holding the text to tokenize.
            Defaults to ``'code'``, the column this function always used.

    Returns:
        Whatever ``dataset.map`` returns (the tokenized dataset).
    """
    def tokenize_function(examples):
        return tokenizer(
            examples[text_column],
            padding='max_length',
            truncation=True,
            max_length=max_length,
        )

    return dataset.map(tokenize_function, batched=True)

def _latest_logged_loss(log_history):
    """Return the newest ``'loss'`` value in a Trainer log history, or None."""
    # The last entry is not necessarily a training log (it may hold eval
    # metrics), and the history is empty until the first log is written.
    for entry in reversed(log_history):
        if 'loss' in entry:
            return entry['loss']
    return None


def train_model_thread(model_id, dataset_name, base_model_name, training_args, device, stop_event):
    """
    Thread function for training a model.

    Loads the base model and tokenizer, tokenizes the ``'train'`` and
    ``'validation'`` splits of the named dataset, runs a Hugging Face
    ``Trainer`` and, unless a stop was requested, stores the result in
    ``st.session_state.trained_models[model_id]``. Progress and failures are
    reported through ``update_training_progress`` and ``add_log``; no
    exception escapes this function.

    Args:
        model_id: Identifier for the model
        dataset_name: Name of the dataset to use (key of
            ``st.session_state.datasets``)
        base_model_name: Base model from Hugging Face
        training_args: Object with ``learning_rate``, ``batch_size`` and
            ``num_train_epochs`` attributes
        device: Device chosen by the caller (cpu/cuda). Not referenced here;
            presumably the Trainer places the model itself - TODO confirm.
        stop_event: Threading event to signal stopping
    """
    try:
        # Local imports: these names are not part of the module-level
        # optional-dependency block.
        import inspect
        from transformers import DataCollatorForLanguageModeling, TrainerCallback

        # Get dataset
        dataset = st.session_state.datasets[dataset_name]['data']

        # Initialize model and tokenizer
        add_log(f"Initializing model {base_model_name} for training")
        tokenizer = AutoTokenizer.from_pretrained(base_model_name)
        model = AutoModelForCausalLM.from_pretrained(base_model_name)

        # Padding to max_length needs a pad token; reuse EOS when none exists
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
            model.config.pad_token_id = model.config.eos_token_id

        # Tokenize dataset
        add_log(f"Tokenizing dataset {dataset_name}")
        train_dataset = tokenize_dataset(dataset['train'], tokenizer)
        val_dataset = tokenize_dataset(dataset['validation'], tokenizer)

        total_epochs = training_args.num_train_epochs
        update_training_progress(model_id, status='running', total_epochs=total_epochs)

        class ProgressCallback(TrainerCallback):
            """Mirror Trainer events into the progress record and honour stop requests."""

            def on_step_end(self, args, state, control, **kwargs):
                # Check every step so a stop request does not have to wait
                # for the end of a possibly very long epoch.
                if stop_event.is_set():
                    control.should_training_stop = True

            def on_epoch_end(self, args, state, control, **kwargs):
                if stop_event.is_set():
                    # An interrupted epoch is not reported as completed.
                    add_log(f"Training for model {model_id} was manually stopped")
                    control.should_training_stop = True
                    return

                # state.epoch is a float; round it for storage and display.
                current_epoch = int(round(state.epoch))
                epoch_loss = _latest_logged_loss(state.log_history)
                update_training_progress(
                    model_id,
                    epoch=current_epoch,
                    loss=epoch_loss,  # None leaves loss_history untouched
                    progress=(current_epoch / total_epochs) * 100
                )
                loss_text = "n/a" if epoch_loss is None else f"{epoch_loss:.4f}"
                add_log(f"Epoch {current_epoch}/{total_epochs} completed. Loss: {loss_text}")

        # Newer transformers releases renamed `evaluation_strategy` to
        # `eval_strategy`; use whichever keyword the installed version accepts.
        hf_arg_names = inspect.signature(HFTrainingArguments.__init__).parameters
        eval_strategy_key = (
            'eval_strategy' if 'eval_strategy' in hf_arg_names else 'evaluation_strategy'
        )

        # Configure training arguments
        args = HFTrainingArguments(
            output_dir=f"./results/{model_id}",
            learning_rate=training_args.learning_rate,
            per_device_train_batch_size=training_args.batch_size,
            per_device_eval_batch_size=training_args.batch_size,
            num_train_epochs=total_epochs,
            weight_decay=0.01,
            save_total_limit=1,
            **{eval_strategy_key: "epoch"},
        )

        # Same idea for Trainer's `tokenizer` keyword, which newer releases
        # replace with `processing_class`.
        trainer_arg_names = inspect.signature(Trainer.__init__).parameters
        tokenizer_key = (
            'processing_class' if 'processing_class' in trainer_arg_names else 'tokenizer'
        )

        # Initialize trainer
        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            # The tokenized data has no `labels`; this collator derives them
            # from input_ids so the causal LM can return a loss.
            data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
            callbacks=[ProgressCallback()],
            **{tokenizer_key: tokenizer},
        )

        # Train the model
        add_log(f"Starting training for model {model_id}")
        trainer.train()

        if stop_event.is_set():
            # Make sure the record does not stay 'running' after a stop.
            update_training_progress(model_id, status='stopped')
            return

        add_log(f"Training completed for model {model_id}")

        # Save to session state
        if 'trained_models' not in st.session_state:
            st.session_state.trained_models = {}
        st.session_state.trained_models[model_id] = {
            'model': model,
            'tokenizer': tokenizer,
            'info': {
                'id': model_id,
                'base_model': base_model_name,
                'dataset': dataset_name,
                'created_at': timestamp(),
                'epochs': total_epochs,
                'learning_rate': training_args.learning_rate,
                'batch_size': training_args.batch_size
            }
        }

        # Report completion only once the model is actually stored.
        update_training_progress(model_id, status='completed')

    except Exception as e:
        # Thread boundary: nothing above us can handle the error, so record it.
        add_log(f"Error during training model {model_id}: {str(e)}", "ERROR")
        update_training_progress(model_id, status='failed')

class TrainingArguments:
    """Plain holder for the three hyperparameters chosen by the caller."""

    def __init__(self, learning_rate, batch_size, num_train_epochs):
        # Stored unchanged; no validation or conversion is performed.
        self.learning_rate, self.batch_size, self.num_train_epochs = (
            learning_rate,
            batch_size,
            num_train_epochs,
        )

def start_model_training(model_id, dataset_name, base_model_name, learning_rate, batch_size, epochs):
    """
    Start model training in a separate thread.

    When the optional ML dependencies are missing, the request is handed to
    ``simulate_training`` instead and its stop event is returned.

    Args:
        model_id: Identifier for the model
        dataset_name: Name of the dataset to use
        base_model_name: Base model from Hugging Face
        learning_rate: Learning rate for training
        batch_size: Batch size for training
        epochs: Number of training epochs

    Returns:
        threading.Event: Event to signal stopping the training
    """
    # Use simulate_training instead if transformers isn't available
    if not TRANSFORMERS_AVAILABLE:
        add_log("No transformers library available, using simulation mode")
        return simulate_training(model_id, dataset_name, base_model_name, epochs)

    # Create training arguments
    training_args = TrainingArguments(
        learning_rate=learning_rate,
        batch_size=batch_size,
        num_train_epochs=epochs
    )

    # Determine device
    device = "cuda" if torch.cuda.is_available() else "cpu"
    add_log(f"Using device: {device}")

    # Initialize training progress
    initialize_training_progress(model_id)

    # Create stop event
    stop_event = threading.Event()

    training_thread = threading.Thread(
        target=train_model_thread,
        args=(model_id, dataset_name, base_model_name, training_args, device, stop_event)
    )

    # The worker reads and writes st.session_state. A thread created by user
    # code has no Streamlit script-run context, so attach the current one
    # before starting. Skipped on Streamlit versions without this helper.
    try:
        from streamlit.runtime.scriptrunner import add_script_run_ctx
    except ImportError:
        add_script_run_ctx = None
    if add_script_run_ctx is not None:
        add_script_run_ctx(training_thread)

    # Start training thread
    training_thread.start()

    return stop_event

def stop_model_training(model_id, stop_event):
    """
    Request that a training run stop.

    Does nothing when the event is already set. Otherwise the event is set
    and, if the model's progress record is currently ``'running'``, the
    record is marked ``'stopped'`` with a completion timestamp.

    Args:
        model_id: Identifier for the model
        stop_event: Threading event to signal stopping
    """
    if stop_event.is_set():
        # A stop was already requested; nothing more to do.
        return

    add_log(f"Stopping training for model {model_id}")
    stop_event.set()

    # Update training progress
    if 'training_progress' not in st.session_state:
        return

    record = st.session_state.training_progress.get(model_id)
    if record is None or record['status'] != 'running':
        return

    record['status'] = 'stopped'
    record['completed_at'] = timestamp()

def get_running_training_jobs():
    """
    List the models whose progress record has status ``'running'``.

    Returns:
        list: Model IDs with running training jobs, in the iteration order
        of ``st.session_state.training_progress``; empty when nothing is
        tracked yet
    """
    if 'training_progress' not in st.session_state:
        return []

    return [
        job_id
        for job_id, record in st.session_state.training_progress.items()
        if record['status'] == 'running'
    ]

# For demo purposes - Simulate training progress without actual model training
def simulate_training_thread(model_id, dataset_name, base_model_name, epochs, stop_event):
    """
    Simulate training progress for demonstration purposes.

    Each simulated epoch lasts two seconds and records a random loss that
    trends downward. On normal completion an entry flagged
    ``'simulated': True`` is stored in
    ``st.session_state.trained_models[model_id]``. Failures are logged and
    recorded as status ``'failed'``; no exception escapes this function.

    Args:
        model_id: Identifier for the model
        dataset_name: Name of the dataset to use
        base_model_name: Base model from Hugging Face
        epochs: Number of training epochs
        stop_event: Threading event to signal stopping
    """
    try:
        add_log(f"Starting simulated training for model {model_id}")
        update_training_progress(model_id, status='running', total_epochs=epochs)

        for epoch in range(1, epochs + 1):
            # wait() is both the simulated epoch duration and the stop check:
            # it returns True at once when a stop is (or gets) requested.
            if stop_event.wait(2):
                add_log(f"Simulated training for model {model_id} was manually stopped")
                update_training_progress(model_id, status='stopped')
                return

            # Generate random loss that decreases over time
            loss = max(0.1, 2.0 - (epoch / epochs) * 1.5 + random.uniform(-0.1, 0.1))

            # Update progress
            update_training_progress(
                model_id,
                epoch=epoch,
                loss=loss,
                progress=(epoch / epochs) * 100
            )

            add_log(f"Epoch {epoch}/{epochs} completed. Loss: {loss:.4f}")

        add_log(f"Simulated training completed for model {model_id}")

        # The untrained base model stands in for a trained one, but only when
        # the ML stack is installed. Simulation is also the fallback when it
        # is not, and then there is nothing to load.
        model = None
        tokenizer = None
        if TRANSFORMERS_AVAILABLE:
            tokenizer = AutoTokenizer.from_pretrained(base_model_name)
            model = AutoModelForCausalLM.from_pretrained(base_model_name)

        # Save to session state
        if 'trained_models' not in st.session_state:
            st.session_state.trained_models = {}
        st.session_state.trained_models[model_id] = {
            'model': model,
            'tokenizer': tokenizer,
            'info': {
                'id': model_id,
                'base_model': base_model_name,
                'dataset': dataset_name,
                'created_at': timestamp(),
                'epochs': epochs,
                'simulated': True
            }
        }

        # Report completion only once the entry above exists.
        update_training_progress(model_id, status='completed')

    except Exception as e:
        # Thread boundary: record the failure instead of dying silently.
        add_log(f"Error during simulated training for model {model_id}: {str(e)}", "ERROR")
        update_training_progress(model_id, status='failed')

def simulate_training(model_id, dataset_name, base_model_name, epochs):
    """
    Start simulated training in a separate thread.

    Args:
        model_id: Identifier for the model
        dataset_name: Name of the dataset to use
        base_model_name: Base model from Hugging Face
        epochs: Number of training epochs

    Returns:
        threading.Event: Event to signal stopping the training
    """
    # Initialize training progress
    initialize_training_progress(model_id)

    # Create stop event
    stop_event = threading.Event()

    training_thread = threading.Thread(
        target=simulate_training_thread,
        args=(model_id, dataset_name, base_model_name, epochs, stop_event)
    )

    # The worker reads and writes st.session_state. A thread created by user
    # code has no Streamlit script-run context, so attach the current one
    # before starting. Skipped on Streamlit versions without this helper.
    try:
        from streamlit.runtime.scriptrunner import add_script_run_ctx
    except ImportError:
        add_script_run_ctx = None
    if add_script_run_ctx is not None:
        add_script_run_ctx(training_thread)

    # Start training thread
    training_thread.start()

    return stop_event