# File size: 8,541 Bytes

from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
import os
import logging
import numpy as np
import torch
from tqdm.auto import tqdm

# Configure root logging once for the whole script: every record is written
# both to ``training.log`` and to the console stream.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.FileHandler('training.log'), logging.StreamHandler()],
    level=logging.INFO,
)
# Module-level logger used by everything below.
logger = logging.getLogger(__name__)

# Point the Hugging Face cache at a scratch directory and make sure it exists.
# NOTE(review): the Hugging Face libraries are imported above this line and
# appear to read HF_HOME when they are imported, so this assignment may come
# too late to affect them -- confirm, and move it above the imports if so.
os.environ["HF_HOME"] = "/tmp/cache"
os.makedirs("/tmp/cache", exist_ok=True)

# The Hub access token is only ever taken from the environment; abort at
# import time when it is missing.
HF_TOKEN = os.environ.get("HF_TOKEN")
if HF_TOKEN is None:
    raise ValueError("Hugging Face access token not found. Set it in the environment as 'HF_TOKEN'")

# Checkpoint that fine-tuning starts from, and the Hub repository it is pushed to.
BASE_MODEL = "deepset/roberta-base-squad2"
MODEL_HUB_ID = "Alaaeldin/example-model"  # Replace with your Hugging Face username

class ModelTrainer:
    """Fine-tune an extractive question-answering model on a SQuAD subset.

    The pipeline (see ``train``) loads the dataset, tokenizes it into
    fixed-length features labelled with answer token positions, trains with
    the ``transformers`` ``Trainer``, runs a smoke test on the trained model
    and pushes model and tokenizer to the Hugging Face Hub.

    Attributes:
        metric: The ``squad`` metric object loaded in ``__init__``.
        tokenizer: Tokenizer for ``BASE_MODEL``; ``None`` until
            ``load_tokenizer_and_model`` has run.
        model: Question-answering model for ``BASE_MODEL``; ``None`` until
            ``load_tokenizer_and_model`` has run.
    """

    def __init__(self):
        # Kept for callers that read ``self.metric``. It is not used by
        # ``compute_metrics``: the squad metric scores answer *text* keyed by
        # example id, which is not available from raw start/end logits.
        self.metric = load_metric("squad")
        self.tokenizer = None
        self.model = None

    def load_tokenizer_and_model(self):
        """Load the tokenizer and QA model for ``BASE_MODEL`` onto ``self``.

        Returns:
            ``True`` when both objects were loaded.

        Raises:
            Exception: any loading error is logged and re-raised unchanged.
        """
        try:
            logger.info(f"Loading tokenizer and model from {BASE_MODEL}")
            self.tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
            self.model = AutoModelForQuestionAnswering.from_pretrained(BASE_MODEL)
            return True
        except Exception as e:
            logger.error(f"Error loading tokenizer and model: {e}")
            raise

    @staticmethod
    def _locate_answer_span(offsets, sequence_ids, start_char, end_char):
        """Map a character span of the context onto token indices.

        Args:
            offsets: Per-token ``(char_start, char_end)`` pairs of one feature.
            sequence_ids: Per-token sequence id of the same feature: ``1`` for
                context tokens, anything else for question/special/padding.
            start_char: Character offset in the context where the answer starts.
            end_char: Character offset in the context just past the answer.

        Returns:
            ``(start_token, end_token)``, both inclusive. ``(0, 0)`` when the
            answer is not fully contained in this feature's context window.
        """
        context_tokens = [idx for idx, seq_id in enumerate(sequence_ids) if seq_id == 1]
        if not context_tokens:
            return 0, 0
        ctx_first, ctx_last = context_tokens[0], context_tokens[-1]

        # The answer must lie entirely inside this window of the context;
        # overflowed features frequently contain only part (or none) of it.
        if offsets[ctx_first][0] > start_char or offsets[ctx_last][1] < end_char:
            return 0, 0

        # Last context token that starts at or before the answer start.
        token_start = ctx_first
        while token_start <= ctx_last and offsets[token_start][0] <= start_char:
            token_start += 1
        token_start -= 1

        # First context token that ends at or after the answer end.
        token_end = ctx_last
        while token_end >= ctx_first and offsets[token_end][1] >= end_char:
            token_end -= 1
        token_end += 1

        if token_start > token_end:
            return 0, 0
        return token_start, token_end

    def preprocess_function(self, examples):
        """Tokenize a batch of examples and label each feature with answer positions.

        Long contexts are split into overlapping features of 384 tokens
        (stride 128). Each feature receives ``start_positions`` and
        ``end_positions``; features that do not contain the whole answer (or
        whose example has no answer) are labelled ``(0, 0)``.

        Args:
            examples: Batch with ``"question"``, ``"context"`` and ``"answers"``
                columns, where each answers entry has ``"answer_start"`` and
                ``"text"`` lists.

        Returns:
            The tokenizer output with ``start_positions`` and ``end_positions``
            added.

        Raises:
            Exception: any preprocessing error is logged and re-raised unchanged.
        """
        try:
            tokenized_examples = self.tokenizer(
                examples["question"],
                examples["context"],
                # Only the context may be truncated/split; the question must
                # stay intact in every overflowed feature.
                truncation="only_second",
                max_length=384,
                stride=128,
                return_overflowing_tokens=True,
                return_offsets_mapping=True,
                padding="max_length",
            )

            sample_mapping = tokenized_examples["overflow_to_sample_mapping"]
            start_positions = []
            end_positions = []

            for i, offsets in enumerate(tokenized_examples["offset_mapping"]):
                answers = examples["answers"][sample_mapping[i]]

                # Default label for features without a usable answer.
                start_position, end_position = 0, 0

                if len(answers["answer_start"]) > 0 and len(answers["text"]) > 0:
                    start_char = answers["answer_start"][0]
                    end_char = start_char + len(answers["text"][0])
                    # Offsets of question tokens are relative to the question
                    # string and special/padding tokens carry (0, 0), so the
                    # search is restricted to context tokens via sequence ids.
                    start_position, end_position = self._locate_answer_span(
                        offsets,
                        tokenized_examples.sequence_ids(i),
                        start_char,
                        end_char,
                    )

                start_positions.append(start_position)
                end_positions.append(end_position)

            tokenized_examples["start_positions"] = start_positions
            tokenized_examples["end_positions"] = end_positions
            return tokenized_examples
        except Exception as e:
            logger.error(f"Error in preprocessing: {e}")
            raise

    def compute_metrics(self, eval_pred):
        """Compute token-position metrics from start/end logits.

        These are position-level scores, not the official text-based SQuAD
        EM/F1: the latter need answer strings and example ids, which are not
        part of ``eval_pred``.

        Args:
            eval_pred: Pair ``(predictions, labels)`` where ``predictions``
                holds ``(start_logits, end_logits)`` and ``labels`` holds
                ``(start_positions, end_positions)``.

        Returns:
            Dict with ``start_accuracy``, ``end_accuracy`` and ``exact_match``
            (both positions correct), each a float in ``[0, 1]``.
        """
        predictions, labels = eval_pred
        start_logits, end_logits = predictions[0], predictions[1]

        start_predictions = np.argmax(start_logits, axis=-1)
        end_predictions = np.argmax(end_logits, axis=-1)
        start_labels = np.asarray(labels[0])
        end_labels = np.asarray(labels[1])

        # Avoid a NaN (and a NumPy warning) on an empty evaluation set.
        if start_labels.size == 0:
            return {"start_accuracy": 0.0, "end_accuracy": 0.0, "exact_match": 0.0}

        start_hits = start_predictions == start_labels
        end_hits = end_predictions == end_labels
        return {
            "start_accuracy": float(np.mean(start_hits)),
            "end_accuracy": float(np.mean(end_hits)),
            "exact_match": float(np.mean(start_hits & end_hits)),
        }

    def validate_model_outputs(self, model, tokenizer):
        """Smoke-test the model on one fixed question/context pair.

        Args:
            model: Question-answering model to call.
            tokenizer: Tokenizer used to encode the test pair.

        Returns:
            ``True`` when the model returns tensor start/end logits.

        Raises:
            ValueError: if either logits output is not a ``torch.Tensor``.
            Exception: any other failure is logged and re-raised unchanged.
        """
        logger.info("Validating model outputs...")
        try:
            test_question = "What is the capital of France?"
            test_context = "Paris is the capital of France."

            inputs = tokenizer(
                test_question,
                test_context,
                return_tensors="pt",
                truncation=True,
                max_length=384,
                padding="max_length"
            )

            # After training the model may live on an accelerator while the
            # freshly tokenized inputs are on CPU; align them before the call.
            device = getattr(model, "device", None)
            if device is not None:
                inputs = {
                    name: value.to(device) if hasattr(value, "to") else value
                    for name, value in inputs.items()
                }

            # Inference only: disable dropout and autograd, then restore the
            # previous train/eval mode so callers see the model unchanged.
            was_training = model.training
            model.eval()
            try:
                with torch.no_grad():
                    outputs = model(**inputs)
            finally:
                model.train(was_training)

            if not (isinstance(outputs.start_logits, torch.Tensor) and
                    isinstance(outputs.end_logits, torch.Tensor)):
                raise ValueError("Model outputs validation failed")

            logger.info("Model validation successful!")
            return True
        except Exception as e:
            logger.error(f"Model validation failed: {e}")
            raise

    def train(self):
        """Run the full pipeline: load, preprocess, train, validate, push.

        Uses the first 1000 training and 100 validation examples of SQuAD,
        writes checkpoints to ``/tmp/results`` and pushes the trained model
        and tokenizer to ``MODEL_HUB_ID``.

        Raises:
            Exception: any failure is logged and re-raised unchanged.
        """
        try:
            logger.info("Starting training pipeline...")

            # Load dataset with a smaller subset
            logger.info("Loading SQuAD dataset...")
            dataset = load_dataset("squad", split={
                'train': 'train[:1000]',
                'validation': 'validation[:100]'
            })

            # Load tokenizer and model
            self.load_tokenizer_and_model()

            # Preprocess dataset; the raw columns are dropped because one
            # example can expand into several features.
            logger.info("Preprocessing dataset...")
            tokenized_dataset = dataset.map(
                self.preprocess_function,
                batched=True,
                remove_columns=dataset["train"].column_names,
                num_proc=2  # Reduced for Spaces
            )

            # Set up training arguments
            output_dir = "/tmp/results"
            os.makedirs(output_dir, exist_ok=True)

            # NOTE(review): with ~1000 training examples and an effective
            # batch of 4 * 4 = 16, one epoch is likely well under 100
            # optimizer steps, so the 100-step eval/save interval and the
            # 100 warmup steps may never be reached -- confirm intent.
            training_args = TrainingArguments(
                output_dir=output_dir,
                evaluation_strategy="steps",
                eval_steps=100,
                save_strategy="steps",
                save_steps=100,
                learning_rate=3e-5,
                per_device_train_batch_size=4,
                per_device_eval_batch_size=4,
                num_train_epochs=1,
                weight_decay=0.01,
                load_best_model_at_end=True,
                metric_for_best_model="eval_loss",
                push_to_hub=True,
                hub_model_id=MODEL_HUB_ID,
                hub_token=HF_TOKEN,
                report_to=["tensorboard"],
                logging_dir="./logs",
                logging_steps=50,
                gradient_accumulation_steps=4,
                warmup_steps=100,
            )

            # Initialize trainer
            trainer = Trainer(
                model=self.model,
                args=training_args,
                train_dataset=tokenized_dataset["train"],
                eval_dataset=tokenized_dataset["validation"],
                compute_metrics=self.compute_metrics,
            )

            # Train the model
            logger.info("Starting training...")
            trainer.train()

            # Validate model
            self.validate_model_outputs(self.model, self.tokenizer)

            # Save and push to hub; the tokenizer is pushed explicitly because
            # it was not handed to the Trainer.
            logger.info("Saving and pushing model to Hugging Face Hub...")
            trainer.save_model()
            self.model.push_to_hub(MODEL_HUB_ID, use_auth_token=HF_TOKEN)
            self.tokenizer.push_to_hub(MODEL_HUB_ID, use_auth_token=HF_TOKEN)

            logger.info("Training pipeline completed successfully!")

        except Exception as e:
            logger.error(f"Training pipeline failed: {e}")
            raise

if __name__ == "__main__":
    # Script entry point: build the trainer and run the whole pipeline.
    ModelTrainer().train()