"""Entry point: train the narrative classifier with memory-optimized settings."""

import logging
import os
import subprocess
import sys
from pathlib import Path

import torch
from transformers import set_seed

# Set environment variables for memory optimization
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# Import the necessary modules from your project
sys.path.append("./scripts")
from scripts.models.model import NarrativeClassifier
from scripts.models.dataset import NarrativeDataset
from scripts.config.config import TrainingConfig
from scripts.data_processing.data_preparation import AdvancedNarrativeProcessor
from scripts.training.trainer import NarrativeTrainer


def setup_spacy():
    """Ensure the spaCy model is installed, downloading it if necessary."""
    try:
        import spacy
        spacy.load("en_core_web_sm")
    except OSError:
        print("Downloading spaCy model...")
        # Use the current interpreter so the model installs into the active environment
        subprocess.run(
            [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
            check=True,
        )


def setup_logging():
    """Set up logging configuration."""
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
    )
    return logging.getLogger(__name__)


def main():
    # Set up logging
    logger = setup_logging()
    logger.info("Initializing training process...")

    # Set up spaCy
    setup_spacy()

    # Set random seeds for reproducibility
    set_seed(42)
    torch.manual_seed(42)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(42)

    # Clear GPU cache if available and report device info
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        logger.info(f"CUDA available. Using GPU: {torch.cuda.get_device_name(0)}")
        logger.info(
            f"Available GPU memory: "
            f"{torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB"
        )

    try:
        # Load and process the dataset
        annotations_file = "./data/subtask-2-annotations.txt"
        raw_dir = "./data/raw"

        logger.info("Loading and processing dataset...")
        processor = AdvancedNarrativeProcessor(
            annotations_file=annotations_file,
            raw_dir=raw_dir,
        )
        processed_data = processor.load_and_process_data()

        # Create datasets
        train_dataset = NarrativeDataset(processed_data['train'])
        val_dataset = NarrativeDataset(processed_data['val'])
        logger.info(
            f"Loaded dataset with {len(train_dataset)} training samples "
            f"and {len(val_dataset)} validation samples."
        )

        # Initialize model
        logger.info("Initializing the model...")
        model = NarrativeClassifier(
            num_labels=train_dataset.get_num_labels(),
            model_name="microsoft/deberta-v3-large",
        )

        # Define optimized training configuration
        config = TrainingConfig(
            output_dir=Path("./output"),
            num_epochs=5,
            batch_size=4,                   # Reduced batch size for memory
            learning_rate=2e-5,
            warmup_ratio=0.1,
            weight_decay=0.01,
            max_grad_norm=1.0,
            eval_steps=50,
            save_steps=50,
            fp16=True,                      # Enable mixed precision
            gradient_accumulation_steps=4,  # Gradient accumulation
            max_length=256,                 # Reduced sequence length
        )

        logger.info("Training configuration:")
        for key, value in vars(config).items():
            logger.info(f"  {key}: {value}")

        # Initialize trainer
        trainer = NarrativeTrainer(
            model=model,
            train_dataset=train_dataset,
            val_dataset=val_dataset,
            config=config,
        )

        # Start training
        logger.info("Starting the training process...")
        history = trainer.train()

        # Log final metrics
        logger.info("Training completed successfully!")
        logger.info("Final metrics:")
        logger.info(f"  Best validation F1: {trainer.best_val_f1:.4f}")
        logger.info(f"  Final training loss: {history['train_loss'][-1]:.4f}")

    except Exception as e:
        logger.error(f"Training failed with error: {str(e)}")
        raise
    finally:
        # Clean up GPU memory
        if torch.cuda.is_available():
            torch.cuda.empty_cache()


if __name__ == "__main__":
    main()