import logging
import os
import subprocess
import sys
from pathlib import Path

import torch
from transformers import set_seed

# Environment tweaks for GPU memory behavior and tokenizer fork warnings
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
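# max_split_size_mb:128 caps the size of blocks the CUDA caching allocator
# will split, which mitigates fragmentation-related OOMs on long runs;
# TOKENIZERS_PARALLELISM=false silences the HF tokenizers warning emitted
# when the process forks (e.g. in DataLoader workers).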

# Make the project root importable so the `scripts` package resolves;
# like the relative data paths below, this assumes the script is run
# from the project root.
sys.path.append(".")
from scripts.models.model import NarrativeClassifier
from scripts.models.dataset import NarrativeDataset
from scripts.config.config import TrainingConfig
from scripts.data_processing.data_preparation import AdvancedNarrativeProcessor
from scripts.training.trainer import NarrativeTrainer

def setup_spacy():
    """Ensure spaCy model is installed"""
    try:
        import spacy
        spacy.load("en_core_web_sm")
    except OSError:
        print("Downloading spaCy model...")
        subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"], check=True)
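        # Note: this download needs network access on first run; pre-installing
        # en_core_web_sm in the environment avoids the runtime dependency.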

def setup_logging():
    """Setup logging configuration"""
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )
    return logging.getLogger(__name__)

def main():
    # Set up logging
    logger = setup_logging()
    logger.info("Initializing training process...")
    
    # Setup spaCy
    setup_spacy()
    
    # Set random seeds for reproducibility. transformers' set_seed already
    # seeds Python, NumPy, and torch (CPU and CUDA), so the explicit torch
    # calls below are belt-and-braces.
    set_seed(42)
    torch.manual_seed(42)

    # Clear the GPU cache and log device info if CUDA is available
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(42)
        torch.cuda.empty_cache()
        logger.info(f"CUDA available. Using GPU: {torch.cuda.get_device_name(0)}")
        logger.info(f"Total GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
    
    try:
        # Load and process the dataset
        annotations_file = "./data/subtask-2-annotations.txt"
        raw_dir = "./data/raw"
        logger.info("Loading and processing dataset...")
        
        processor = AdvancedNarrativeProcessor(
            annotations_file=annotations_file,
            raw_dir=raw_dir
        )
        processed_data = processor.load_and_process_data()
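        # processed_data is a dict keyed by split name ('train' / 'val'),
        # consumed below to build the datasets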
        
        # Create datasets
        train_dataset = NarrativeDataset(processed_data['train'])
        val_dataset = NarrativeDataset(processed_data['val'])
        logger.info(f"Loaded dataset with {len(train_dataset)} training samples and {len(val_dataset)} validation samples.")
        
        # Initialize model
        logger.info("Initializing the model...")
        model = NarrativeClassifier(
            num_labels=train_dataset.get_num_labels(),
            model_name="microsoft/deberta-v3-large"
        )
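        # deberta-v3-large is a large encoder (roughly 400M parameters), hence
        # the memory-saving settings in the config below (fp16, small batches,
        # gradient accumulation, shorter sequences).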
        
        # Define optimized training configuration
        config = TrainingConfig(
            output_dir=Path("./output"),
            num_epochs=5,
            batch_size=4,  # Reduced batch size for memory
            learning_rate=2e-5,
            warmup_ratio=0.1,
            weight_decay=0.01,
            max_grad_norm=1.0,
            eval_steps=50,
            save_steps=50,
            fp16=True,  # Enable mixed precision
            gradient_accumulation_steps=4,  # Gradient accumulation
            max_length=256  # Reduced sequence length
        )
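        # Effective batch size per optimizer step: 4 (batch) x 4 (accumulation) = 16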
        logger.info("Training configuration:")
        for key, value in vars(config).items():
            logger.info(f"  {key}: {value}")
        
        # Initialize trainer
        trainer = NarrativeTrainer(
            model=model,
            train_dataset=train_dataset,
            val_dataset=val_dataset,
            config=config
        )
        
        # Start training
        logger.info("Starting the training process...")
        history = trainer.train()
        
        # Log final metrics
        logger.info("Training completed successfully!")
        logger.info("Final metrics:")
        logger.info(f"  Best validation F1: {trainer.best_val_f1:.4f}")
        logger.info(f"  Final training loss: {history['train_loss'][-1]:.4f}")
        
    except Exception as e:
        logger.error(f"Training failed with error: {str(e)}")
        raise
    finally:
        # Clean up
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

if __name__ == "__main__":
    main()