Mohammaderfan koupaei committed on
Commit
b5e09fa
·
1 Parent(s): af37d71
Files changed (1) hide show
  1. app.py +55 -42
app.py CHANGED
@@ -4,6 +4,7 @@ from pathlib import Path
4
  import os
5
  import torch
6
  from transformers import set_seed
 
7
 
8
  # Set environment variables for memory optimization
9
  os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
@@ -17,6 +18,15 @@ from scripts.config.config import TrainingConfig
17
  from scripts.data_processing.data_preparation import AdvancedNarrativeProcessor
18
  from scripts.training.trainer import NarrativeTrainer
19
 
 
 
 
 
 
 
 
 
 
20
  def setup_logging():
21
  """Setup logging configuration"""
22
  logging.basicConfig(
@@ -31,6 +41,9 @@ def main():
31
  logger = setup_logging()
32
  logger.info("Initializing training process...")
33
 
 
 
 
34
  # Set random seeds for reproducibility
35
  set_seed(42)
36
  torch.manual_seed(42)
@@ -43,49 +56,49 @@ def main():
43
  logger.info(f"CUDA available. Using GPU: {torch.cuda.get_device_name(0)}")
44
  logger.info(f"Available GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
45
 
46
- # Load and process the dataset
47
- annotations_file = "./data/subtask-2-annotations.txt"
48
- raw_dir = "./data/raw"
49
- logger.info("Loading and processing dataset...")
50
-
51
- processor = AdvancedNarrativeProcessor(
52
- annotations_file=annotations_file,
53
- raw_dir=raw_dir
54
- )
55
- processed_data = processor.load_and_process_data()
56
-
57
- # Create datasets
58
- train_dataset = NarrativeDataset(processed_data['train'])
59
- val_dataset = NarrativeDataset(processed_data['val'])
60
- logger.info(f"Loaded dataset with {len(train_dataset)} training samples and {len(val_dataset)} validation samples.")
61
-
62
- # Initialize model
63
- logger.info("Initializing the model...")
64
- model = NarrativeClassifier(
65
- num_labels=train_dataset.get_num_labels(),
66
- model_name="microsoft/deberta-v3-large"
67
- )
68
-
69
- # Define optimized training configuration
70
- config = TrainingConfig(
71
- output_dir=Path("./output"),
72
- num_epochs=5,
73
- batch_size=4, # Reduced batch size for memory
74
- learning_rate=2e-5,
75
- warmup_ratio=0.1,
76
- weight_decay=0.01,
77
- max_grad_norm=1.0,
78
- eval_steps=50,
79
- save_steps=50,
80
- fp16=True, # Enable mixed precision
81
- gradient_accumulation_steps=4, # Gradient accumulation
82
- max_length=256 # Reduced sequence length
83
- )
84
- logger.info("Training configuration:")
85
- for key, value in vars(config).items():
86
- logger.info(f" {key}: {value}")
87
-
88
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  # Initialize trainer
90
  trainer = NarrativeTrainer(
91
  model=model,
 
4
  import os
5
  import torch
6
  from transformers import set_seed
7
+ import subprocess
8
 
9
  # Set environment variables for memory optimization
10
  os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
 
18
  from scripts.data_processing.data_preparation import AdvancedNarrativeProcessor
19
  from scripts.training.trainer import NarrativeTrainer
20
 
21
def setup_spacy():
    """Ensure the spaCy ``en_core_web_sm`` model is installed.

    Tries to load the model. If loading raises ``OSError`` (the error this
    function treats as "model not installed"), the model is downloaded by
    running ``-m spacy download en_core_web_sm`` in a child process.

    Raises:
        subprocess.CalledProcessError: If the download command exits with a
            non-zero status (``check=True``).
    """
    import sys  # local import: only needed to locate the running interpreter

    try:
        import spacy
        spacy.load("en_core_web_sm")
    except OSError:
        print("Downloading spaCy model...")
        # Use sys.executable instead of a bare "python" so the model is
        # installed into the same interpreter/virtualenv that is running this
        # script, and so the call still works where only "python3" is on PATH.
        subprocess.run(
            [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
            check=True,
        )
29
+
30
  def setup_logging():
31
  """Setup logging configuration"""
32
  logging.basicConfig(
 
41
  logger = setup_logging()
42
  logger.info("Initializing training process...")
43
 
44
+ # Setup spaCy
45
+ setup_spacy()
46
+
47
  # Set random seeds for reproducibility
48
  set_seed(42)
49
  torch.manual_seed(42)
 
56
  logger.info(f"CUDA available. Using GPU: {torch.cuda.get_device_name(0)}")
57
  logger.info(f"Available GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  try:
60
+ # Load and process the dataset
61
+ annotations_file = "./data/subtask-2-annotations.txt"
62
+ raw_dir = "./data/raw"
63
+ logger.info("Loading and processing dataset...")
64
+
65
+ processor = AdvancedNarrativeProcessor(
66
+ annotations_file=annotations_file,
67
+ raw_dir=raw_dir
68
+ )
69
+ processed_data = processor.load_and_process_data()
70
+
71
+ # Create datasets
72
+ train_dataset = NarrativeDataset(processed_data['train'])
73
+ val_dataset = NarrativeDataset(processed_data['val'])
74
+ logger.info(f"Loaded dataset with {len(train_dataset)} training samples and {len(val_dataset)} validation samples.")
75
+
76
+ # Initialize model
77
+ logger.info("Initializing the model...")
78
+ model = NarrativeClassifier(
79
+ num_labels=train_dataset.get_num_labels(),
80
+ model_name="microsoft/deberta-v3-large"
81
+ )
82
+
83
+ # Define optimized training configuration
84
+ config = TrainingConfig(
85
+ output_dir=Path("./output"),
86
+ num_epochs=5,
87
+ batch_size=4, # Reduced batch size for memory
88
+ learning_rate=2e-5,
89
+ warmup_ratio=0.1,
90
+ weight_decay=0.01,
91
+ max_grad_norm=1.0,
92
+ eval_steps=50,
93
+ save_steps=50,
94
+ fp16=True, # Enable mixed precision
95
+ gradient_accumulation_steps=4, # Gradient accumulation
96
+ max_length=256 # Reduced sequence length
97
+ )
98
+ logger.info("Training configuration:")
99
+ for key, value in vars(config).items():
100
+ logger.info(f" {key}: {value}")
101
+
102
  # Initialize trainer
103
  trainer = NarrativeTrainer(
104
  model=model,