Spaces:

eerrffuunn
/

gpusemeval

Runtime error

App Files Community

Mohammaderfan koupaei committed on Dec 17, 2024

Commit

06ca50d

1 Parent(s): b5e09fa

second

Browse files

Files changed (2) hide show

scripts/config/config.py +10 -7
scripts/training/trainer.py +117 -55

scripts/config/config.py CHANGED Viewed

@@ -12,22 +12,25 @@ class TrainingConfig:
     # Training parameters
     num_epochs: int = 5
-    learning_rate: float = 2e-5
-    warmup_ratio: float = 0.1
     weight_decay: float = 0.01
     max_grad_norm: float = 1.0
     gradient_accumulation_steps: int = 4
-    fp16: bool = True  # Enable mixed precision training
-    max_length: int = 256  # Reduce from 512
-    batch_size: int = 4  # Reduce from 8
     # Data parameters
-    max_length: int = 512
     train_ratio: float = 0.8
     # Output parameters
     output_dir: Path = Path("outputs")
-    save_steps: int = 100
     eval_steps: int = 50
     # Device

     # Training parameters
     num_epochs: int = 5
+    learning_rate: float = 1e-5  # Reduced from 2e-5
+    warmup_ratio: float = 0.2    # Increased from 0.1
     weight_decay: float = 0.01
     max_grad_norm: float = 1.0
     gradient_accumulation_steps: int = 4
+    fp16: bool = True
     # Data parameters
+    max_length: int = 256
+    batch_size: int = 4
     train_ratio: float = 0.8
+    # Loss parameters
+    pos_weight_multiplier: float = 5.0  # Weight multiplier for positive classes
+    label_smoothing: float = 0.1        # Label smoothing factor
     # Output parameters
     output_dir: Path = Path("outputs")
+    save_steps: int = 50
     eval_steps: int = 50
     # Device

scripts/training/trainer.py CHANGED Viewed

@@ -11,7 +11,7 @@ from datetime import datetime
 from torch.cuda.amp import autocast, GradScaler
 class NarrativeTrainer:
-    """Comprehensive trainer for narrative classification with GPU memory optimizations"""
     def __init__(
         self,
         model,
@@ -60,28 +60,33 @@ class NarrativeTrainer:
         self.history = {
             'train_loss': [],
             'val_loss': [],
-            'val_f1': [],
-            'val_precision': [],
-            'val_recall': []
         }
     def setup_logging(self):
-        """Initialize logging configuration"""
         logging.basicConfig(
             level=logging.INFO,
             format='%(asctime)s - %(levelname)s - %(message)s',
             datefmt='%Y-%m-%d %H:%M:%S'
         )
     def setup_training(self):
-        """Initialize training components with memory optimizations"""
         # Create dataloaders
         self.train_loader = DataLoader(
             self.train_dataset,
             batch_size=self.config.batch_size,
             shuffle=True,
             num_workers=4,
-            pin_memory=True  # Optimize data transfer to GPU
         )
         self.val_loader = DataLoader(
@@ -91,6 +96,15 @@ class NarrativeTrainer:
             pin_memory=True
         )
         # Setup optimizer
         self.optimizer = torch.optim.AdamW(
             self.model.parameters(),
@@ -98,7 +112,7 @@ class NarrativeTrainer:
             weight_decay=self.config.weight_decay
         )
-        # Setup scheduler with gradient accumulation steps
         num_update_steps_per_epoch = len(self.train_loader) // self.config.gradient_accumulation_steps
         num_training_steps = num_update_steps_per_epoch * self.config.num_epochs
         num_warmup_steps = int(num_training_steps * self.config.warmup_ratio)
@@ -109,17 +123,77 @@ class NarrativeTrainer:
             num_training_steps=num_training_steps
         )
-        self.criterion = torch.nn.BCEWithLogitsLoss()
     def save_config(self):
-        """Save training configuration"""
         config_dict = {k: str(v) for k, v in vars(self.config).items()}
         config_path = self.output_dir / 'config.json'
         with open(config_path, 'w') as f:
             json.dump(config_dict, f, indent=4)
     def train_epoch(self):
-        """Train for one epoch with memory optimizations"""
         self.model.train()
         total_loss = 0
         self.optimizer.zero_grad()
@@ -129,10 +203,8 @@ class NarrativeTrainer:
                    desc=f'Epoch {self.current_epoch + 1}/{self.config.num_epochs}')
         for step, batch in pbar:
-            # Move batch to device
             batch = {k: v.to(self.device, non_blocking=True) for k, v in batch.items()}
-            # Mixed precision forward pass
             with autocast(enabled=self.config.fp16):
                 outputs = self.model(
                     input_ids=batch['input_ids'],
@@ -142,10 +214,8 @@ class NarrativeTrainer:
                 loss = self.criterion(outputs, batch['labels'])
                 loss = loss / self.config.gradient_accumulation_steps
-            # Scaled backward pass
             self.scaler.scale(loss).backward()
-            # Update weights if we've accumulated enough gradients
             if (step + 1) % self.config.gradient_accumulation_steps == 0:
                 self.scaler.unscale_(self.optimizer)
                 torch.nn.utils.clip_grad_norm_(
@@ -158,33 +228,29 @@ class NarrativeTrainer:
                 self.scheduler.step()
                 self.optimizer.zero_grad()
-            # Update metrics
             total_loss += loss.item() * self.config.gradient_accumulation_steps
             avg_loss = total_loss / (step + 1)
             pbar.set_postfix({'loss': f'{avg_loss:.4f}'})
             self.global_step += 1
-            # Evaluate if needed
             if self.global_step % self.config.eval_steps == 0:
                 self.evaluate()
-            # Clear memory periodically
             if step % 10 == 0:
                 torch.cuda.empty_cache()
-            # Clear unnecessary tensors
             del outputs
             del loss
         return total_loss / len(self.train_loader)
     @torch.no_grad()
     def evaluate(self):
-        """Evaluate model with memory optimizations"""
         self.model.eval()
         total_loss = 0
-        all_preds, all_labels = [], []
         for batch in tqdm(self.val_loader, desc="Evaluating"):
             batch = {k: v.to(self.device, non_blocking=True) for k, v in batch.items()}
@@ -198,40 +264,38 @@ class NarrativeTrainer:
                 loss = self.criterion(outputs, batch['labels'])
             total_loss += loss.item()
-            # CPU computations for predictions
-            preds = (torch.sigmoid(outputs) > 0.5).cpu().numpy()
-            labels = batch['labels'].cpu().numpy()
-            all_preds.append(preds)
-            all_labels.append(labels)
-            # Clear memory
             del outputs
             del loss
             torch.cuda.empty_cache()
-        # Compute metrics
-        all_preds = np.concatenate(all_preds, axis=0)
-        all_labels = np.concatenate(all_labels, axis=0)
-        metrics = {
-            'loss': total_loss / len(self.val_loader),
-            'f1': f1_score(all_labels, all_preds, average='micro'),
-            'precision': precision_score(all_labels, all_preds, average='micro'),
-            'recall': recall_score(all_labels, all_preds, average='micro')
-        }
-        self.logger.info(f"Step {self.global_step} - Validation metrics: {metrics}")
-        if metrics['f1'] > self.best_val_f1:
-            self.best_val_f1 = metrics['f1']
             self.save_model('best_model.pt', metrics)
         return metrics
     def save_model(self, filename: str, metrics: dict = None):
-        """Save model checkpoint"""
         save_path = self.output_dir / filename
         torch.save({
             'model_state_dict': self.model.state_dict(),
@@ -241,10 +305,11 @@ class NarrativeTrainer:
             'epoch': self.current_epoch,
             'global_step': self.global_step,
             'best_val_f1': self.best_val_f1,
-            'metrics': metrics
         }, save_path)
         self.logger.info(f"Model saved to {save_path}")
     def train(self):
         """Run complete training loop"""
         self.logger.info("Starting training...")
@@ -257,14 +322,11 @@ class NarrativeTrainer:
                 self.history['train_loss'].append(train_loss)
                 val_metrics = self.evaluate()
-                self.history['val_loss'].append(val_metrics['loss'])
-                self.history['val_f1'].append(val_metrics['f1'])
-                self.history['val_precision'].append(val_metrics['precision'])
-                self.history['val_recall'].append(val_metrics['recall'])
                 self.save_model(f'checkpoint_epoch_{epoch+1}.pt', val_metrics)
-                # Save training history
                 history_path = self.output_dir / 'history.json'
                 with open(history_path, 'w') as f:
                     json.dump(self.history, f, indent=4)

 from torch.cuda.amp import autocast, GradScaler
 class NarrativeTrainer:
+    """Enhanced trainer with detailed metrics and optimizations"""
     def __init__(
         self,
         model,
         self.history = {
             'train_loss': [],
             'val_loss': [],
+            'metrics': [],
+            'thresholds': []
         }
     def setup_logging(self):
         logging.basicConfig(
             level=logging.INFO,
             format='%(asctime)s - %(levelname)s - %(message)s',
             datefmt='%Y-%m-%d %H:%M:%S'
         )
+    def calculate_class_weights(self):
+        """Calculate weights for imbalanced classes"""
+        pos_counts = self.train_dataset.labels.sum(dim=0)
+        neg_counts = len(self.train_dataset) - pos_counts
+        pos_weight = (neg_counts / pos_counts) * self.config.pos_weight_multiplier
+        return torch.clamp(pos_weight, min=1.0, max=50.0).to(self.device)
     def setup_training(self):
+        """Initialize training components with optimizations"""
         # Create dataloaders
         self.train_loader = DataLoader(
             self.train_dataset,
             batch_size=self.config.batch_size,
             shuffle=True,
             num_workers=4,
+            pin_memory=True
         )
         self.val_loader = DataLoader(
             pin_memory=True
         )
+        # Calculate class weights
+        pos_weight = self.calculate_class_weights()
+        # Setup loss function with class weights
+        self.criterion = torch.nn.BCEWithLogitsLoss(
+            pos_weight=pos_weight,
+            label_smoothing=self.config.label_smoothing
+        )
         # Setup optimizer
         self.optimizer = torch.optim.AdamW(
             self.model.parameters(),
             weight_decay=self.config.weight_decay
         )
+        # Setup scheduler
         num_update_steps_per_epoch = len(self.train_loader) // self.config.gradient_accumulation_steps
         num_training_steps = num_update_steps_per_epoch * self.config.num_epochs
         num_warmup_steps = int(num_training_steps * self.config.warmup_ratio)
             num_training_steps=num_training_steps
         )
+        # Initialize thresholds
+        self.label_thresholds = torch.ones(self.train_dataset.get_num_labels()).to(self.device) * 0.5
     def save_config(self):
         config_dict = {k: str(v) for k, v in vars(self.config).items()}
         config_path = self.output_dir / 'config.json'
         with open(config_path, 'w') as f:
             json.dump(config_dict, f, indent=4)
+    def find_optimal_thresholds(self, val_outputs, val_labels):
+        """Find optimal threshold for each label"""
+        outputs = torch.sigmoid(val_outputs).cpu().numpy()
+        labels = val_labels.cpu().numpy()
+        thresholds = []
+        for i in range(labels.shape[1]):
+            best_f1 = 0
+            best_threshold = 0.5
+            if labels[:, i].sum() > 0:  # Only if we have positive samples
+                for threshold in np.arange(0.1, 0.9, 0.05):
+                    preds = (outputs[:, i] > threshold).astype(int)
+                    f1 = f1_score(labels[:, i], preds)
+                    if f1 > best_f1:
+                        best_f1 = f1
+                        best_threshold = threshold
+            thresholds.append(best_threshold)
+        return torch.tensor(thresholds).to(self.device)
+    def calculate_detailed_metrics(self, all_labels, all_preds, all_probs=None):
+        """Calculate detailed metrics for model evaluation"""
+        metrics = {}
+        # Basic metrics
+        metrics['micro'] = {
+            'precision': precision_score(all_labels, all_preds, average='micro'),
+            'recall': recall_score(all_labels, all_preds, average='micro'),
+            'f1': f1_score(all_labels, all_preds, average='micro')
+        }
+        metrics['macro'] = {
+            'precision': precision_score(all_labels, all_preds, average='macro'),
+            'recall': recall_score(all_labels, all_preds, average='macro'),
+            'f1': f1_score(all_labels, all_preds, average='macro')
+        }
+        metrics['weighted'] = {
+            'precision': precision_score(all_labels, all_preds, average='weighted'),
+            'recall': recall_score(all_labels, all_preds, average='weighted'),
+            'f1': f1_score(all_labels, all_preds, average='weighted')
+        }
+        # Per-class metrics
+        per_class_metrics = {}
+        precisions = precision_score(all_labels, all_preds, average=None)
+        recalls = recall_score(all_labels, all_preds, average=None)
+        f1s = f1_score(all_labels, all_preds, average=None)
+        for i in range(len(f1s)):
+            per_class_metrics[f'class_{i}'] = {
+                'precision': float(precisions[i]),
+                'recall': float(recalls[i]),
+                'f1': float(f1s[i]),
+                'support': int(all_labels[:, i].sum())
+            }
+        metrics['per_class'] = per_class_metrics
+        return metrics
     def train_epoch(self):
+        """Train for one epoch with optimizations"""
         self.model.train()
         total_loss = 0
         self.optimizer.zero_grad()
                    desc=f'Epoch {self.current_epoch + 1}/{self.config.num_epochs}')
         for step, batch in pbar:
             batch = {k: v.to(self.device, non_blocking=True) for k, v in batch.items()}
             with autocast(enabled=self.config.fp16):
                 outputs = self.model(
                     input_ids=batch['input_ids'],
                 loss = self.criterion(outputs, batch['labels'])
                 loss = loss / self.config.gradient_accumulation_steps
             self.scaler.scale(loss).backward()
             if (step + 1) % self.config.gradient_accumulation_steps == 0:
                 self.scaler.unscale_(self.optimizer)
                 torch.nn.utils.clip_grad_norm_(
                 self.scheduler.step()
                 self.optimizer.zero_grad()
             total_loss += loss.item() * self.config.gradient_accumulation_steps
             avg_loss = total_loss / (step + 1)
             pbar.set_postfix({'loss': f'{avg_loss:.4f}'})
             self.global_step += 1
             if self.global_step % self.config.eval_steps == 0:
                 self.evaluate()
             if step % 10 == 0:
                 torch.cuda.empty_cache()
             del outputs
             del loss
         return total_loss / len(self.train_loader)
     @torch.no_grad()
     def evaluate(self):
+        """Evaluate model with detailed metrics"""
         self.model.eval()
         total_loss = 0
+        all_outputs, all_labels = [], []
         for batch in tqdm(self.val_loader, desc="Evaluating"):
             batch = {k: v.to(self.device, non_blocking=True) for k, v in batch.items()}
                 loss = self.criterion(outputs, batch['labels'])
             total_loss += loss.item()
+            all_outputs.append(outputs.cpu())
+            all_labels.append(batch['labels'].cpu())
             del outputs
             del loss
             torch.cuda.empty_cache()
+        all_outputs = torch.cat(all_outputs, dim=0)
+        all_labels = torch.cat(all_labels, dim=0)
+        if self.global_step % (self.config.eval_steps * 2) == 0:
+            self.label_thresholds = self.find_optimal_thresholds(all_outputs, all_labels)
+        all_probs = torch.sigmoid(all_outputs).numpy()
+        all_preds = (all_probs > self.label_thresholds.cpu().unsqueeze(0).numpy())
+        all_labels = all_labels.numpy()
+        metrics = self.calculate_detailed_metrics(all_labels, all_preds, all_probs)
+        metrics['loss'] = total_loss / len(self.val_loader)
+        self.logger.info(f"Step {self.global_step} - Validation metrics:")
+        self.logger.info(f"Loss: {metrics['loss']:.4f}")
+        self.logger.info(f"Micro F1: {metrics['micro']['f1']:.4f}")
+        self.logger.info(f"Macro F1: {metrics['macro']['f1']:.4f}")
+        if metrics['micro']['f1'] > self.best_val_f1:
+            self.best_val_f1 = metrics['micro']['f1']
             self.save_model('best_model.pt', metrics)
         return metrics
     def save_model(self, filename: str, metrics: dict = None):
         save_path = self.output_dir / filename
         torch.save({
             'model_state_dict': self.model.state_dict(),
             'epoch': self.current_epoch,
             'global_step': self.global_step,
             'best_val_f1': self.best_val_f1,
+            'metrics': metrics,
+            'thresholds': self.label_thresholds
         }, save_path)
         self.logger.info(f"Model saved to {save_path}")
     def train(self):
         """Run complete training loop"""
         self.logger.info("Starting training...")
                 self.history['train_loss'].append(train_loss)
                 val_metrics = self.evaluate()
+                self.history['metrics'].append(val_metrics)
+                self.history['thresholds'].append(self.label_thresholds.cpu().tolist())
                 self.save_model(f'checkpoint_epoch_{epoch+1}.pt', val_metrics)
                 history_path = self.output_dir / 'history.json'
                 with open(history_path, 'w') as f:
                     json.dump(self.history, f, indent=4)