File size: 3,000 Bytes
fb2cd67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# File: scripts/models/dataset.py

from torch.utils.data import Dataset
import torch
from typing import Dict, Any
from pathlib import Path

class _LengthMismatchError(ValueError, AssertionError):
    """Raised when the per-sample fields of a dataset differ in length.

    Also an ``AssertionError`` so callers written against the earlier
    ``assert``-based check keep working.
    """


class NarrativeDataset(Dataset):
    """
    Dataset class for narrative classification.
    Handles the data after preprocessing for model training.

    Each sample is served as a dictionary with the keys ``'input_ids'``,
    ``'attention_mask'``, ``'labels'`` and ``'features'``.
    """
    def __init__(self, data_dict: Dict[str, Any]):
        """
        Initialize the dataset with processed data.

        Args:
            data_dict: Dictionary containing processed data from
                AdvancedNarrativeProcessor. Must provide 'input_ids',
                'attention_mask', 'labels' and 'features'. 'labels' and
                'features' must expose ``.float()`` (presumably torch
                tensors -- confirm against the processor).

        Raises:
            KeyError: If one of the required keys is missing.
            ValueError: If the fields do not all have the same number of
                samples (the raised error is also an AssertionError).
        """
        super().__init__()
        self.input_ids = data_dict['input_ids']
        self.attention_mask = data_dict['attention_mask']
        # Convert labels and features to float
        self.labels = data_dict['labels'].float()
        self.features = data_dict['features'].float()

        # Verify data consistency with an explicit raise: a plain `assert`
        # is stripped under `python -O`, and every field is indexed with the
        # same idx in __getitem__, so all of them must match input_ids.
        expected = len(self.input_ids)
        for name in ('attention_mask', 'labels', 'features'):
            actual = len(getattr(self, name))
            if actual != expected:
                raise _LengthMismatchError(
                    f"Mismatch between inputs and {name} length "
                    f"({expected} vs {actual})"
                )

    def __len__(self) -> int:
        """Return the total number of samples."""
        return len(self.input_ids)

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        """
        Get a single sample from the dataset.

        Args:
            idx: Index of the sample to get

        Returns:
            Dictionary with the entries of 'input_ids', 'attention_mask',
            'labels' and 'features' at position ``idx``.
        """
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx],
            'features': self.features[idx]
        }

    def get_num_labels(self) -> int:
        """
        Return the number of labels in the dataset.

        Reads the size of the second dimension of ``labels``, so labels are
        assumed to be laid out as (num_samples, num_labels) -- TODO confirm.
        """
        return self.labels.shape[1]

# Real test with our preprocessed data
if __name__ == "__main__":
    # Smoke test: run the project preprocessor, wrap its 'train' and 'val'
    # outputs in NarrativeDataset and print basic statistics.
    import sys

    # This file lives at scripts/models/dataset.py, so the repository root is
    # two levels above its directory. Anchoring on __file__ instead of
    # "../../" (which is resolved against the current working directory)
    # lets the script run from any directory.
    repo_root = Path(__file__).resolve().parents[2]
    sys.path.append(str(repo_root))  # Add root to path
    from scripts.data_processing.data_preparation import AdvancedNarrativeProcessor

    # Initialize preprocessor; paths are passed as strings, as before.
    data_dir = repo_root / "data"
    processor = AdvancedNarrativeProcessor(
        annotations_file=str(data_dir / "subtask-2-annotations.txt"),
        raw_dir=str(data_dir / "raw"),
    )

    # Get processed data
    processed_data = processor.load_and_process_data()

    # Create train and validation datasets
    train_dataset = NarrativeDataset(processed_data['train'])
    val_dataset = NarrativeDataset(processed_data['val'])

    # Print information about the datasets
    print("\n=== Dataset Statistics ===")
    print(f"Training samples: {len(train_dataset)}")
    print(f"Validation samples: {len(val_dataset)}")
    print(f"Number of labels: {train_dataset.get_num_labels()}")

    # Look at a sample
    sample = train_dataset[0]
    print("\n=== Sample Details ===")
    print(f"Input IDs shape: {sample['input_ids'].shape}")
    print(f"Attention mask shape: {sample['attention_mask'].shape}")
    print(f"Labels shape: {sample['labels'].shape}")
    print(f"Features shape: {sample['features'].shape}")