# newsemeval/scripts/data_processing/data_preparation.py
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Dict, List, Tuple
import re
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
import torch
from transformers import AutoTokenizer
import logging
from tqdm import tqdm
class AdvancedNarrativeProcessor:
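    """Prepare narrative/subnarrative annotation data for model training:
    loads the annotation file, cleans and featurizes the raw articles, and
    builds stratified train/validation inputs for a transformer classifier.
    """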
def __init__(self, annotations_file: str, raw_dir: str, model_name: str = "microsoft/deberta-v3-large"):
self.setup_logging()
self.logger = logging.getLogger(__name__)
self.annotations_file = Path(annotations_file)
self.raw_dir = Path(raw_dir)
self.model_name = model_name
# Initialize tokenizer
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Initialize spaCy (download the model with `python -m spacy download en_core_web_sm`)
        self.nlp = spacy.load("en_core_web_sm")
        # Take stop words from the loaded pipeline instead of spacy.lang.en,
        # which is only reachable as an attribute after an implicit import
        self.stopwords = self.nlp.Defaults.stop_words
# Initialize state
self.df = None
self.processed_data = None
self.label_encodings = None
self.tfidf_vectorizer = None
def setup_logging(self):
"""Set up logging configuration"""
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
datefmt='%Y-%m-%d %H:%M:%S'
)
def load_and_process_data(self) -> Dict:
"""Main processing pipeline"""
self.logger.info("Starting data processing pipeline...")
# 1. Load Raw Data
self.load_data()
# 2. Process Text and Labels
processed_articles = self.process_all_articles()
# 3. Engineer Features
self.add_features(processed_articles)
# 4. Create Data Splits
train_data, val_data = self.create_splits(processed_articles)
# 5. Prepare Model Inputs
train_inputs = self.prepare_model_inputs(train_data)
val_inputs = self.prepare_model_inputs(val_data)
self.logger.info("Data processing complete!")
return {
'train': train_inputs,
'val': val_inputs,
'label_encodings': self.label_encodings,
'stats': self.get_statistics()
}
def load_data(self):
"""Load and prepare the annotation data"""
self.logger.info(f"Loading annotations from {self.annotations_file}")
# Load annotations file
self.df = pd.read_csv(
self.annotations_file,
sep='\t',
names=['article_id', 'narratives', 'subnarratives']
)
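        # Expected format: header-less TSV with one row per article:
        #   <article_id>\t<narratives>\t<subnarratives> (';'-separated lists)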
# Create label encodings
        all_subnarratives = set()
        # Build the label vocabulary; dropna() guards against unannotated rows,
        # where str.split would yield NaN and break the set update
        for subnarrs in self.df['subnarratives'].dropna().str.split(';'):
            all_subnarratives.update(subnarrs)
self.label_encodings = {
label: idx for idx, label in enumerate(sorted(all_subnarratives))
}
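        # e.g. {'subnarrative A': 0, 'subnarrative B': 1, ...} (illustrative keys)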
self.logger.info(f"Loaded {len(self.df)} articles with {len(self.label_encodings)} unique labels")
def read_article(self, article_id: str) -> str:
"""Read article content from file"""
try:
with open(self.raw_dir / article_id, 'r', encoding='utf-8') as f:
return f.read()
except Exception as e:
self.logger.error(f"Error reading article {article_id}: {e}")
return ""
def process_text(self, text: str) -> str:
"""Enhanced text processing"""
# Remove URLs and emails
text = re.sub(r'http\S+|www\S+|\S+@\S+', '', text)
        # Replace digit runs with a NUM token and drop uncommon special characters
        text = re.sub(r'\d+', ' NUM ', text)
        text = re.sub(r'[^\w\s.,!?-]', ' ', text)
        # Normalize whitespace last, since the substitutions above introduce extra spaces
        text = ' '.join(text.split())
        return text
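    # process_text example (hypothetical input):
    #   "Visit http://example.com! Costs rose 25% in 2023."
    #   -> "Visit Costs rose NUM in NUM ."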
def extract_features(self, text: str) -> Dict:
"""Extract rich text features using SpaCy."""
# Process text with SpaCy
doc = self.nlp(text)
words = [token.text for token in doc if not token.is_space]
sentences = list(doc.sents)
        return {
            'length': len(words),
            # Guard against empty documents: np.mean([]) returns NaN with a warning
            'avg_word_length': float(np.mean([len(w) for w in words])) if words else 0,
            'sentence_count': len(sentences),
            'avg_sentence_length': len(words) / len(sentences) if sentences else 0,
            'unique_words': len(set(words)),
            'density': len(set(words)) / len(words) if words else 0
        }
def process_all_articles(self) -> List[Dict]:
"""Process all articles with rich features"""
processed_articles = []
        # Pass total explicitly: tqdm cannot infer the length of an iterrows() generator
        for _, row in tqdm(self.df.iterrows(), total=len(self.df), desc="Processing articles"):
# Read and process text
text = self.read_article(row['article_id'])
processed_text = self.process_text(text)
# Extract features
features = self.extract_features(processed_text)
# Process labels
labels = self.process_labels(row['subnarratives'])
processed_articles.append({
'id': row['article_id'],
'text': processed_text,
'features': features,
'labels': labels,
'domain': 'UA' if 'UA' in row['article_id'] else 'CC'
})
return processed_articles
def process_labels(self, subnarratives: str) -> List[int]:
"""Convert subnarratives string to label vector"""
label_vector = [0] * len(self.label_encodings)
for subnarr in subnarratives.split(';'):
if subnarr in self.label_encodings:
label_vector[self.label_encodings[subnarr]] = 1
return label_vector
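    # process_labels example (hypothetical): with label_encodings {'A': 0, 'B': 1, 'C': 2},
    # "A;C" -> [1, 0, 1]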
def add_features(self, articles: List[Dict]):
"""Add TF-IDF and additional features"""
# Create TF-IDF features
self.tfidf_vectorizer = TfidfVectorizer(
max_features=5000,
stop_words='english'
)
texts = [article['text'] for article in articles]
tfidf_features = self.tfidf_vectorizer.fit_transform(texts)
# Add to articles
for idx, article in enumerate(articles):
article['tfidf_features'] = tfidf_features[idx]
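            # NB: tfidf_features[idx] is a 1 x max_features scipy sparse row;
            # call .toarray() downstream if a dense vector is needed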
def create_splits(self, articles: List[Dict]) -> Tuple[List[Dict], List[Dict]]:
"""Create stratified splits"""
        # Stratify on domain plus number of active labels. Stratifying on the full
        # multi-hot vector (as one string) makes almost every article its own class,
        # and StratifiedKFold requires every class to have at least n_splits members.
        stratify_labels = [f"{a['domain']}_{sum(a['labels'])}" for a in articles]
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        train_idx, val_idx = next(skf.split(articles, stratify_labels))
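        # Only the first of the five folds is used, yielding a single ~80/20 split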
return [articles[i] for i in train_idx], [articles[i] for i in val_idx]
def prepare_model_inputs(self, articles: List[Dict]) -> Dict[str, torch.Tensor]:
"""Prepare inputs for the model"""
# Tokenize texts
encodings = self.tokenizer(
[a['text'] for a in articles],
padding=True,
truncation=True,
max_length=512,
return_tensors='pt'
)
        # Multi-hot label vectors as float32, the dtype BCE-style multi-label losses expect
        labels = torch.tensor([a['labels'] for a in articles], dtype=torch.float32)
        # Hand-crafted features as an explicit float32 tensor
        # (note: 'unique_words' is computed in extract_features but not used here)
        features = torch.tensor([[
            a['features']['length'],
            a['features']['avg_word_length'],
            a['features']['sentence_count'],
            a['features']['avg_sentence_length'],
            a['features']['density']
        ] for a in articles], dtype=torch.float32)
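        # NB: these features live on very different scales ('length' vs. 'density');
        # standardizing them with training-set statistics may help scale-sensitive models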
return {
'input_ids': encodings['input_ids'],
'attention_mask': encodings['attention_mask'],
'labels': labels,
'features': features
}
def get_label_distribution(self) -> Dict:
"""Calculate the distribution of labels in the dataset"""
if self.df is None:
return {}
label_counts = {}
        for subnarrs in self.df['subnarratives'].dropna().str.split(';'):
for subnarr in subnarrs:
if subnarr in self.label_encodings:
label_counts[subnarr] = label_counts.get(subnarr, 0) + 1
return label_counts
def get_statistics(self) -> Dict:
"""Get processing statistics"""
return {
'total_articles': len(self.df),
'label_distribution': self.get_label_distribution(),
'vocabulary_size': len(self.tfidf_vectorizer.vocabulary_),
'domain_distribution': self.df['article_id'].apply(
lambda x: 'UA' if 'UA' in x else 'CC'
).value_counts().to_dict()
}
def analyze_features(self, processed_data: Dict) -> Dict:
"""Analyze feature statistics from processed data"""
train_features = processed_data['train']['features']
feature_names = ['length', 'avg_word_length', 'sentence_count',
'avg_sentence_length', 'density']
feature_stats = {}
for i, name in enumerate(feature_names):
values = train_features[:, i]
feature_stats[name] = {
'mean': float(values.mean()),
'std': float(values.std()),
'min': float(values.min()),
'max': float(values.max())
}
return feature_stats
# Usage example
if __name__ == "__main__":
processor = AdvancedNarrativeProcessor(
annotations_file="../../data/subtask-2-annotations.txt",
raw_dir="../../data/raw"
)
processed_data = processor.load_and_process_data()
# Print statistics
stats = processed_data['stats']
print("\n=== Processing Statistics ===")
print(f"Total Articles: {stats['total_articles']}")
print(f"Vocabulary Size: {stats['vocabulary_size']}")
print("\nDomain Distribution:")
for domain, count in stats['domain_distribution'].items():
print(f"{domain}: {count} articles")
# Print feature analysis
feature_stats = processor.analyze_features(processed_data)
print("\n=== Feature Statistics ===")
    # Loop variable renamed so it does not shadow the `stats` dict above
    for name, fstats in feature_stats.items():
        print(f"{name}:")
        print(f"  Mean: {fstats['mean']:.2f}")
        print(f"  Std: {fstats['std']:.2f}")
        print(f"  Range: [{fstats['min']:.2f}, {fstats['max']:.2f}]")