# newsemeval/scripts/data_processing/data_preparation.py
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Dict, List, Tuple
import re
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
import torch
from transformers import AutoTokenizer
import logging
from tqdm import tqdm
class AdvancedNarrativeProcessor:
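    """Prepare narrative/subnarrative annotation data for model training:
    loads the annotation file, cleans and featurizes the raw articles, and
    builds stratified train/validation inputs for a transformer classifier.
    """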
def __init__(self, annotations_file: str, raw_dir: str, model_name: str = "microsoft/deberta-v3-large"):
self.setup_logging()
self.logger = logging.getLogger(__name__)
self.annotations_file = Path(annotations_file)
self.raw_dir = Path(raw_dir)
self.model_name = model_name
# Initialize tokenizer
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Initialize spaCy (download the model with `python -m spacy download en_core_web_sm`)
        self.nlp = spacy.load("en_core_web_sm")
        # Take stop words from the loaded pipeline instead of spacy.lang.en,
        # which is only reachable as an attribute after an implicit import
        self.stopwords = self.nlp.Defaults.stop_words
# Initialize state
self.df = None
self.processed_data = None
self.label_encodings = None
self.tfidf_vectorizer = None
def setup_logging(self):
"""Set up logging configuration"""
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
datefmt='%Y-%m-%d %H:%M:%S'
)
def load_and_process_data(self) -> Dict:
"""Main processing pipeline"""
self.logger.info("Starting data processing pipeline...")
# 1. Load Raw Data
self.load_data()
# 2. Process Text and Labels
processed_articles = self.process_all_articles()
# 3. Engineer Features
self.add_features(processed_articles)
# 4. Create Data Splits
train_data, val_data = self.create_splits(processed_articles)
# 5. Prepare Model Inputs
train_inputs = self.prepare_model_inputs(train_data)
val_inputs = self.prepare_model_inputs(val_data)
self.logger.info("Data processing complete!")
return {
'train': train_inputs,
'val': val_inputs,
'label_encodings': self.label_encodings,
'stats': self.get_statistics()
}
def load_data(self):
"""Load and prepare the annotation data"""
self.logger.info(f"Loading annotations from {self.annotations_file}")
# Load annotations file
self.df = pd.read_csv(
self.annotations_file,
sep='\t',
names=['article_id', 'narratives', 'subnarratives']
)
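        # Expected format: header-less TSV with one row per article:
        #   <article_id>\t<narratives>\t<subnarratives> (';'-separated lists)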
# Create label encodings
        all_subnarratives = set()
        # Build the label vocabulary; dropna() guards against unannotated rows,
        # where str.split would yield NaN and break the set update
        for subnarrs in self.df['subnarratives'].dropna().str.split(';'):
            all_subnarratives.update(subnarrs)
self.label_encodings = {
label: idx for idx, label in enumerate(sorted(all_subnarratives))
}
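        # e.g. {'subnarrative A': 0, 'subnarrative B': 1, ...} (illustrative keys)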
self.logger.info(f"Loaded {len(self.df)} articles with {len(self.label_encodings)} unique labels")
def read_article(self, article_id: str) -> str:
"""Read article content from file"""
try:
with open(self.raw_dir / article_id, 'r', encoding='utf-8') as f:
return f.read()
except Exception as e:
self.logger.error(f"Error reading article {article_id}: {e}")
return ""
def process_text(self, text: str) -> str:
"""Enhanced text processing"""
# Remove URLs and emails
text = re.sub(r'http\S+|www\S+|\S+@\S+', '', text)
        # Replace digit runs with a NUM token and drop uncommon special characters
        text = re.sub(r'\d+', ' NUM ', text)
        text = re.sub(r'[^\w\s.,!?-]', ' ', text)
        # Normalize whitespace last, since the substitutions above introduce extra spaces
        text = ' '.join(text.split())
        return text
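    # process_text example (hypothetical input):
    #   "Visit http://example.com! Costs rose 25% in 2023."
    #   -> "Visit Costs rose NUM in NUM ."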
def extract_features(self, text: str) -> Dict:
"""Extract rich text features using SpaCy."""
# Process text with SpaCy
doc = self.nlp(text)
words = [token.text for token in doc if not token.is_space]
sentences = list(doc.sents)
        return {
            'length': len(words),
            # Guard against empty documents: np.mean([]) returns NaN with a warning
            'avg_word_length': float(np.mean([len(w) for w in words])) if words else 0,
            'sentence_count': len(sentences),
            'avg_sentence_length': len(words) / len(sentences) if sentences else 0,
            'unique_words': len(set(words)),
            'density': len(set(words)) / len(words) if words else 0
        }
def process_all_articles(self) -> List[Dict]:
"""Process all articles with rich features"""
processed_articles = []
        # Pass total explicitly: tqdm cannot infer the length of an iterrows() generator
        for _, row in tqdm(self.df.iterrows(), total=len(self.df), desc="Processing articles"):
# Read and process text
text = self.read_article(row['article_id'])
processed_text = self.process_text(text)
# Extract features
features = self.extract_features(processed_text)
# Process labels
labels = self.process_labels(row['subnarratives'])
processed_articles.append({
'id': row['article_id'],
'text': processed_text,
'features': features,
'labels': labels,
'domain': 'UA' if 'UA' in row['article_id'] else 'CC'
})
return processed_articles
def process_labels(self, subnarratives: str) -> List[int]:
"""Convert subnarratives string to label vector"""
label_vector = [0] * len(self.label_encodings)
for subnarr in subnarratives.split(';'):
if subnarr in self.label_encodings:
label_vector[self.label_encodings[subnarr]] = 1
return label_vector
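    # process_labels example (hypothetical): with label_encodings {'A': 0, 'B': 1, 'C': 2},
    # "A;C" -> [1, 0, 1]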
def add_features(self, articles: List[Dict]):
"""Add TF-IDF and additional features"""
# Create TF-IDF features
self.tfidf_vectorizer = TfidfVectorizer(
max_features=5000,
stop_words='english'
)
texts = [article['text'] for article in articles]
tfidf_features = self.tfidf_vectorizer.fit_transform(texts)
# Add to articles
for idx, article in enumerate(articles):
article['tfidf_features'] = tfidf_features[idx]
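            # NB: tfidf_features[idx] is a 1 x max_features scipy sparse row;
            # call .toarray() downstream if a dense vector is needed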
def create_splits(self, articles: List[Dict]) -> Tuple[List[Dict], List[Dict]]:
"""Create stratified splits"""
        # Stratify on domain plus number of active labels. Stratifying on the full
        # multi-hot vector (as one string) makes almost every article its own class,
        # and StratifiedKFold requires every class to have at least n_splits members.
        stratify_labels = [f"{a['domain']}_{sum(a['labels'])}" for a in articles]
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        train_idx, val_idx = next(skf.split(articles, stratify_labels))
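        # Only the first of the five folds is used, yielding a single ~80/20 split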
return [articles[i] for i in train_idx], [articles[i] for i in val_idx]
def prepare_model_inputs(self, articles: List[Dict]) -> Dict[str, torch.Tensor]:
"""Prepare inputs for the model"""
# Tokenize texts
encodings = self.tokenizer(
[a['text'] for a in articles],
padding=True,
truncation=True,
max_length=512,
return_tensors='pt'
)
        # Multi-hot label vectors as float32, the dtype BCE-style multi-label losses expect
        labels = torch.tensor([a['labels'] for a in articles], dtype=torch.float32)
        # Hand-crafted features as an explicit float32 tensor
        # (note: 'unique_words' is computed in extract_features but not used here)
        features = torch.tensor([[
            a['features']['length'],
            a['features']['avg_word_length'],
            a['features']['sentence_count'],
            a['features']['avg_sentence_length'],
            a['features']['density']
        ] for a in articles], dtype=torch.float32)
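        # NB: these features live on very different scales ('length' vs. 'density');
        # standardizing them with training-set statistics may help scale-sensitive models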
return {
'input_ids': encodings['input_ids'],
'attention_mask': encodings['attention_mask'],
'labels': labels,
'features': features
}
def get_label_distribution(self) -> Dict:
"""Calculate the distribution of labels in the dataset"""
if self.df is None:
return {}
label_counts = {}
        for subnarrs in self.df['subnarratives'].dropna().str.split(';'):
for subnarr in subnarrs:
if subnarr in self.label_encodings:
label_counts[subnarr] = label_counts.get(subnarr, 0) + 1
return label_counts
def get_statistics(self) -> Dict:
"""Get processing statistics"""
return {
'total_articles': len(self.df),
'label_distribution': self.get_label_distribution(),
'vocabulary_size': len(self.tfidf_vectorizer.vocabulary_),
'domain_distribution': self.df['article_id'].apply(
lambda x: 'UA' if 'UA' in x else 'CC'
).value_counts().to_dict()
}
def analyze_features(self, processed_data: Dict) -> Dict:
"""Analyze feature statistics from processed data"""
train_features = processed_data['train']['features']
feature_names = ['length', 'avg_word_length', 'sentence_count',
'avg_sentence_length', 'density']
feature_stats = {}
for i, name in enumerate(feature_names):
values = train_features[:, i]
feature_stats[name] = {
'mean': float(values.mean()),
'std': float(values.std()),
'min': float(values.min()),
'max': float(values.max())
}
return feature_stats
# Usage example
if __name__ == "__main__":
processor = AdvancedNarrativeProcessor(
annotations_file="../../data/subtask-2-annotations.txt",
raw_dir="../../data/raw"
)
processed_data = processor.load_and_process_data()
# Print statistics
stats = processed_data['stats']
print("\n=== Processing Statistics ===")
print(f"Total Articles: {stats['total_articles']}")
print(f"Vocabulary Size: {stats['vocabulary_size']}")
print("\nDomain Distribution:")
for domain, count in stats['domain_distribution'].items():
print(f"{domain}: {count} articles")
# Print feature analysis
feature_stats = processor.analyze_features(processed_data)
print("\n=== Feature Statistics ===")
    # Loop variable renamed so it does not shadow the `stats` dict above
    for name, fstats in feature_stats.items():
        print(f"{name}:")
        print(f"  Mean: {fstats['mean']:.2f}")
        print(f"  Std: {fstats['std']:.2f}")
        print(f"  Range: [{fstats['min']:.2f}, {fstats['max']:.2f}]")