# newsemeval/scripts/analysis/data_stats.py
# Mohammaderfan koupaei
# Add application file
# fb2cd67
import pandas as pd
from collections import Counter
import os
def load_annotations(file_path):
    """Read the tab-separated annotations file into a DataFrame.

    The file is read without a header row; its columns are named
    ``article_id``, ``narratives`` and ``subnarratives``, in that order.

    Args:
        file_path: Path (or file-like object) accepted by ``pandas.read_csv``.

    Returns:
        A ``pandas.DataFrame`` with one row per line of the file.
    """
    column_names = ['article_id', 'narratives', 'subnarratives']
    # Passing explicit names makes pandas treat the first line as data.
    return pd.read_csv(file_path, sep='\t', names=column_names)
def _split_labels(column):
    """Flatten a column of ';'-separated label strings into one list of labels."""
    labels = []
    # dropna() skips rows without an annotation: splitting such a cell yields
    # NaN, which cannot be iterated and would raise TypeError.
    for cell in column.dropna():
        for label in str(cell).split(';'):
            label = label.strip()
            if label:  # ignore empty pieces left by a stray or trailing ';'
                labels.append(label)
    return labels


def analyze_distribution(df):
    """Analyze the distribution of domains, narratives and subnarratives.

    Args:
        df: DataFrame with ``article_id``, ``narratives`` and
            ``subnarratives`` columns. The two label columns hold
            ';'-separated strings; missing cells are ignored.

    Returns:
        A dict with three entries:

        * ``'domains'``: ``value_counts()`` Series in which every article is
          labelled ``'CC'`` when its id contains ``'CC'`` and ``'UA'``
          otherwise.
        * ``'narrative_counts'``: ``Counter`` of individual narrative labels.
        * ``'subnarrative_counts'``: ``Counter`` of individual subnarrative
          labels.

        Labels are stripped of surrounding whitespace before counting, so
        ``"A; B"`` and ``"A;B"`` contribute to the same keys.
    """
    # str() keeps the membership test valid for non-string ids (e.g. numeric
    # ids or NaN parsed by pandas); those fall into the 'UA' bucket.
    domains = df['article_id'].apply(
        lambda article_id: 'CC' if 'CC' in str(article_id) else 'UA'
    ).value_counts()

    narrative_counts = Counter(_split_labels(df['narratives']))
    subnarrative_counts = Counter(_split_labels(df['subnarratives']))

    return {
        'domains': domains,
        'narrative_counts': narrative_counts,
        'subnarrative_counts': subnarrative_counts,
    }
def get_article_stats(raw_dir):
    """Compute word-count statistics over the article files in a directory.

    Every regular file directly inside ``raw_dir`` is read as UTF-8 text and
    its length is measured as the number of whitespace-separated tokens.
    Subdirectories and other non-file entries are skipped.

    Args:
        raw_dir: Directory containing the raw article files.

    Returns:
        A dict with ``'mean_length'``, ``'max_length'`` and ``'min_length'``.
        When the directory contains no files, all three values are zero
        instead of raising ``ZeroDivisionError``.
    """
    lengths = []
    for filename in os.listdir(raw_dir):
        path = os.path.join(raw_dir, filename)
        # os.listdir also returns directories, which open() cannot read.
        if not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='utf-8') as f:
            lengths.append(len(f.read().split()))

    if not lengths:
        # Nothing to measure: avoid dividing by zero / max() of an empty list.
        return {'mean_length': 0.0, 'max_length': 0, 'min_length': 0}

    return {
        'mean_length': sum(lengths) / len(lengths),
        'max_length': max(lengths),
        'min_length': min(lengths),
    }
if __name__ == "__main__":
    # Input locations; they are resolved relative to the current working
    # directory, so adjust them to match your project layout.
    annotations_file = "../../data/subtask-2-annotations.txt"
    raw_dir = "../../data/raw"

    # Compute everything first, then report.
    stats = analyze_distribution(load_annotations(annotations_file))
    article_stats = get_article_stats(raw_dir)

    print("\n=== Domain Distribution ===")
    print(stats['domains'])

    # Both "top 5" sections share the same layout: heading, then label/count.
    top_sections = (
        ("\n=== Top 5 Narratives ===", stats['narrative_counts']),
        ("\n=== Top 5 Subnarratives ===", stats['subnarrative_counts']),
    )
    for heading, counter in top_sections:
        print(heading)
        for label, freq in counter.most_common(5):
            print(f"{label}: {freq}")

    print("\n=== Article Statistics ===")
    mean_len = article_stats['mean_length']
    max_len = article_stats['max_length']
    min_len = article_stats['min_length']
    print(f"Average length: {mean_len:.2f} words")
    print(f"Max length: {max_len} words")
    print(f"Min length: {min_len} words")