Spaces:
Runtime error
Runtime error
import pandas as pd | |
from collections import Counter | |
import os | |
def load_annotations(file_path):
    """Read the tab-separated annotations file into a DataFrame.

    The file is parsed without a header row: because explicit column
    names are supplied, the first line is treated as data.

    Args:
        file_path: Path (or file-like object) accepted by ``pd.read_csv``.

    Returns:
        A DataFrame with the columns ``article_id``, ``narratives`` and
        ``subnarratives``, in that order.
    """
    column_names = ['article_id', 'narratives', 'subnarratives']
    annotations = pd.read_csv(file_path, sep='\t', names=column_names)
    return annotations
def _count_labels(column):
    """Count ';'-separated labels across every non-missing cell of a column."""
    # Empty cells are NaN/None, which cannot be split or iterated, so they are
    # dropped first. The str cast keeps the .str accessor usable even when the
    # column held nothing but missing values (pandas then infers a float dtype).
    cells = column.dropna().astype(str).str.split(';')
    return Counter(label for labels in cells for label in labels)


def analyze_distribution(df):
    """Analyze the distribution of domains, narratives and subnarratives.

    Args:
        df: DataFrame with the columns ``article_id``, ``narratives`` and
            ``subnarratives``. The two label columns hold ';'-separated
            label strings; missing cells are ignored.

    Returns:
        A dict with three entries:

        * ``'domains'``: a Series of value counts in which an article is
          labelled ``'CC'`` when its id contains the substring ``'CC'``
          and ``'UA'`` otherwise.
        * ``'narrative_counts'``: a ``Counter`` of individual narratives.
        * ``'subnarrative_counts'``: a ``Counter`` of individual
          subnarratives.
    """
    # Every id that does not contain 'CC' is counted as 'UA'.
    domains = df['article_id'].apply(
        lambda article_id: 'CC' if 'CC' in article_id else 'UA'
    ).value_counts()

    return {
        'domains': domains,
        'narrative_counts': _count_labels(df['narratives']),
        'subnarrative_counts': _count_labels(df['subnarratives']),
    }
def get_article_stats(raw_dir):
    """Compute word-count statistics over the article files in a directory.

    Each regular file directly inside ``raw_dir`` is read as UTF-8 text and
    its length is measured as the number of whitespace-separated tokens.
    Sub-directories are skipped.

    Args:
        raw_dir: Path of the directory holding the raw article files.

    Returns:
        A dict with ``'mean_length'``, ``'max_length'`` and ``'min_length'``.
        When the directory contains no files, all three values are zero.

    Raises:
        FileNotFoundError: If ``raw_dir`` does not exist (from ``os.listdir``).
    """
    lengths = []
    for filename in os.listdir(raw_dir):
        path = os.path.join(raw_dir, filename)
        # os.listdir also returns sub-directories, which open() cannot read.
        if not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='utf-8') as f:
            lengths.append(len(f.read().split()))

    # Without this guard an empty directory would divide by zero below.
    if not lengths:
        return {'mean_length': 0.0, 'max_length': 0, 'min_length': 0}

    return {
        'mean_length': sum(lengths) / len(lengths),
        'max_length': max(lengths),
        'min_length': min(lengths),
    }
if __name__ == "__main__":
    # Input locations, relative to the working directory; change them to
    # match your own project layout.
    annotations_file = "../../data/subtask-2-annotations.txt"
    raw_dir = "../../data/raw"

    # Run every analysis up front so the report below only prints.
    df = load_annotations(annotations_file)
    stats = analyze_distribution(df)
    article_stats = get_article_stats(raw_dir)

    # --- Report: domains ---
    print("\n=== Domain Distribution ===")
    print(stats['domains'])

    # --- Report: most frequent labels ---
    top_narratives = stats['narrative_counts'].most_common(5)
    print("\n=== Top 5 Narratives ===")
    for narr, count in top_narratives:
        print(f"{narr}: {count}")

    top_subnarratives = stats['subnarrative_counts'].most_common(5)
    print("\n=== Top 5 Subnarratives ===")
    for sub, count in top_subnarratives:
        print(f"{sub}: {count}")

    # --- Report: article lengths (in whitespace-separated words) ---
    print("\n=== Article Statistics ===")
    print(f"Average length: {article_stats['mean_length']:.2f} words")
    print(f"Max length: {article_stats['max_length']} words")
    print(f"Min length: {article_stats['min_length']} words")