"""Summarise an annotations file and the raw articles it refers to."""

import os
from collections import Counter

import pandas as pd


def load_annotations(file_path) -> pd.DataFrame:
    """Load and parse the annotations file.

    The file is read as tab-separated text and its columns are named
    ``article_id``, ``narratives`` and ``subnarratives``.

    Args:
        file_path: Path (or file-like object) accepted by ``pd.read_csv``.

    Returns:
        A DataFrame with the three named columns.
    """
    return pd.read_csv(
        file_path,
        sep='\t',
        names=['article_id', 'narratives', 'subnarratives'],
    )


def _split_labels(column) -> list:
    """Flatten a column of ';'-separated label strings into one list."""
    # Missing cells (NaN/None) are dropped first: iterating the result of
    # ``.str.split`` on them raises TypeError, and ``.str`` itself raises
    # AttributeError when the whole column is empty/float.
    return [
        label
        for cell in column.dropna()
        for label in str(cell).split(';')
    ]


def analyze_distribution(df) -> dict:
    """Analyze distribution of narratives and subnarratives.

    Args:
        df: DataFrame with ``article_id``, ``narratives`` and
            ``subnarratives`` columns. Label cells hold one or more labels
            separated by ``;``; missing cells are ignored.

    Returns:
        A dict with three entries:
        ``'domains'`` - a Series counting articles per domain, where an
        article is tagged ``'CC'`` if its id contains the substring ``'CC'``
        and ``'UA'`` otherwise;
        ``'narrative_counts'`` / ``'subnarrative_counts'`` - ``Counter``
        objects mapping each label to its number of occurrences.
    """
    # Count domains
    domains = df['article_id'].apply(
        lambda x: 'CC' if 'CC' in x else 'UA'
    ).value_counts()

    # Split multiple narratives/subnarratives and count frequencies
    narrative_counts = Counter(_split_labels(df['narratives']))
    subnarrative_counts = Counter(_split_labels(df['subnarratives']))

    return {
        'domains': domains,
        'narrative_counts': narrative_counts,
        'subnarrative_counts': subnarrative_counts,
    }


def get_article_stats(raw_dir) -> dict:
    """Get statistics about article lengths.

    Every regular file directly inside ``raw_dir`` is read as UTF-8 text and
    its length is measured in whitespace-separated words.

    Args:
        raw_dir: Directory containing the article files.

    Returns:
        A dict with ``'mean_length'``, ``'max_length'`` and ``'min_length'``.
        When the directory holds no regular files all three values are zero
        (instead of failing with a division by zero).
    """
    lengths = []
    for filename in os.listdir(raw_dir):
        path = os.path.join(raw_dir, filename)
        # Sub-directories and other non-file entries cannot be opened as text.
        if not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='utf-8') as f:
            lengths.append(len(f.read().split()))

    if not lengths:
        return {'mean_length': 0.0, 'max_length': 0, 'min_length': 0}

    return {
        'mean_length': sum(lengths) / len(lengths),
        'max_length': max(lengths),
        'min_length': min(lengths),
    }


def main():
    """Load the annotations and raw articles and print summary statistics."""
    # Adjust these paths according to your structure
    annotations_file = "../../data/subtask-2-annotations.txt"
    raw_dir = "../../data/raw"

    # Load and analyze
    df = load_annotations(annotations_file)
    stats = analyze_distribution(df)
    article_stats = get_article_stats(raw_dir)

    # Print results
    print("\n=== Domain Distribution ===")
    print(stats['domains'])

    print("\n=== Top 5 Narratives ===")
    for narr, count in stats['narrative_counts'].most_common(5):
        print(f"{narr}: {count}")

    print("\n=== Top 5 Subnarratives ===")
    for sub, count in stats['subnarrative_counts'].most_common(5):
        print(f"{sub}: {count}")

    print("\n=== Article Statistics ===")
    print(f"Average length: {article_stats['mean_length']:.2f} words")
    print(f"Max length: {article_stats['max_length']} words")
    print(f"Min length: {article_stats['min_length']} words")


if __name__ == "__main__":
    main()