File size: 2,375 Bytes
fb2cd67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import pandas as pd
from collections import Counter
import os

def load_annotations(file_path):
    """Read the tab-separated annotations file into a DataFrame.

    The file is read without a header row; its columns are labelled
    ``article_id``, ``narratives`` and ``subnarratives`` in that order.
    """
    column_names = ['article_id', 'narratives', 'subnarratives']
    return pd.read_csv(file_path, names=column_names, sep='\t')

def _count_labels(column):
    """Count the ';'-separated labels found in a column of label strings."""
    labels = (
        label.strip()
        # Missing cells (NaN/None) carry no labels; iterating them would
        # otherwise raise a TypeError.
        for cell in column.dropna().astype(str)
        for label in cell.split(';')
    )
    # Drop empty tokens produced by trailing or doubled separators.
    return Counter(label for label in labels if label)


def analyze_distribution(df):
    """Analyze distribution of domains, narratives and subnarratives.

    Args:
        df: DataFrame with ``article_id``, ``narratives`` and
            ``subnarratives`` columns. The two label columns hold
            ';'-separated label strings; missing cells are ignored.

    Returns:
        A dict with three entries:
            ``domains``: value counts of the per-article domain, where an
                article is 'CC' if its id contains 'CC' and 'UA' otherwise.
            ``narrative_counts``: Counter of individual narrative labels.
            ``subnarrative_counts``: Counter of individual subnarrative labels.

        Labels are stripped of surrounding whitespace before counting, so
        "A;B" and "A; B" contribute to the same keys.
    """
    # Count domains
    domains = df['article_id'].apply(lambda x: 'CC' if 'CC' in x else 'UA').value_counts()

    return {
        'domains': domains,
        'narrative_counts': _count_labels(df['narratives']),
        'subnarrative_counts': _count_labels(df['subnarratives'])
    }

def get_article_stats(raw_dir):
    """Get statistics about article lengths.

    Every regular file directly inside ``raw_dir`` is read as UTF-8 text and
    its length is measured in whitespace-separated words. Subdirectories and
    other non-file entries are skipped.

    Args:
        raw_dir: Path of the directory containing the article files.

    Returns:
        A dict with ``mean_length``, ``max_length`` and ``min_length`` (in
        words). When the directory contains no files, all three are zero.
    """
    lengths = []
    for filename in os.listdir(raw_dir):
        path = os.path.join(raw_dir, filename)
        # os.listdir also yields subdirectories, which cannot be open()ed.
        if not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='utf-8') as f:
            lengths.append(len(f.read().split()))

    # Without this guard an empty directory raises ZeroDivisionError below.
    if not lengths:
        return {
            'mean_length': 0.0,
            'max_length': 0,
            'min_length': 0
        }

    return {
        'mean_length': sum(lengths) / len(lengths),
        'max_length': max(lengths),
        'min_length': min(lengths)
    }

if __name__ == "__main__":
    # Input locations; edit these to match your checkout layout.
    annotations_file = "../../data/subtask-2-annotations.txt"
    raw_dir = "../../data/raw"

    # Compute everything first, then report.
    distribution = analyze_distribution(load_annotations(annotations_file))
    length_stats = get_article_stats(raw_dir)

    print("\n=== Domain Distribution ===")
    print(distribution['domains'])

    # Both label rankings share one layout: a heading, then the five most
    # frequent labels with their counts.
    top_sections = (
        ("\n=== Top 5 Narratives ===", 'narrative_counts'),
        ("\n=== Top 5 Subnarratives ===", 'subnarrative_counts'),
    )
    for heading, key in top_sections:
        print(heading)
        for label, count in distribution[key].most_common(5):
            print(f"{label}: {count}")

    print("\n=== Article Statistics ===")
    mean_words = length_stats['mean_length']
    max_words = length_stats['max_length']
    min_words = length_stats['min_length']
    print(f"Average length: {mean_words:.2f} words")
    print(f"Max length: {max_words} words")
    print(f"Min length: {min_words} words")