Spaces:

eerrffuunn
/

newsemeval

Runtime error

newsemeval / scripts /analysis /data_stats.py

Mohammaderfan koupaei

Add application file

fb2cd67 4 months ago

2.38 kB

	import pandas as pd
	from collections import Counter
	import os

	def load_annotations(file_path):
	    """Read the tab-separated annotations file into a DataFrame.

	    Column names are supplied explicitly, so the first line of the file is
	    read as data rather than as a header.

	    Args:
	        file_path: Location of the annotations file.

	    Returns:
	        A pandas DataFrame whose columns are labelled ``article_id``,
	        ``narratives`` and ``subnarratives``.
	    """
	    column_names = ['article_id', 'narratives', 'subnarratives']
	    annotations = pd.read_csv(file_path, sep='\t', names=column_names)
	    return annotations

	def _split_labels(column):
	    """Flatten a column of ';'-separated label strings into one flat list."""
	    # Missing cells (NaN/None) carry no labels. Dropping them first avoids
	    # the "'float' object is not iterable" TypeError that iterating over a
	    # split NaN cell would raise.
	    return [label for cell in column.dropna() for label in cell.split(';')]


	def analyze_distribution(df):
	    """Analyze distribution of domains, narratives and subnarratives.

	    Args:
	        df: DataFrame with ``article_id``, ``narratives`` and
	            ``subnarratives`` columns. The two label columns hold
	            ';'-separated strings; missing cells are ignored.

	    Returns:
	        A dict with three entries:
	            ``domains``: a pandas Series counting articles per domain. An
	                article is 'CC' when its id contains the substring 'CC';
	                every other article is counted as 'UA'.
	            ``narrative_counts``: Counter of individual narrative labels.
	            ``subnarrative_counts``: Counter of individual subnarrative
	                labels.
	    """
	    # Count domains
	    domains = df['article_id'].apply(
	        lambda x: 'CC' if 'CC' in x else 'UA'
	    ).value_counts()

	    # Split multiple narratives/subnarratives and count frequencies
	    narrative_counts = Counter(_split_labels(df['narratives']))
	    subnarrative_counts = Counter(_split_labels(df['subnarratives']))

	    return {
	        'domains': domains,
	        'narrative_counts': narrative_counts,
	        'subnarrative_counts': subnarrative_counts,
	    }

	def get_article_stats(raw_dir):
	    """Get statistics about article lengths.

	    Every regular file directly inside ``raw_dir`` is read as UTF-8 text and
	    its length is measured in whitespace-separated words.

	    Args:
	        raw_dir: Directory containing the raw article files.

	    Returns:
	        A dict with ``mean_length``, ``max_length`` and ``min_length``. When
	        the directory contains no files, all three values are zero.
	    """
	    lengths = []
	    for filename in os.listdir(raw_dir):
	        path = os.path.join(raw_dir, filename)
	        # os.listdir also returns sub-directories; open() would fail on them.
	        if not os.path.isfile(path):
	            continue
	        with open(path, 'r', encoding='utf-8') as f:
	            lengths.append(len(f.read().split()))

	    # Guard the empty case: sum()/len() would divide by zero and max()/min()
	    # would raise ValueError on an empty list.
	    if not lengths:
	        return {'mean_length': 0.0, 'max_length': 0, 'min_length': 0}

	    return {
	        'mean_length': sum(lengths) / len(lengths),
	        'max_length': max(lengths),
	        'min_length': min(lengths),
	    }

	if __name__ == "__main__":
	    # Adjust these paths according to your structure.
	    # They are anchored to this script's own directory rather than the
	    # current working directory, so the script can be launched from anywhere.
	    script_dir = os.path.dirname(os.path.abspath(__file__))
	    data_dir = os.path.normpath(os.path.join(script_dir, "..", "..", "data"))
	    annotations_file = os.path.join(data_dir, "subtask-2-annotations.txt")
	    raw_dir = os.path.join(data_dir, "raw")

	    # Load and analyze
	    df = load_annotations(annotations_file)
	    stats = analyze_distribution(df)
	    article_stats = get_article_stats(raw_dir)

	    # Print results
	    print("\n=== Domain Distribution ===")
	    print(stats['domains'])

	    print("\n=== Top 5 Narratives ===")
	    for narr, count in stats['narrative_counts'].most_common(5):
	        print(f"{narr}: {count}")

	    print("\n=== Top 5 Subnarratives ===")
	    for sub, count in stats['subnarrative_counts'].most_common(5):
	        print(f"{sub}: {count}")

	    print("\n=== Article Statistics ===")
	    print(f"Average length: {article_stats['mean_length']:.2f} words")
	    print(f"Max length: {article_stats['max_length']} words")
	    print(f"Min length: {article_stats['min_length']} words")