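"""Download and preprocess the Hindi Wikipedia articles dataset.

Fetches the Kaggle dataset "disisbig/hindi-wikipedia-articles-172k" via kagglehub,
builds a DataFrame of (title, text, word_count) records, saves it as Parquet, and
splits the articles into plain-text train/valid files under data/.
"""
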
import random
import re
from pathlib import Path
from typing import Dict, List

import kagglehub
import pandas as pd
from tqdm import tqdm


def count_hindi_words(text: str) -> int:
    """Count the whitespace-separated tokens in `text` that contain Hindi characters."""
    words = text.strip().split()
    # Keep only tokens containing a character from the Devanagari Unicode block (U+0900-U+097F)
    hindi_words = [w for w in words if re.search(r'[\u0900-\u097F]', w)]
    return len(hindi_words)
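
# Illustrative example (not executed): count_hindi_words("नमस्ते दुनिया hello") returns 2,
# since only the two Devanagari tokens are counted.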


def create_dataframe_from_files(downloaded_paths: List[str]) -> pd.DataFrame:
    """Create a DataFrame from downloaded text files."""
    print("\nCreating DataFrame from text files...")
    data = []
    for file_path in tqdm(downloaded_paths):
        if file_path.endswith('.txt'):
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read().strip()
                # Split into title and text (assuming the first line is the title)
                lines = content.split('\n', 1)
                title = lines[0].strip()
                text = lines[1].strip() if len(lines) > 1 else ""
                data.append({
                    'title': title,
                    'text': text,
                    'word_count': count_hindi_words(content)
                })
            except Exception as e:
                print(f"Error reading file {file_path}: {e}")
                continue
    df = pd.DataFrame(data)
    print(f"Created DataFrame with {len(df)} articles")
    return df


def process_and_split_articles(df: pd.DataFrame,
                               output_dir: Path,
                               train_ratio: float = 0.8,
                               min_words: int = 100,
                               max_words: int = 5000) -> Dict[str, int]:
    """Process articles and split them into files based on word count."""
    # Create output directories
    train_dir = output_dir / "train"
    valid_dir = output_dir / "valid"
    train_dir.mkdir(parents=True, exist_ok=True)
    valid_dir.mkdir(parents=True, exist_ok=True)
    stats = {'train': 0, 'valid': 0, 'skipped': 0}
    print("\nProcessing articles...")
    for _, row in tqdm(df.iterrows(), total=len(df)):
        try:
            # Skip articles that are too short or too long
            if row['word_count'] < min_words or row['word_count'] > max_words:
                stats['skipped'] += 1
                continue
            # Combine title and text
            full_text = f"{row['title']}\n\n{row['text']}"
            # Assign the article to the train or valid split at random
            is_train = random.random() < train_ratio
            target_dir = train_dir if is_train else valid_dir
            # Save to a file named by word count, appending a suffix on collisions
            file_path = target_dir / f"{row['word_count']}.txt"
            suffix = 1
            while file_path.exists():
                file_path = target_dir / f"{row['word_count']}_{suffix}.txt"
                suffix += 1
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(full_text)
            if is_train:
                stats['train'] += 1
            else:
                stats['valid'] += 1
        except Exception as e:
            print(f"Error processing article: {e}")
            stats['skipped'] += 1
            continue
    return stats
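
# Resulting layout (with the defaults above):
#   data/
#     articles.parquet
#     train/<word_count>[_<n>].txt   # ~80% of the kept articles
#     valid/<word_count>[_<n>].txt   # ~20% of the kept articles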


def download_hindi_wikipedia_dataset():
    """Download and process the Hindi Wikipedia dataset."""
    print("Starting dataset download...")
    try:
        # Download the dataset using kagglehub; it returns the path to the
        # local copy kept in kagglehub's cache directory
        dataset_path = kagglehub.dataset_download(
            "disisbig/hindi-wikipedia-articles-172k"
        )
        print("Dataset downloaded successfully!")
        print("Downloaded to:", dataset_path)
        # Collect the article text files from the downloaded dataset
        downloaded_paths = [str(p) for p in Path(dataset_path).rglob("*.txt")]
        # Create data directory
        data_dir = Path("data")
        data_dir.mkdir(exist_ok=True)
        # Create DataFrame from downloaded files
        df = create_dataframe_from_files(downloaded_paths)
        # Save DataFrame for future use
        df.to_parquet(data_dir / "articles.parquet")
        print(f"Saved DataFrame to {data_dir / 'articles.parquet'}")
        # Process and split the articles
        stats = process_and_split_articles(df, data_dir)
        # Print statistics
        print("\nProcessing completed:")
        print(f"Train files: {stats['train']}")
        print(f"Validation files: {stats['valid']}")
        print(f"Skipped articles: {stats['skipped']}")
        # Report the on-disk size of each split
        train_size = sum(f.stat().st_size for f in (data_dir / "train").glob("*.txt"))
        valid_size = sum(f.stat().st_size for f in (data_dir / "valid").glob("*.txt"))
        print("\nTotal size:")
        print(f"Train: {train_size / (1024 * 1024):.2f} MB")
        print(f"Validation: {valid_size / (1024 * 1024):.2f} MB")
        return True
    except Exception as e:
        print(f"Error downloading/processing dataset: {e}")
        return False


def verify_dataset_structure():
    """Verify the dataset directory structure and files."""
    data_dir = Path("data")
    if not data_dir.exists():
        print("Error: Data directory not found!")
        return False
    # Check if we have the processed DataFrame
    parquet_file = data_dir / "articles.parquet"
    if parquet_file.exists():
        df = pd.read_parquet(parquet_file)
        print("\nArticles DataFrame:")
        print(f"Total articles: {len(df)}")
        # print(f"Word count range: {df['word_count'].min()} - {df['word_count'].max()}")
    for split in ['train', 'valid']:
        split_dir = data_dir / split
        if not split_dir.exists():
            print(f"Error: {split} directory not found!")
            return False
        txt_files = list(split_dir.glob("*.txt"))
        if not txt_files:
            print(f"Error: No text files found in {split} directory!")
            return False
        print(f"\n{split.upper()} split:")
        print(f"Number of files: {len(txt_files)}")
        # File names encode word counts, e.g. "1234.txt" or "1234_2.txt"
        word_counts = [int(f.stem.split('_')[0]) for f in txt_files]
        print(f"Word count range: {min(word_counts)} - {max(word_counts)}")
    return True


if __name__ == "__main__":
    # Download and process the dataset
    success = download_hindi_wikipedia_dataset()
    if success:
        print("\nVerifying dataset structure...")
        verify_dataset_structure()
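
# Illustrative follow-up (e.g. in a separate session) to sanity-check the output;
# assumes the script above has already been run:
#   import pandas as pd
#   df = pd.read_parquet("data/articles.parquet")
#   print(df['word_count'].describe())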