Spaces:

aayushraina
/

bye-hindi

Sleeping

File size: 6,471 Bytes

46759b2

import kagglehub
from pathlib import Path
import shutil
import pandas as pd
import re
import nltk
from typing import List, Dict
from tqdm import tqdm

def count_hindi_words(text: str) -> int:
    """Return how many whitespace-separated tokens contain Devanagari.

    A token is counted when at least one of its characters lies in the
    Unicode block U+0900-U+097F; it may also contain other characters
    (digits, punctuation, Latin letters) and is still counted once.
    """
    devanagari = re.compile(r'[\u0900-\u097F]')
    return sum(1 for token in text.split() if devanagari.search(token))

def create_dataframe_from_files(downloaded_paths: List[str]) -> pd.DataFrame:
    """Create a DataFrame from downloaded text files.

    Args:
        downloaded_paths: Paths of the files to load. Entries that do not end
            in ``.txt`` are ignored. A single path (``str`` or ``Path``) is
            accepted as well: a directory is searched recursively for
            ``.txt`` files, anything else is treated as one file. Without
            this, a bare string would be iterated character by character and
            silently produce an empty result.

    Returns:
        A DataFrame with one row per readable file and the columns ``title``
        (first line of the file), ``text`` (everything after the first line)
        and ``word_count`` (``count_hindi_words`` applied to the whole file).
        The columns are present even when no file could be read.
    """
    print("\nCreating DataFrame from text files...")

    # Normalise a single path into a list of file paths.
    if isinstance(downloaded_paths, (str, Path)):
        root = Path(downloaded_paths)
        if root.is_dir():
            downloaded_paths = sorted(str(p) for p in root.rglob("*.txt"))
        else:
            downloaded_paths = [str(root)]

    data = []
    for file_path in tqdm(downloaded_paths):
        # Path objects have no .endswith(); work on the string form.
        file_path = str(file_path)
        if not file_path.endswith('.txt'):
            continue

        # Only the read itself can fail; keep the try body that small.
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read().strip()
        except (OSError, ValueError) as e:
            # ValueError covers UnicodeDecodeError for non-UTF-8 files.
            print(f"Error reading file {file_path}: {e}")
            continue

        # Split into title and text (assuming first line is title)
        title, _, text = content.partition('\n')
        data.append({
            'title': title.strip(),
            'text': text.strip(),
            'word_count': count_hindi_words(content)
        })

    # Explicit columns keep the schema stable when ``data`` is empty.
    df = pd.DataFrame(data, columns=['title', 'text', 'word_count'])
    print(f"Created DataFrame with {len(df)} articles")
    return df

def process_and_split_articles(df: pd.DataFrame, 
                             output_dir: Path,
                             train_ratio: float = 0.8,
                             min_words: int = 100,
                             max_words: int = 5000) -> Dict[str, int]:
    """Process articles and split them into files based on word count.

    Each article whose ``word_count`` lies in ``[min_words, max_words]`` is
    written to ``output_dir/train`` with probability ``train_ratio`` and to
    ``output_dir/valid`` otherwise. Files are named ``<word_count>.txt``;
    when that name is taken, ``<word_count>_<n>.txt`` with the first free
    ``n >= 1`` is used.

    Args:
        df: DataFrame with ``title``, ``text`` and ``word_count`` columns.
        output_dir: Directory under which ``train`` and ``valid`` are created.
        train_ratio: Probability of assigning an article to the train split.
        min_words: Articles with fewer words are skipped.
        max_words: Articles with more words are skipped.

    Returns:
        Counts of written and skipped articles under the keys ``train``,
        ``valid`` and ``skipped``. Articles that raise while being processed
        are reported and counted as skipped.
    """
    # ``pd.np`` was removed in pandas 2.0; use numpy directly. The import is
    # local because the module itself does not import numpy.
    import numpy as np

    # Create output directories (including output_dir itself if missing)
    train_dir = output_dir / "train"
    valid_dir = output_dir / "valid"
    train_dir.mkdir(parents=True, exist_ok=True)
    valid_dir.mkdir(parents=True, exist_ok=True)

    stats = {'train': 0, 'valid': 0, 'skipped': 0}

    print("\nProcessing articles...")
    for _, row in tqdm(df.iterrows(), total=len(df)):
        try:
            # Normalise to a plain int so file names are always "<int>.txt",
            # even if the column was loaded as float.
            word_count = int(row['word_count'])

            # Skip if too short or too long
            if word_count < min_words or word_count > max_words:
                stats['skipped'] += 1
                continue

            # Combine title and text
            full_text = f"{row['title']}\n\n{row['text']}"

            # Decide split (train or valid)
            is_train = np.random.random() < train_ratio
            # Separate name so the ``output_dir`` argument is not overwritten.
            split_dir = train_dir if is_train else valid_dir

            # Save to file named by word count; add a suffix on collision
            file_path = split_dir / f"{word_count}.txt"
            suffix = 1
            while file_path.exists():
                file_path = split_dir / f"{word_count}_{suffix}.txt"
                suffix += 1

            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(full_text)

            if is_train:
                stats['train'] += 1
            else:
                stats['valid'] += 1

        except Exception as e:
            # Best effort: one bad article must not abort the whole run.
            print(f"Error processing article: {e}")
            stats['skipped'] += 1
            continue

    return stats

def download_hindi_wikipedia_dataset():
    """Download and process Hindi Wikipedia dataset.

    Downloads the Kaggle dataset, loads every ``.txt`` file found under the
    download directory into a DataFrame, saves it as
    ``data/articles.parquet``, writes the train/valid split under ``data/``
    and prints summary statistics.

    Returns:
        True on success. False when no text files were found or when any
        step raised; the error is printed rather than propagated.
    """
    print("Starting dataset download...")

    try:
        # Download the dataset using kagglehub. The call returns the local
        # directory that holds the dataset, not a list of file paths.
        dataset_path = kagglehub.dataset_download(
            "disisbig/hindi-wikipedia-articles-172k"
        )

        print("Dataset downloaded successfully!")
        print("Downloaded files:", dataset_path)

        # Collect the article files; sorted for a reproducible order.
        text_files = sorted(str(p) for p in Path(dataset_path).rglob("*.txt"))
        if not text_files:
            print(f"Error: No text files found in {dataset_path}")
            return False

        # Create data directory
        data_dir = Path("data")
        data_dir.mkdir(exist_ok=True)

        # Create DataFrame from downloaded files
        df = create_dataframe_from_files(text_files)

        # Save DataFrame for future use
        df.to_parquet(data_dir / "articles.parquet")
        print(f"Saved DataFrame to {data_dir / 'articles.parquet'}")

        # Process and split the articles
        stats = process_and_split_articles(df, data_dir)

        # Print statistics
        print("\nProcessing completed:")
        print(f"Train files: {stats['train']}")
        print(f"Validation files: {stats['valid']}")
        print(f"Skipped articles: {stats['skipped']}")

        # Get file sizes
        train_size = sum(f.stat().st_size for f in (data_dir / "train").glob("*.txt"))
        valid_size = sum(f.stat().st_size for f in (data_dir / "valid").glob("*.txt"))

        print("\nTotal size:")
        print(f"Train: {train_size / (1024*1024):.2f} MB")
        print(f"Validation: {valid_size / (1024*1024):.2f} MB")

        return True

    except Exception as e:
        # Top-level boundary of the script: report and signal failure.
        print(f"Error downloading/processing dataset: {e}")
        return False

def verify_dataset_structure():
    """Verify the dataset directory structure and files.

    Checks, relative to the current working directory, that ``data/`` exists
    and that ``data/train`` and ``data/valid`` each contain at least one
    ``.txt`` file, printing a short summary per split. When
    ``data/articles.parquet`` exists, its article count is printed as well.

    Returns:
        True when both split directories exist and are non-empty; False
        otherwise, after printing the reason.
    """
    data_dir = Path("data")

    if not data_dir.exists():
        print("Error: Data directory not found!")
        return False

    # Check if we have the processed DataFrame
    parquet_file = data_dir / "articles.parquet"
    if parquet_file.exists():
        df = pd.read_parquet(parquet_file)
        print("\nArticles DataFrame:")
        print(f"Total articles: {len(df)}")

    for split in ['train', 'valid']:
        split_dir = data_dir / split
        if not split_dir.exists():
            print(f"Error: {split} directory not found!")
            return False

        txt_files = list(split_dir.glob("*.txt"))
        if not txt_files:
            print(f"Error: No text files found in {split} directory!")
            return False

        print(f"\n{split.upper()} split:")
        print(f"Number of files: {len(txt_files)}")

        # File names are "<word_count>.txt" or "<word_count>_<n>.txt".
        # Ignore any other .txt file instead of crashing on int().
        word_counts = []
        for txt_file in txt_files:
            try:
                word_counts.append(int(txt_file.stem.split('_')[0]))
            except ValueError:
                continue

        if word_counts:
            print(f"Word count range: {min(word_counts)} - {max(word_counts)}")
        else:
            print("Word count range: unavailable (no files named by word count)")

    return True

if __name__ == "__main__":
    # Fetch and split the corpus; inspect the result only if that succeeded.
    if download_hindi_wikipedia_dataset():
        print("\nVerifying dataset structure...")
        verify_dataset_structure()