from pathlib import Path
from typing import Dict, Optional, Tuple
import random

def load_hindi_dataset(base_path: str = "data", split: str = "train", num_files: Optional[int] = None) -> str:
    """
    Load Hindi text from a dataset with a train/validation split structure.

    Expects the nested layout base_path/<split>/<split>/*.txt, where each
    filename is that file's word count (e.g. 1500.txt).

    Args:
        base_path: Base directory containing the train and valid folders
        split: Either 'train' or 'valid'
        num_files: Number of files to randomly sample (None for all files)

    Returns:
        The contents of all loaded files joined by blank lines.
    """
    base_dir = Path(base_path)
    split_dir = base_dir / split / split
    
    if not split_dir.exists():
        raise FileNotFoundError(f"Directory not found: {split_dir}")
    
    print(f"\nLoading Hindi dataset from {split_dir}")
    
    # Get all txt files in the directory
    txt_files = list(split_dir.glob("*.txt"))
    
    if not txt_files:
        raise FileNotFoundError(f"No txt files found in {split_dir}")
    
    # Sort files numerically by word count (each filename is a word count)
    txt_files.sort(key=lambda x: int(x.stem))
    
    # Randomly sample a subset of files if num_files is specified
    if num_files is not None and num_files < len(txt_files):
        txt_files = random.sample(txt_files, num_files)
    
    print(f"Found {len(txt_files)} files")
    
    # Load and combine text from files
    texts = []
    total_chars = 0
    total_words = 0
    
    for idx, file_path in enumerate(txt_files, 1):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read().strip()
                word_count = int(file_path.stem)  # Filename is word count
                texts.append(text)
                total_chars += len(text)
                total_words += word_count
                
                if idx % 10 == 0:
                    print(f"Processed {idx}/{len(txt_files)} files. "
                          f"Total characters: {total_chars:,}, "
                          f"Total words: {total_words:,}")
                
        except Exception as e:
            print(f"Error reading file {file_path}: {e}")
            continue
    
    combined_text = "\n\n".join(texts)
    
    print("\nDataset loading completed:")
    print(f"Total files: {len(texts)}")
    print(f"Total characters: {len(combined_text):,}")
    print(f"Total words: {total_words:,}")
    if texts:  # guard against division by zero if every file failed to read
        print(f"Average words per file: {total_words/len(texts):,.1f}")
    
    return combined_text
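
# Example usage (a sketch; assumes the nested data/train/train layout with
# files named by their word count, e.g. 1500.txt):
#
#     random.seed(42)  # makes the num_files sampling reproducible
#     text = load_hindi_dataset("data", split="train", num_files=10)
#     print(f"{len(text):,} characters loaded")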

def get_dataset_stats(base_path: str = "data") -> Dict:
    """Get per-split file and word-count statistics for the dataset."""
    stats = {}
    for split in ['train', 'valid']:
        # Match the nested <split>/<split> layout used by load_hindi_dataset
        split_dir = Path(base_path) / split / split
        if split_dir.exists():
            txt_files = list(split_dir.glob("*.txt"))
            word_counts = [int(f.stem) for f in txt_files]
            stats[split] = {
                'num_files': len(txt_files),
                'total_words': sum(word_counts),
                'min_words': min(word_counts) if word_counts else 0,
                'max_words': max(word_counts) if word_counts else 0,
                'avg_words': sum(word_counts) / len(word_counts) if word_counts else 0
            }
    return stats
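
# The returned mapping looks like this (numbers are illustrative only):
#
#     {'train': {'num_files': 100, 'total_words': 250000,
#                'min_words': 500, 'max_words': 9000, 'avg_words': 2500.0},
#      'valid': {'num_files': 10, ...}}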

def load_train_valid_split(base_path: str = "data",
                           train_files: Optional[int] = None,
                           valid_files: Optional[int] = None) -> Tuple[str, str]:
    """Load both train and validation splits."""
    train_text = load_hindi_dataset(base_path, "train", train_files)
    valid_text = load_hindi_dataset(base_path, "valid", valid_files)
    return train_text, valid_text

if __name__ == "__main__":
    # Print dataset statistics
    stats = get_dataset_stats()
    print("\nDataset Statistics:")
    print("-" * 50)
    for split, split_stats in stats.items():
        print(f"\n{split.upper()} Split:")
        for key, value in split_stats.items():
            if isinstance(value, (int, float)):
                print(f"{key}: {value:,}")
            else:
                print(f"{key}: {value}")
    
    # Load sample data
    print("\nLoading sample data...")
    train_text, valid_text = load_train_valid_split(train_files=5, valid_files=2)
    print(f"\nSample train text (first 200 chars):\n{train_text[:200]}")
    print(f"\nSample valid text (first 200 chars):\n{valid_text[:200]}")