sumuks committed on
Commit
3a6b206
·
verified ·
1 Parent(s): 5c51375

Create data_loader.py

Browse files
Files changed (1) hide show
  1. data_loader.py +36 -0
data_loader.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Data loading module for HuggingFace datasets."""
2
+
3
+ from datasets import load_dataset
4
+ from functools import cache
5
+ from typing import Any
6
+ import logging
7
+
8
+ # Set up logging
9
+ logging.basicConfig(level=logging.INFO)
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
@cache
def load_sample_dataset() -> Any:
    """Load the essential-web sample dataset from the HuggingFace Hub.

    The function is wrapped in ``functools.cache``, so after the first
    successful call the same object is returned without calling
    ``load_dataset`` again. A call that raises is not cached, so the next
    call retries the load.

    Returns:
        The object returned by ``load_dataset`` for
        ``sumuks/essential-web-v1.0-sample-1M-with-cleaned-text``. It is
        indexed by split name; the ``train`` split is read here to report
        the sample count.

    Raises:
        Exception: Any error raised while loading the dataset or reading
            its ``train`` split is logged with its traceback and then
            re-raised unchanged.
    """
    try:
        logger.info("Loading dataset from HuggingFace...")
        dataset = load_dataset("sumuks/essential-web-v1.0-sample-1M-with-cleaned-text")
        # %-style arguments defer string formatting until the record is
        # actually emitted by a handler.
        logger.info(
            "Dataset loaded successfully with %d samples", len(dataset['train'])
        )
        return dataset
    except Exception as e:
        # logger.exception logs at ERROR level and appends the active
        # traceback, which a plain logger.error call would drop.
        logger.exception("Failed to load dataset: %s", e)
        raise
24
+
25
+
26
def get_dataset_size() -> int:
    """Return the number of rows in the ``train`` split of the sample dataset."""
    train_split = load_sample_dataset()['train']
    return len(train_split)
30
+
31
+
32
def get_sample(index: int) -> tuple[str, str]:
    """Return the original and cleaned text of one ``train`` split row.

    Args:
        index: Position of the row to read from the ``train`` split.

    Returns:
        A ``(text, cleaned_text)`` pair taken from the row's ``text`` and
        ``cleaned_text`` fields.
    """
    train_split = load_sample_dataset()['train']
    record = train_split[index]
    original_text = record['text']
    cleaned_text = record['cleaned_text']
    return original_text, cleaned_text