GenAIDevTOProd committed on
Commit
11db2b7
·
verified ·
1 Parent(s): 0442d65

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -8
app.py CHANGED
@@ -18,17 +18,22 @@ loading and combining all the iterables together.
18
  from huggingface_hub import hf_hub_url, cached_download
19
  import json
20
 
 
 
 
 
21
  target_subreddits = ["askscience", "gaming", "technology", "todayilearned", "programming"]
22
 
23
- # Load and stream each subreddit split individually
24
- datasets = [
25
- load_dataset("HuggingFaceGECLM/REDDIT_comments", split=sub, streaming=True)
26
- for sub in target_subreddits
27
- ]
 
 
28
 
29
- # Combine into one iterable dataset
30
- from itertools import chain
31
- combined_dataset = chain(*datasets)
32
 
33
  """# Chunking Logic
34
  - Group Reddit comments into small textual chunks to create a unit of meaning for embedding.
 
18
  from huggingface_hub import hf_hub_url, cached_download
19
  import json
20
 
21
+ from huggingface_hub import hf_hub_url, cached_download
22
+ import json
23
+ from itertools import chain
24
+
25
  target_subreddits = ["askscience", "gaming", "technology", "todayilearned", "programming"]
26
 
27
def load_reddit_split(subreddit_name):
    """Stream Reddit comments for one subreddit split of HuggingFaceGECLM/REDDIT_comments.

    Downloads (with local caching) the split's JSONL file and yields one
    parsed comment per line, lazily, so the whole file is never held in
    memory at once.

    Args:
        subreddit_name: Name of the subreddit split, e.g. "askscience".

    Yields:
        dict: one JSON-decoded comment record per non-empty line.
    """
    # REDDIT_comments is a *dataset* repo; without repo_type="dataset" the
    # URL resolves against the model namespace and the download 404s.
    file_url = hf_hub_url(
        repo_id="HuggingFaceGECLM/REDDIT_comments",
        filename=f"{subreddit_name}.jsonl",
        repo_type="dataset",
    )
    # NOTE(review): cached_download is deprecated in recent huggingface_hub
    # releases (hf_hub_download is the replacement) — kept to match the
    # file's existing import; confirm the installed hub version supports it.
    file_path = cached_download(file_url)
    # JSONL is UTF-8 by convention; be explicit so the platform's default
    # encoding cannot corrupt the decode.
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            # Skip blank lines (e.g. a trailing newline), which would
            # otherwise crash json.loads.
            if line.strip():
                yield json.loads(line)
34
 
35
# Lazily concatenate the per-subreddit comment streams into a single
# iterable dataset covering every target subreddit.
combined_dataset = chain.from_iterable(
    load_reddit_split(sub) for sub in target_subreddits
)
 
37
 
38
  """# Chunking Logic
39
  - Group Reddit comments into small textual chunks to create a unit of meaning for embedding.