Update app.py
app.py CHANGED
```diff
@@ -18,17 +18,22 @@ loading and combining all the iterables together.
 from huggingface_hub import hf_hub_url, cached_download
 import json
 
+from huggingface_hub import hf_hub_url, cached_download
+import json
+from itertools import chain
+
 target_subreddits = ["askscience", "gaming", "technology", "todayilearned", "programming"]
 
-
-
-
-
-
+def load_reddit_split(subreddit_name):
+    """Stream Reddit comments from a specific subreddit split"""
+    file_url = hf_hub_url(repo_id="HuggingFaceGECLM/REDDIT_comments", filename=f"{subreddit_name}.jsonl")
+    file_path = cached_download(file_url)
+    with open(file_path, "r") as f:
+        for line in f:
+            yield json.loads(line)
 
-# Combine into one iterable
-
-combined_dataset = chain(*datasets)
+# Combine all selected subreddit streams into one iterable
+combined_dataset = chain(*(load_reddit_split(sub) for sub in target_subreddits))
 
 """# Chunking Logic
 - Group Reddit comments into small textual chunks to create a unit of meaning for embedding.
```