GenAIDevTOProd committed on
Commit
96f3209
·
verified ·
1 Parent(s): 324c358

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -1
app.py CHANGED
@@ -33,7 +33,14 @@ combined_dataset = chain(*(stream_subreddit_data(sub) for sub in target_subreddi
33
  comments = list(islice(combined_dataset, 100000))
34
 
35
  # Extract text and subreddit
36
- df = pd.DataFrame([{"body": ex["body"], "subreddit": ex["subreddit"]} for ex in comments])
 
 
 
 
 
 
 
37
 
38
  # Clean text function
39
  def clean_body(text):
 
33
  comments = list(islice(combined_dataset, 100000))
34
 
35
  # Extract text and subreddit
36
+ comments = []
37
+
38
+ for sub in target_subreddits:
39
+ stream = load_dataset("HuggingFaceGECLM/REDDIT_comments", split=sub, streaming=True)
40
+ comments.extend({"body": ex["body"], "subreddit": sub} for ex in islice(stream, 20000)) # ~100k total
41
+
42
+ # Convert to DataFrame
43
+ df = pd.DataFrame(comments)
44
 
45
  # Clean text function
46
  def clean_body(text):