Update app.py
Browse files
app.py
CHANGED
@@ -33,7 +33,14 @@ combined_dataset = chain(*(stream_subreddit_data(sub) for sub in target_subreddi
|
|
33 |
comments = list(islice(combined_dataset, 100000))
|
34 |
|
35 |
# Extract text and subreddit
|
36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
|
38 |
# Clean text function
|
39 |
def clean_body(text):
|
|
|
33 |
comments = list(islice(combined_dataset, 100000))
|
34 |
|
35 |
# Extract text and subreddit
|
36 |
+
comments = []
|
37 |
+
|
38 |
+
for sub in target_subreddits:
|
39 |
+
    stream = load_dataset("HuggingFaceGECLM/REDDIT_comments", split=sub, streaming=True)
|
40 |
+
    comments.extend({"body": ex["body"], "subreddit": sub} for ex in islice(stream, 20000)) # ~100k total
|
41 |
+
|
42 |
+
# Convert to DataFrame
|
43 |
+
df = pd.DataFrame(comments)
|
44 |
|
45 |
# Clean text function
|
46 |
def clean_body(text):
|