Update app.py
app.py CHANGED
@@ -10,9 +10,6 @@ import faiss
 import gradio as gr
 from sklearn.metrics.pairwise import cosine_similarity
 from huggingface_hub import hf_hub_download, login
-from pyspark.sql import SparkSession
-from pyspark.sql.functions import col, udf, monotonically_increasing_id, collect_list, concat_ws
-from pyspark.sql.types import StringType
 from huggingface_hub import HfApi
 
 # Load token from Hugging Face Secrets
@@ -34,12 +31,15 @@ def load_reddit_split(subreddit_name):
 # Combine subreddit data
 combined_dataset = chain(*(load_reddit_split(sub) for sub in target_subreddits))
 
-
-
+import pandas as pd
+import re
+from itertools import islice
+
+# Load a sample of the dataset (e.g., 100,000 records for performance)
+comments = [{"body": ex["body"]} for ex in islice(combined_dataset, 100000)]
 
-#
-
-df = spark.createDataFrame([{"body": ex["body"]} for ex in islice(combined_dataset, 100000)])
+# Convert to DataFrame
+df = pd.DataFrame(comments)
 
 # Clean text function
 def clean_body(text):
@@ -48,17 +48,17 @@ def clean_body(text):
     text = re.sub(r"[^a-zA-Z\s]", "", text)
     return re.sub(r"\s+", " ", text).strip()
 
-
-
+# Apply cleaning
+df["clean"] = df["body"].apply(clean_body)
 
-#
+# Chunk every 5 rows
 chunk_size = 5
-
-
-df_chunked
+df["chunk_id"] = df.index // chunk_size
+df_chunked = df.groupby("chunk_id")["clean"].apply(lambda texts: " ".join(texts)).reset_index()
+df_chunked.rename(columns={"clean": "chunk_text"}, inplace=True)
 
-#
-chunked_comments = df_chunked
+# Final list for embedding
+chunked_comments = df_chunked["chunk_text"].tolist()
 
 # Create subreddit labels
 combined_dataset = chain(*(load_reddit_split(sub) for sub in target_subreddits))
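For reference, here is a minimal, self-contained sketch of the pandas pipeline the added lines build. The toy comments list is a hypothetical stand-in for the 100,000-record sample streamed with islice, and clean_body is condensed to the two regex steps visible in the diff; the chunking logic mirrors the changed code.

import re
import pandas as pd

def clean_body(text):
    # Keep letters and whitespace only, then collapse runs of whitespace
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    return re.sub(r"\s+", " ", text).strip()

# Hypothetical stand-in for the streamed Reddit sample
comments = [{"body": f"Example comment #{i}!"} for i in range(12)]
df = pd.DataFrame(comments)

# Apply cleaning
df["clean"] = df["body"].apply(clean_body)

# Chunk every 5 rows: integer-divide the row index to get a chunk id,
# then join the cleaned texts within each chunk into one string
chunk_size = 5
df["chunk_id"] = df.index // chunk_size
df_chunked = df.groupby("chunk_id")["clean"].apply(lambda texts: " ".join(texts)).reset_index()
df_chunked.rename(columns={"clean": "chunk_text"}, inplace=True)

# Final list for embedding: 12 toy rows -> 3 chunks of up to 5 comments each
chunked_comments = df_chunked["chunk_text"].tolist()
print(chunked_comments)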