GenAIDevTOProd committed
Commit 2e057a8 (verified) · Parent: 23f0dfd

Update app.py

Files changed (1)
  1. app.py +16 -16
app.py CHANGED
@@ -10,9 +10,6 @@ import faiss
 import gradio as gr
 from sklearn.metrics.pairwise import cosine_similarity
 from huggingface_hub import hf_hub_download, login
-from pyspark.sql import SparkSession
-from pyspark.sql.functions import col, udf, monotonically_increasing_id, collect_list, concat_ws
-from pyspark.sql.types import StringType
 from huggingface_hub import HfApi
 
 # Load token from Hugging Face Secrets
@@ -34,12 +31,15 @@ def load_reddit_split(subreddit_name):
 # Combine subreddit data
 combined_dataset = chain(*(load_reddit_split(sub) for sub in target_subreddits))
 
-if "JAVA_HOME" not in os.environ:
-    os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"  # Common path on HF Spaces/Debian
+import pandas as pd
+import re
+from itertools import islice
+
+# Load a sample of the dataset (e.g., 100,000 records for performance)
+comments = [{"body": ex["body"]} for ex in islice(combined_dataset, 100000)]
 
-# PySpark session
-spark = SparkSession.builder.getOrCreate()
-df = spark.createDataFrame([{"body": ex["body"]} for ex in islice(combined_dataset, 100000)])
+# Convert to DataFrame
+df = pd.DataFrame(comments)
 
 # Clean text function
 def clean_body(text):
@@ -48,17 +48,17 @@ def clean_body(text):
     text = re.sub(r"[^a-zA-Z\s]", "", text)
     return re.sub(r"\s+", " ", text).strip()
 
-clean_udf = udf(clean_body, StringType())
-df_clean = df.withColumn("clean", clean_udf(col("body")))
+# Apply cleaning
+df["clean"] = df["body"].apply(clean_body)
 
-# Chunking
+# Chunk every 5 rows
 chunk_size = 5
-df_indexed = df_clean.withColumn("row_num", monotonically_increasing_id())
-df_indexed = df_indexed.withColumn("chunk_id", (col("row_num") / chunk_size).cast("int"))
-df_chunked = df_indexed.groupBy("chunk_id").agg(concat_ws(" ", collect_list("clean")).alias("chunk_text"))
+df["chunk_id"] = df.index // chunk_size
+df_chunked = df.groupby("chunk_id")["clean"].apply(lambda texts: " ".join(texts)).reset_index()
+df_chunked.rename(columns={"clean": "chunk_text"}, inplace=True)
 
-# Collect for embedding
-chunked_comments = df_chunked.select("chunk_text").rdd.map(lambda x: x[0]).collect()
+# Final list for embedding
+chunked_comments = df_chunked["chunk_text"].tolist()
 
 # Create subreddit labels
 combined_dataset = chain(*(load_reddit_split(sub) for sub in target_subreddits))
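For quick reference, below is a small self-contained sketch of the pandas chunking pipeline this commit switches to, run on a toy comments list instead of the streamed Reddit sample. clean_body, chunk_size, and the chunk_text column mirror the diff above; the sample data and the print at the end are invented purely for illustration.

# Minimal standalone sketch of the pandas pipeline introduced in this commit.
# The `comments` list is a toy stand-in for the Reddit sample; clean_body,
# chunk_size, and chunk_text match the names used in the diff.
import re
import pandas as pd

def clean_body(text):
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    return re.sub(r"\s+", " ", text).strip()

comments = [{"body": f"comment #{i}!"} for i in range(12)]  # hypothetical sample rows

df = pd.DataFrame(comments)
df["clean"] = df["body"].apply(clean_body)

chunk_size = 5
df["chunk_id"] = df.index // chunk_size  # rows 0-4 -> chunk 0, rows 5-9 -> chunk 1, ...
df_chunked = (
    df.groupby("chunk_id")["clean"]
      .apply(" ".join)
      .reset_index()
      .rename(columns={"clean": "chunk_text"})
)
chunked_comments = df_chunked["chunk_text"].tolist()
print(len(chunked_comments))  # 3 chunks for 12 rows with chunk_size = 5

Because df keeps the default RangeIndex, df.index // chunk_size assigns every run of five consecutive comments to one chunk, filling the role that the monotonically_increasing_id()-based chunk_id played in the removed PySpark version, without requiring a JVM or Spark session on the Space.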