Spaces:

MoslemBot
/

KajiWeb

Running

App Files Community

Bofandra committed on Jul 13

Commit

11133cd

verified ·

1 Parent(s): 9f8903a

Update app.py

Browse files

Files changed (1) hide show

app.py +33 -18

app.py CHANGED Viewed

@@ -9,7 +9,7 @@ from sentence_transformers import SentenceTransformer
 from huggingface_hub import InferenceClient, HfApi
 # Hugging Face Space persistence
-HF_REPO_ID = "MoslemBot/kajiweb"  # e.g., "username/your-space-name"
 HF_API_TOKEN = os.getenv("HF_TOKEN")
 api = HfApi()
@@ -34,7 +34,7 @@ def extract_links_and_text(base_url, max_depth=1, visited=None):
     if visited is None:
         visited = set()
     if base_url in visited or max_depth < 0:
-        return ""
     visited.add(base_url)
     print(f"🔗 Crawling: {base_url}")
@@ -43,6 +43,7 @@ def extract_links_and_text(base_url, max_depth=1, visited=None):
         response.raise_for_status()
         soup = BeautifulSoup(response.text, 'html.parser')
         page_text = ' '.join([p.get_text() for p in soup.find_all(['p', 'h1', 'h2', 'h3'])])
         links = set()
         for a in soup.find_all("a", href=True):
@@ -52,11 +53,11 @@ def extract_links_and_text(base_url, max_depth=1, visited=None):
                 links.add(full_url)
         for link in links:
-            page_text += "\n" + extract_links_and_text(link, max_depth=max_depth-1, visited=visited)
-        return page_text
     except Exception as e:
         print(f"❌ Failed to fetch {base_url}: {e}")
-        return ""
 # Save webpage content and index it
 def save_webpage(url, title):
@@ -67,13 +68,19 @@ def save_webpage(url, title):
     os.makedirs(folder, exist_ok=True)
     # Extract text from webpage and its linked pages
-    full_text = extract_links_and_text(url, max_depth=1)
-    if not full_text.strip():
         return "❌ No text extracted from the webpage."
     # Chunk text
-    chunks = [full_text[i:i+500] for i in range(0, len(full_text), 500)]
     # Embed and index
     embeddings = embedder.encode(chunks)
@@ -85,16 +92,16 @@ def save_webpage(url, title):
     index = faiss.IndexFlatL2(embeddings.shape[1])
     index.add(embeddings)
-    # Save index and chunks locally
     index_path = os.path.join(folder, "index.faiss")
-    chunks_path = os.path.join(folder, "chunks.pkl")
     faiss.write_index(index, index_path)
-    with open(chunks_path, "wb") as f:
-        pickle.dump(chunks, f)
     # Upload to hub
     upload_to_hub(index_path, f"data/{title}/index.faiss")
-    upload_to_hub(chunks_path, f"data/{title}/chunks.pkl")
     return f"✅ Saved and indexed '{title}', and uploaded to Hub. Please reload (refresh) the page."
@@ -113,24 +120,32 @@ def ask_question(message, history, selected_titles):
         folder = os.path.join(DATA_DIR, title)
         try:
             index = faiss.read_index(os.path.join(folder, "index.faiss"))
-            with open(os.path.join(folder, "chunks.pkl"), "rb") as f:
-                chunks = pickle.load(f)
             q_embed = embedder.encode([message])
             D, I = index.search(q_embed, k=3)
-            context = "\n".join([chunks[i] for i in I[0]])
             response = llm.chat_completion(
                 messages=[
                     {"role": "system", "content": "You are a helpful assistant. Answer based only on the given context."},
-                    {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {message}"}
                 ],
                 model="deepseek-ai/DeepSeek-R1-0528",
                 max_tokens=2048,
             )
             response = response.choices[0].message["content"]
-            combined_answer += f"**{title}**:\n{response.strip()}\n\n"
         except Exception as e:
             combined_answer += f"⚠️ Error with {title}: {str(e)}\n\n"

 from huggingface_hub import InferenceClient, HfApi
 # Hugging Face Space persistence
+HF_REPO_ID = "MoslemBot/kajiweb"
 HF_API_TOKEN = os.getenv("HF_TOKEN")
 api = HfApi()
     if visited is None:
         visited = set()
     if base_url in visited or max_depth < 0:
+        return []
     visited.add(base_url)
     print(f"🔗 Crawling: {base_url}")
         response.raise_for_status()
         soup = BeautifulSoup(response.text, 'html.parser')
         page_text = ' '.join([p.get_text() for p in soup.find_all(['p', 'h1', 'h2', 'h3'])])
+        result = [(page_text, base_url)] if page_text.strip() else []
         links = set()
         for a in soup.find_all("a", href=True):
                 links.add(full_url)
         for link in links:
+            result.extend(extract_links_and_text(link, max_depth=max_depth-1, visited=visited))
+        return result
     except Exception as e:
         print(f"❌ Failed to fetch {base_url}: {e}")
+        return []
 # Save webpage content and index it
 def save_webpage(url, title):
     os.makedirs(folder, exist_ok=True)
     # Extract text from webpage and its linked pages
+    page_data = extract_links_and_text(url, max_depth=1)
+    if not page_data:
         return "❌ No text extracted from the webpage."
     # Chunk text
+    chunks = []
+    sources = []
+    for text, source_url in page_data:
+        for i in range(0, len(text), 500):
+            chunk = text[i:i+500]
+            chunks.append(chunk)
+            sources.append(source_url)
     # Embed and index
     embeddings = embedder.encode(chunks)
     index = faiss.IndexFlatL2(embeddings.shape[1])
     index.add(embeddings)
+    # Save index and metadata locally
     index_path = os.path.join(folder, "index.faiss")
+    meta_path = os.path.join(folder, "meta.pkl")
     faiss.write_index(index, index_path)
+    with open(meta_path, "wb") as f:
+        pickle.dump(list(zip(chunks, sources)), f)
     # Upload to hub
     upload_to_hub(index_path, f"data/{title}/index.faiss")
+    upload_to_hub(meta_path, f"data/{title}/meta.pkl")
     return f"✅ Saved and indexed '{title}', and uploaded to Hub. Please reload (refresh) the page."
         folder = os.path.join(DATA_DIR, title)
         try:
             index = faiss.read_index(os.path.join(folder, "index.faiss"))
+            with open(os.path.join(folder, "meta.pkl"), "rb") as f:
+                chunk_data = pickle.load(f)  # List of (chunk, url)
+            chunks = [cd[0] for cd in chunk_data]
+            urls = [cd[1] for cd in chunk_data]
             q_embed = embedder.encode([message])
             D, I = index.search(q_embed, k=3)
+            response_context = ""
+            sources_set = set()
+            for idx in I[0]:
+                response_context += f"[{urls[idx]}]\n{chunks[idx]}\n\n"
+                sources_set.add(urls[idx])
             response = llm.chat_completion(
                 messages=[
                     {"role": "system", "content": "You are a helpful assistant. Answer based only on the given context."},
+                    {"role": "user", "content": f"Context:\n{response_context}\n\nQuestion: {message}"}
                 ],
                 model="deepseek-ai/DeepSeek-R1-0528",
                 max_tokens=2048,
             )
             response = response.choices[0].message["content"]
+            combined_answer += f"**{title}** (sources: {', '.join(sources_set)}):\n{response.strip()}\n\n"
         except Exception as e:
             combined_answer += f"⚠️ Error with {title}: {str(e)}\n\n"