Spaces:

MoslemBot
/

KajiWeb

Running

App Files Community

Bofandra committed on Jul 13

Commit

e3f9c03

verified ·

1 Parent(s): 36a068b

Create app.py

Browse files

Files changed (1) hide show

app.py +154 -0

app.py ADDED Viewed

	@@ -0,0 +1,154 @@

+import os
+import gradio as gr
+import faiss
+import pickle
+import requests
+from bs4 import BeautifulSoup
+from urllib.parse import urljoin, urlparse
+from sentence_transformers import SentenceTransformer
+from huggingface_hub import InferenceClient, HfApi
+# Hugging Face Space persistence
+HF_REPO_ID = "MoslemBot/kajibuku"  # e.g., "username/your-space-name"
+HF_API_TOKEN = os.getenv("HF_TOKEN")
+api = HfApi()
+def upload_to_hub(local_path, remote_path):
+    api.upload_file(
+        path_or_fileobj=local_path,
+        path_in_repo=remote_path,
+        repo_id=HF_REPO_ID,
+        repo_type="space",
+        token=HF_API_TOKEN
+    )
+    print(f"✅ Uploaded to Hub: {remote_path}")
+# Initialize embedder and LLM client
+embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+llm = InferenceClient(token=os.getenv("HF_TOKEN"))
+DATA_DIR = "data"
+os.makedirs(DATA_DIR, exist_ok=True)
+def extract_links_and_text(base_url, max_depth=1, visited=None):
+    if visited is None:
+        visited = set()
+    if base_url in visited or max_depth < 0:
+        return ""
+    visited.add(base_url)
+    print(f"🔗 Crawling: {base_url}")
+    try:
+        response = requests.get(base_url, timeout=10)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.text, 'html.parser')
+        page_text = ' '.join([p.get_text() for p in soup.find_all(['p', 'h1', 'h2', 'h3'])])
+        links = set()
+        for a in soup.find_all("a", href=True):
+            href = a["href"]
+            full_url = urljoin(base_url, href)
+            if urlparse(full_url).netloc == urlparse(base_url).netloc:
+                links.add(full_url)
+        for link in links:
+            page_text += "\n" + extract_links_and_text(link, max_depth=max_depth-1, visited=visited)
+        return page_text
+    except Exception as e:
+        print(f"❌ Failed to fetch {base_url}: {e}")
+        return ""
+# Save webpage content and index it
+def save_webpage(url, title):
+    folder = os.path.join(DATA_DIR, title.strip())
+    if os.path.exists(folder):
+        return f"'{title}' already exists. Use a different title."
+    os.makedirs(folder, exist_ok=True)
+    # Extract text from webpage and its linked pages
+    full_text = extract_links_and_text(url, max_depth=1)
+    if not full_text.strip():
+        return "❌ No text extracted from the webpage."
+    # Chunk text
+    chunks = [full_text[i:i+500] for i in range(0, len(full_text), 500)]
+    # Embed and index
+    embeddings = embedder.encode(chunks)
+    print("Embeddings shape:", embeddings.shape)
+    if len(embeddings.shape) != 2:
+        raise ValueError(f"Expected 2D embeddings, got shape {embeddings.shape}")
+    index = faiss.IndexFlatL2(embeddings.shape[1])
+    index.add(embeddings)
+    # Save index and chunks locally
+    index_path = os.path.join(folder, "index.faiss")
+    chunks_path = os.path.join(folder, "chunks.pkl")
+    faiss.write_index(index, index_path)
+    with open(chunks_path, "wb") as f:
+        pickle.dump(chunks, f)
+    # Upload to hub
+    upload_to_hub(index_path, f"data/{title}/index.faiss")
+    upload_to_hub(chunks_path, f"data/{title}/chunks.pkl")
+    return f"✅ Saved and indexed '{title}', and uploaded to Hub. Please reload (refresh) the page."
+# Return all available webpage titles
+def list_titles():
+    print(f"Listing in: {DATA_DIR} → {os.listdir(DATA_DIR)}")
+    return [d for d in os.listdir(DATA_DIR) if os.path.isdir(os.path.join(DATA_DIR, d))]
+# Ask question using selected webpages as context
+def ask_question(message, history, selected_titles):
+    if not selected_titles:
+        return "❗ Please select at least one webpage."
+    combined_answer = ""
+    for title in selected_titles:
+        folder = os.path.join(DATA_DIR, title)
+        try:
+            index = faiss.read_index(os.path.join(folder, "index.faiss"))
+            with open(os.path.join(folder, "chunks.pkl"), "rb") as f:
+                chunks = pickle.load(f)
+            q_embed = embedder.encode([message])
+            D, I = index.search(q_embed, k=3)
+            context = "\n".join([chunks[i] for i in I[0]])
+            response = llm.chat_completion(
+                messages=[
+                    {"role": "system", "content": "You are a helpful assistant. Answer based only on the given context."},
+                    {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {message}"}
+                ],
+                model="deepseek-ai/DeepSeek-R1-0528",
+                max_tokens=2048,
+            )
+            response = response.choices[0].message["content"]
+            combined_answer += f"**{title}**:\n{response.strip()}\n\n"
+        except Exception as e:
+            combined_answer += f"⚠️ Error with {title}: {str(e)}\n\n"
+    return combined_answer.strip()
+# Gradio UI
+with gr.Blocks(css="body { background-color: white !important; }") as demo:
+    with gr.Tab("🌐 Index Web Page"):
+        url = gr.Textbox(label="Web Page URL")
+        title = gr.Textbox(label="Title for Web Page")
+        index_btn = gr.Button("Fetch and Index (with crawl)")
+        index_status = gr.Textbox(label="Status")
+        index_btn.click(fn=save_webpage, inputs=[url, title], outputs=index_status)
+    with gr.Tab("💬 Chat with Web Pages"):
+        page_selector = gr.CheckboxGroup(label="Select Indexed Pages", choices=list_titles())
+        refresh_btn = gr.Button("🔄 Refresh List")
+        refresh_btn.click(fn=list_titles, outputs=page_selector)
+        chat = gr.ChatInterface(fn=ask_question, additional_inputs=[page_selector])
+demo.launch()