Bofandra committed (verified)
Commit e3f9c03 · 1 Parent(s): 36a068b

Create app.py

Files changed (1)
  1. app.py +154 -0
app.py ADDED
@@ -0,0 +1,154 @@
+ import os
+ import gradio as gr
+ import faiss
+ import pickle
+ import requests
+ from bs4 import BeautifulSoup
+ from urllib.parse import urljoin, urlparse
+ from sentence_transformers import SentenceTransformer
+ from huggingface_hub import InferenceClient, HfApi
+
+ # Hugging Face Space persistence
+ HF_REPO_ID = "MoslemBot/kajibuku"  # e.g., "username/your-space-name"
+ HF_API_TOKEN = os.getenv("HF_TOKEN")
+ api = HfApi()
+
+ def upload_to_hub(local_path, remote_path):
+     api.upload_file(
+         path_or_fileobj=local_path,
+         path_in_repo=remote_path,
+         repo_id=HF_REPO_ID,
+         repo_type="space",
+         token=HF_API_TOKEN
+     )
+     print(f"✅ Uploaded to Hub: {remote_path}")
+
+ # Initialize embedder and LLM client
+ embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+ llm = InferenceClient(token=os.getenv("HF_TOKEN"))
+
+ DATA_DIR = "data"
+ os.makedirs(DATA_DIR, exist_ok=True)
+
+ # Recursively crawl same-domain links up to max_depth and return the concatenated page text
+ def extract_links_and_text(base_url, max_depth=1, visited=None):
+     if visited is None:
+         visited = set()
+     if base_url in visited or max_depth < 0:
+         return ""
+
+     visited.add(base_url)
+     print(f"🔗 Crawling: {base_url}")
+     try:
+         response = requests.get(base_url, timeout=10)
+         response.raise_for_status()
+         soup = BeautifulSoup(response.text, 'html.parser')
+         page_text = ' '.join([p.get_text() for p in soup.find_all(['p', 'h1', 'h2', 'h3'])])
+
+         links = set()
+         for a in soup.find_all("a", href=True):
+             href = a["href"]
+             full_url = urljoin(base_url, href)
+             if urlparse(full_url).netloc == urlparse(base_url).netloc:
+                 links.add(full_url)
+
+         for link in links:
+             page_text += "\n" + extract_links_and_text(link, max_depth=max_depth-1, visited=visited)
+         return page_text
+     except Exception as e:
+         print(f"❌ Failed to fetch {base_url}: {e}")
+         return ""
+
+ # Save webpage content and index it
+ def save_webpage(url, title):
+     title = title.strip()
+     if not title:
+         return "❗ Please provide a title."
+
+     folder = os.path.join(DATA_DIR, title)
+     if os.path.exists(folder):
+         return f"'{title}' already exists. Use a different title."
+
+     os.makedirs(folder, exist_ok=True)
+
+     # Extract text from webpage and its linked pages
+     full_text = extract_links_and_text(url, max_depth=1)
+
+     if not full_text.strip():
+         return "❌ No text extracted from the webpage."
+
+     # Chunk text into 500-character pieces
+     chunks = [full_text[i:i+500] for i in range(0, len(full_text), 500)]
+
+     # Embed and index
+     embeddings = embedder.encode(chunks)
+
+     print("Embeddings shape:", embeddings.shape)
+     if len(embeddings.shape) != 2:
+         raise ValueError(f"Expected 2D embeddings, got shape {embeddings.shape}")
+
+     index = faiss.IndexFlatL2(embeddings.shape[1])
+     index.add(embeddings)
+
+     # Save index and chunks locally
+     index_path = os.path.join(folder, "index.faiss")
+     chunks_path = os.path.join(folder, "chunks.pkl")
+     faiss.write_index(index, index_path)
+     with open(chunks_path, "wb") as f:
+         pickle.dump(chunks, f)
+
+     # Upload to hub (same normalized title as the local folder)
+     upload_to_hub(index_path, f"data/{title}/index.faiss")
+     upload_to_hub(chunks_path, f"data/{title}/chunks.pkl")
+
+     return f"✅ Saved and indexed '{title}', and uploaded to Hub. Please reload (refresh) the page."
+
+ # Return all available webpage titles
+ def list_titles():
+     print(f"Listing in: {DATA_DIR} → {os.listdir(DATA_DIR)}")
+     return [d for d in os.listdir(DATA_DIR) if os.path.isdir(os.path.join(DATA_DIR, d))]
+
+ # Ask question using selected webpages as context
+ def ask_question(message, history, selected_titles):
+     if not selected_titles:
+         return "❗ Please select at least one webpage."
+
+     combined_answer = ""
+     for title in selected_titles:
+         folder = os.path.join(DATA_DIR, title)
+         try:
+             index = faiss.read_index(os.path.join(folder, "index.faiss"))
+             with open(os.path.join(folder, "chunks.pkl"), "rb") as f:
+                 chunks = pickle.load(f)
+
+             # Retrieve the most relevant chunks for the question
+             q_embed = embedder.encode([message])
+             D, I = index.search(q_embed, k=min(3, len(chunks)))
+             context = "\n".join([chunks[i] for i in I[0]])
+
+             response = llm.chat_completion(
+                 messages=[
+                     {"role": "system", "content": "You are a helpful assistant. Answer based only on the given context."},
+                     {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {message}"}
+                 ],
+                 model="deepseek-ai/DeepSeek-R1-0528",
+                 max_tokens=2048,
+             )
+
+             answer = response.choices[0].message.content
+             combined_answer += f"**{title}**:\n{answer.strip()}\n\n"
+         except Exception as e:
+             combined_answer += f"⚠️ Error with {title}: {str(e)}\n\n"
+
+     return combined_answer.strip()
+
+ # Gradio UI
+ with gr.Blocks(css="body { background-color: white !important; }") as demo:
+     with gr.Tab("🌐 Index Web Page"):
+         url = gr.Textbox(label="Web Page URL")
+         title = gr.Textbox(label="Title for Web Page")
+         index_btn = gr.Button("Fetch and Index (with crawl)")
+         index_status = gr.Textbox(label="Status")
+         index_btn.click(fn=save_webpage, inputs=[url, title], outputs=index_status)
+
+     with gr.Tab("💬 Chat with Web Pages"):
+         page_selector = gr.CheckboxGroup(label="Select Indexed Pages", choices=list_titles())
+         refresh_btn = gr.Button("🔄 Refresh List")
+         # Return a component update so the button refreshes the available choices, not just the selected value
+         refresh_btn.click(fn=lambda: gr.update(choices=list_titles()), outputs=page_selector)
+         chat = gr.ChatInterface(fn=ask_question, additional_inputs=[page_selector])
+
+ demo.launch()
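
For reference, a minimal sketch of how the artifacts this commit writes (data/<title>/index.faiss and data/<title>/chunks.pkl) can be queried outside the Gradio UI, reusing the same MiniLM embedder; the query_saved_page helper, the example title, and k=3 are illustrative assumptions, not part of the commit.

import pickle

import faiss
from sentence_transformers import SentenceTransformer

# Same embedding model the app uses for indexing, so query vectors are comparable
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def query_saved_page(title, question, k=3):
    # Load the FAISS index and the pickled text chunks saved by save_webpage
    index = faiss.read_index(f"data/{title}/index.faiss")
    with open(f"data/{title}/chunks.pkl", "rb") as f:
        chunks = pickle.load(f)

    # Embed the question and return the k nearest chunks
    q_embed = embedder.encode([question])
    _, indices = index.search(q_embed, k=min(k, len(chunks)))
    return [chunks[i] for i in indices[0]]

# Hypothetical usage:
# print(query_saved_page("some-indexed-title", "What is this page about?"))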