Bofandra committed
Commit 11133cd · verified · 1 Parent(s): 9f8903a

Update app.py

Files changed (1)
  1. app.py +33 -18
app.py CHANGED
@@ -9,7 +9,7 @@ from sentence_transformers import SentenceTransformer
 from huggingface_hub import InferenceClient, HfApi
 
 # Hugging Face Space persistence
-HF_REPO_ID = "MoslemBot/kajiweb" # e.g., "username/your-space-name"
+HF_REPO_ID = "MoslemBot/kajiweb"
 HF_API_TOKEN = os.getenv("HF_TOKEN")
 api = HfApi()
 
@@ -34,7 +34,7 @@ def extract_links_and_text(base_url, max_depth=1, visited=None):
     if visited is None:
         visited = set()
     if base_url in visited or max_depth < 0:
-        return ""
+        return []
 
     visited.add(base_url)
     print(f"🔗 Crawling: {base_url}")
@@ -43,6 +43,7 @@ def extract_links_and_text(base_url, max_depth=1, visited=None):
         response.raise_for_status()
         soup = BeautifulSoup(response.text, 'html.parser')
         page_text = ' '.join([p.get_text() for p in soup.find_all(['p', 'h1', 'h2', 'h3'])])
+        result = [(page_text, base_url)] if page_text.strip() else []
 
         links = set()
         for a in soup.find_all("a", href=True):
@@ -52,11 +53,11 @@ def extract_links_and_text(base_url, max_depth=1, visited=None):
                 links.add(full_url)
 
         for link in links:
-            page_text += "\n" + extract_links_and_text(link, max_depth=max_depth-1, visited=visited)
-        return page_text
+            result.extend(extract_links_and_text(link, max_depth=max_depth-1, visited=visited))
+        return result
     except Exception as e:
         print(f"❌ Failed to fetch {base_url}: {e}")
-        return ""
+        return []
 
 # Save webpage content and index it
 def save_webpage(url, title):
@@ -67,13 +68,19 @@ def save_webpage(url, title):
     os.makedirs(folder, exist_ok=True)
 
     # Extract text from webpage and its linked pages
-    full_text = extract_links_and_text(url, max_depth=1)
+    page_data = extract_links_and_text(url, max_depth=1)
 
-    if not full_text.strip():
+    if not page_data:
         return "❌ No text extracted from the webpage."
 
     # Chunk text
-    chunks = [full_text[i:i+500] for i in range(0, len(full_text), 500)]
+    chunks = []
+    sources = []
+    for text, source_url in page_data:
+        for i in range(0, len(text), 500):
+            chunk = text[i:i+500]
+            chunks.append(chunk)
+            sources.append(source_url)
 
     # Embed and index
     embeddings = embedder.encode(chunks)
@@ -85,16 +92,16 @@ def save_webpage(url, title):
     index = faiss.IndexFlatL2(embeddings.shape[1])
     index.add(embeddings)
 
-    # Save index and chunks locally
+    # Save index and metadata locally
     index_path = os.path.join(folder, "index.faiss")
-    chunks_path = os.path.join(folder, "chunks.pkl")
+    meta_path = os.path.join(folder, "meta.pkl")
     faiss.write_index(index, index_path)
-    with open(chunks_path, "wb") as f:
-        pickle.dump(chunks, f)
+    with open(meta_path, "wb") as f:
+        pickle.dump(list(zip(chunks, sources)), f)
 
     # Upload to hub
     upload_to_hub(index_path, f"data/{title}/index.faiss")
-    upload_to_hub(chunks_path, f"data/{title}/chunks.pkl")
+    upload_to_hub(meta_path, f"data/{title}/meta.pkl")
 
     return f"✅ Saved and indexed '{title}', and uploaded to Hub. Please reload (refresh) the page."
 
@@ -113,24 +120,32 @@ def ask_question(message, history, selected_titles):
         folder = os.path.join(DATA_DIR, title)
         try:
             index = faiss.read_index(os.path.join(folder, "index.faiss"))
-            with open(os.path.join(folder, "chunks.pkl"), "rb") as f:
-                chunks = pickle.load(f)
+            with open(os.path.join(folder, "meta.pkl"), "rb") as f:
+                chunk_data = pickle.load(f)  # List of (chunk, url)
+
+            chunks = [cd[0] for cd in chunk_data]
+            urls = [cd[1] for cd in chunk_data]
 
             q_embed = embedder.encode([message])
            D, I = index.search(q_embed, k=3)
-            context = "\n".join([chunks[i] for i in I[0]])
+
+            response_context = ""
+            sources_set = set()
+            for idx in I[0]:
+                response_context += f"[{urls[idx]}]\n{chunks[idx]}\n\n"
+                sources_set.add(urls[idx])
 
             response = llm.chat_completion(
                 messages=[
                     {"role": "system", "content": "You are a helpful assistant. Answer based only on the given context."},
-                    {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {message}"}
+                    {"role": "user", "content": f"Context:\n{response_context}\n\nQuestion: {message}"}
                 ],
                 model="deepseek-ai/DeepSeek-R1-0528",
                 max_tokens=2048,
             )
 
             response = response.choices[0].message["content"]
-            combined_answer += f"**{title}**:\n{response.strip()}\n\n"
+            combined_answer += f"**{title}** (sources: {', '.join(sources_set)}):\n{response.strip()}\n\n"
         except Exception as e:
             combined_answer += f"⚠️ Error with {title}: {str(e)}\n\n"
 
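For quick sanity-checking after this change, the sketch below loads one saved title's `index.faiss` together with the new `meta.pkl` (a pickled list of `(chunk, source_url)` tuples) and runs a single query the same way the updated `ask_question` does. The `DATA_DIR` value, the example title, the query string, and the SentenceTransformer model name are assumptions for illustration, not taken from this commit.

```python
# Minimal verification sketch; values marked "assumption" are not from the commit.
import os
import pickle

import faiss
from sentence_transformers import SentenceTransformer

DATA_DIR = "data"        # assumption: the folder save_webpage writes into
title = "example-title"  # assumption: a title previously saved by save_webpage
folder = os.path.join(DATA_DIR, title)

index = faiss.read_index(os.path.join(folder, "index.faiss"))
with open(os.path.join(folder, "meta.pkl"), "rb") as f:
    chunk_data = pickle.load(f)  # list of (chunk, source_url) tuples

# The FAISS index and the metadata list must stay aligned one-to-one.
assert index.ntotal == len(chunk_data)

embedder = SentenceTransformer("all-MiniLM-L6-v2")  # assumption: whichever model app.py uses
q_embed = embedder.encode(["What is this page about?"])
D, I = index.search(q_embed, k=3)

for idx in I[0]:
    chunk, url = chunk_data[idx]
    print(f"[{url}] {chunk[:120]}")
```

If the index ever holds more vectors than `meta.pkl` has entries, indexing into `chunk_data` will raise an IndexError at query time, so the alignment check above is worth keeping.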
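Titles indexed before this commit only have a `chunks.pkl` (a plain list of strings) on disk, so the updated `ask_question` will land in its `⚠️ Error with {title}` branch for them until a `meta.pkl` exists. Below is a hypothetical migration sketch under that assumption; `migrate_title`, `fallback_url`, and the `DATA_DIR` value are illustrative names, not part of app.py, and the fallback URL is a placeholder because old chunks carry no recorded source.

```python
# Hypothetical migration helper: convert an old chunks.pkl (list of str)
# into the new meta.pkl format (list of (chunk, url) tuples).
import os
import pickle

def migrate_title(folder, fallback_url="unknown"):
    old_path = os.path.join(folder, "chunks.pkl")
    new_path = os.path.join(folder, "meta.pkl")
    if not os.path.isfile(old_path) or os.path.isfile(new_path):
        return False  # nothing to migrate, or already migrated
    with open(old_path, "rb") as f:
        chunks = pickle.load(f)
    with open(new_path, "wb") as f:
        # No source URL was recorded before this commit, so use a placeholder.
        pickle.dump([(chunk, fallback_url) for chunk in chunks], f)
    return True

DATA_DIR = "data"  # assumption: same base folder app.py uses
for title in os.listdir(DATA_DIR):
    if migrate_title(os.path.join(DATA_DIR, title)):
        print(f"Migrated {title}")
```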