Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -197,8 +197,10 @@ def google_search(term, num_results=20, lang="en", timeout=5, safe="active", ssl
|
|
197 |
print(f"Found {len(result_block)} results on this page")
|
198 |
for result in result_block:
|
199 |
link = result.find("a", href=True)
|
200 |
-
|
|
|
201 |
link = link["href"]
|
|
|
202 |
print(f"Processing link: {link}")
|
203 |
try:
|
204 |
webpage = session.get(link, headers=headers, timeout=timeout)
|
@@ -206,20 +208,21 @@ def google_search(term, num_results=20, lang="en", timeout=5, safe="active", ssl
|
|
206 |
visible_text = extract_text_from_webpage(webpage.text)
|
207 |
if len(visible_text) > max_chars_per_page:
|
208 |
visible_text = visible_text[:max_chars_per_page] + "..."
|
209 |
-
all_results.append({"link": link, "text": visible_text})
|
210 |
print(f"Successfully extracted text from {link}")
|
211 |
except requests.exceptions.RequestException as e:
|
212 |
print(f"Error retrieving webpage content: {e}")
|
213 |
-
all_results.append({"link": link, "text": None})
|
214 |
else:
|
215 |
-
print("No link found for this result")
|
216 |
-
all_results.append({"link": None, "text": None})
|
217 |
start += len(result_block)
|
218 |
|
219 |
print(f"Search completed. Total results: {len(all_results)}")
|
220 |
print("Search results:")
|
221 |
for i, result in enumerate(all_results, 1):
|
222 |
print(f"Result {i}:")
|
|
|
223 |
print(f" Link: {result['link']}")
|
224 |
if result['text']:
|
225 |
print(f" Text: {result['text'][:100]}...") # Print first 100 characters
|
@@ -229,11 +232,14 @@ def google_search(term, num_results=20, lang="en", timeout=5, safe="active", ssl
|
|
229 |
|
230 |
if not all_results:
|
231 |
print("No search results found. Returning a default message.")
|
232 |
-
return [{"link": None, "text": "No information found in the web search results."}]
|
233 |
|
234 |
return all_results
|
235 |
|
236 |
def summarize_content(content, model):
|
|
|
|
|
|
|
237 |
# Approximate the token limit using character count
|
238 |
# Assuming an average of 4 characters per token
|
239 |
max_chars = 7000 * 4 # Leave some room for the prompt
|
@@ -282,32 +288,38 @@ def ask_question(question, temperature, top_p, repetition_penalty, web_search):
|
|
282 |
|
283 |
if web_search:
|
284 |
search_results = google_search(question)
|
285 |
-
model = get_model(temperature, top_p, repetition_penalty)
|
286 |
|
287 |
-
|
288 |
-
for result in search_results:
|
289 |
-
|
290 |
-
|
291 |
-
|
292 |
-
|
293 |
-
|
294 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
295 |
|
296 |
-
|
297 |
-
|
298 |
-
|
299 |
-
|
300 |
-
|
301 |
-
|
302 |
-
|
303 |
-
context_str = combined_summaries
|
304 |
-
titles = [result["title"] for result in search_results]
|
305 |
ranks = rank_search_results(titles, summaries, model)
|
306 |
|
307 |
-
|
|
|
|
|
308 |
|
309 |
-
|
310 |
-
|
|
|
311 |
|
312 |
prompt_template = """
|
313 |
Answer the question based on the following web search results:
|
|
|
197 |
print(f"Found {len(result_block)} results on this page")
|
198 |
for result in result_block:
|
199 |
link = result.find("a", href=True)
|
200 |
+
title = result.find("h3")
|
201 |
+
if link and title:
|
202 |
link = link["href"]
|
203 |
+
title = title.get_text()
|
204 |
print(f"Processing link: {link}")
|
205 |
try:
|
206 |
webpage = session.get(link, headers=headers, timeout=timeout)
|
|
|
208 |
visible_text = extract_text_from_webpage(webpage.text)
|
209 |
if len(visible_text) > max_chars_per_page:
|
210 |
visible_text = visible_text[:max_chars_per_page] + "..."
|
211 |
+
all_results.append({"link": link, "title": title, "text": visible_text})
|
212 |
print(f"Successfully extracted text from {link}")
|
213 |
except requests.exceptions.RequestException as e:
|
214 |
print(f"Error retrieving webpage content: {e}")
|
215 |
+
all_results.append({"link": link, "title": title, "text": None})
|
216 |
else:
|
217 |
+
print("No link or title found for this result")
|
218 |
+
all_results.append({"link": None, "title": None, "text": None})
|
219 |
start += len(result_block)
|
220 |
|
221 |
print(f"Search completed. Total results: {len(all_results)}")
|
222 |
print("Search results:")
|
223 |
for i, result in enumerate(all_results, 1):
|
224 |
print(f"Result {i}:")
|
225 |
+
print(f" Title: {result['title']}")
|
226 |
print(f" Link: {result['link']}")
|
227 |
if result['text']:
|
228 |
print(f" Text: {result['text'][:100]}...") # Print first 100 characters
|
|
|
232 |
|
233 |
if not all_results:
|
234 |
print("No search results found. Returning a default message.")
|
235 |
+
return [{"link": None, "title": "No Results", "text": "No information found in the web search results."}]
|
236 |
|
237 |
return all_results
|
238 |
|
239 |
def summarize_content(content, model):
|
240 |
+
if content is None:
|
241 |
+
return "No content available to summarize."
|
242 |
+
|
243 |
# Approximate the token limit using character count
|
244 |
# Assuming an average of 4 characters per token
|
245 |
max_chars = 7000 * 4 # Leave some room for the prompt
|
|
|
288 |
|
289 |
if web_search:
|
290 |
search_results = google_search(question)
|
|
|
291 |
|
292 |
+
processed_results = []
|
293 |
+
for index, result in enumerate(search_results, start=1):
|
294 |
+
if result["text"] is not None:
|
295 |
+
try:
|
296 |
+
summary = summarize_content(result["text"], model)
|
297 |
+
processed_results.append({
|
298 |
+
"title": result.get("title", f"Result {index}"),
|
299 |
+
"content": result["text"],
|
300 |
+
"summary": summary,
|
301 |
+
"index": index
|
302 |
+
})
|
303 |
+
except Exception as e:
|
304 |
+
print(f"Error processing search result {index}: {str(e)}")
|
305 |
+
else:
|
306 |
+
print(f"Skipping result {index} due to None content")
|
307 |
|
308 |
+
if not processed_results:
|
309 |
+
return "No valid search results found."
|
310 |
+
|
311 |
+
# Rank the results
|
312 |
+
titles = [r["title"] for r in processed_results]
|
313 |
+
summaries = [r["summary"] for r in processed_results]
|
|
|
|
|
|
|
314 |
ranks = rank_search_results(titles, summaries, model)
|
315 |
|
316 |
+
# Update Vector DB
|
317 |
+
current_date = datetime.now().strftime("%Y-%m-%d")
|
318 |
+
update_vector_db_with_search_results(processed_results, ranks, current_date)
|
319 |
|
320 |
+
# Prepare context for the question
|
321 |
+
context_str = "\n\n".join([f"Title: {r['title']}\nSummary: {r['summary']}\nRank: {ranks[i]}"
|
322 |
+
for i, r in enumerate(processed_results)])
|
323 |
|
324 |
prompt_template = """
|
325 |
Answer the question based on the following web search results:
|