Shreyas094 committed on
Commit
9933931
·
verified ·
1 Parent(s): bdf60b8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -27
app.py CHANGED
@@ -197,8 +197,10 @@ def google_search(term, num_results=20, lang="en", timeout=5, safe="active", ssl
197
  print(f"Found {len(result_block)} results on this page")
198
  for result in result_block:
199
  link = result.find("a", href=True)
200
- if link:
 
201
  link = link["href"]
 
202
  print(f"Processing link: {link}")
203
  try:
204
  webpage = session.get(link, headers=headers, timeout=timeout)
@@ -206,20 +208,21 @@ def google_search(term, num_results=20, lang="en", timeout=5, safe="active", ssl
206
  visible_text = extract_text_from_webpage(webpage.text)
207
  if len(visible_text) > max_chars_per_page:
208
  visible_text = visible_text[:max_chars_per_page] + "..."
209
- all_results.append({"link": link, "text": visible_text})
210
  print(f"Successfully extracted text from {link}")
211
  except requests.exceptions.RequestException as e:
212
  print(f"Error retrieving webpage content: {e}")
213
- all_results.append({"link": link, "text": None})
214
  else:
215
- print("No link found for this result")
216
- all_results.append({"link": None, "text": None})
217
  start += len(result_block)
218
 
219
  print(f"Search completed. Total results: {len(all_results)}")
220
  print("Search results:")
221
  for i, result in enumerate(all_results, 1):
222
  print(f"Result {i}:")
 
223
  print(f" Link: {result['link']}")
224
  if result['text']:
225
  print(f" Text: {result['text'][:100]}...") # Print first 100 characters
@@ -229,11 +232,14 @@ def google_search(term, num_results=20, lang="en", timeout=5, safe="active", ssl
229
 
230
  if not all_results:
231
  print("No search results found. Returning a default message.")
232
- return [{"link": None, "text": "No information found in the web search results."}]
233
 
234
  return all_results
235
 
236
  def summarize_content(content, model):
 
 
 
237
  # Approximate the token limit using character count
238
  # Assuming an average of 4 characters per token
239
  max_chars = 7000 * 4 # Leave some room for the prompt
@@ -282,32 +288,38 @@ def ask_question(question, temperature, top_p, repetition_penalty, web_search):
282
 
283
  if web_search:
284
  search_results = google_search(question)
285
- model = get_model(temperature, top_p, repetition_penalty)
286
 
287
- summaries = []
288
- for result in search_results:
289
- try:
290
- summary = summarize_content(result["text"], model)
291
- summaries.append(summary)
292
- except Exception as e:
293
- print(f"Error summarizing content: {str(e)}")
294
- summaries.append("Error: Unable to summarize this content.")
 
 
 
 
 
 
 
295
 
296
- # Combine summaries, ensuring we don't exceed the token limit
297
- combined_summaries = ""
298
- for summary in summaries:
299
- if len((combined_summaries + summary).split()) > 7000:
300
- break
301
- combined_summaries += summary + "\n\n"
302
-
303
- context_str = combined_summaries
304
- titles = [result["title"] for result in search_results]
305
  ranks = rank_search_results(titles, summaries, model)
306
 
307
- update_vector_db_with_search_results(search_results, summaries, ranks)
 
 
308
 
309
- context_str = "\n".join([f"Title: {result['title']}\nSummary: {summary}\nRank: {rank}"
310
- for result, summary, rank in zip(search_results, summaries, ranks)])
 
311
 
312
  prompt_template = """
313
  Answer the question based on the following web search results:
 
197
  print(f"Found {len(result_block)} results on this page")
198
  for result in result_block:
199
  link = result.find("a", href=True)
200
+ title = result.find("h3")
201
+ if link and title:
202
  link = link["href"]
203
+ title = title.get_text()
204
  print(f"Processing link: {link}")
205
  try:
206
  webpage = session.get(link, headers=headers, timeout=timeout)
 
208
  visible_text = extract_text_from_webpage(webpage.text)
209
  if len(visible_text) > max_chars_per_page:
210
  visible_text = visible_text[:max_chars_per_page] + "..."
211
+ all_results.append({"link": link, "title": title, "text": visible_text})
212
  print(f"Successfully extracted text from {link}")
213
  except requests.exceptions.RequestException as e:
214
  print(f"Error retrieving webpage content: {e}")
215
+ all_results.append({"link": link, "title": title, "text": None})
216
  else:
217
+ print("No link or title found for this result")
218
+ all_results.append({"link": None, "title": None, "text": None})
219
  start += len(result_block)
220
 
221
  print(f"Search completed. Total results: {len(all_results)}")
222
  print("Search results:")
223
  for i, result in enumerate(all_results, 1):
224
  print(f"Result {i}:")
225
+ print(f" Title: {result['title']}")
226
  print(f" Link: {result['link']}")
227
  if result['text']:
228
  print(f" Text: {result['text'][:100]}...") # Print first 100 characters
 
232
 
233
  if not all_results:
234
  print("No search results found. Returning a default message.")
235
+ return [{"link": None, "title": "No Results", "text": "No information found in the web search results."}]
236
 
237
  return all_results
238
 
239
  def summarize_content(content, model):
240
+ if content is None:
241
+ return "No content available to summarize."
242
+
243
  # Approximate the token limit using character count
244
  # Assuming an average of 4 characters per token
245
  max_chars = 7000 * 4 # Leave some room for the prompt
 
288
 
289
  if web_search:
290
  search_results = google_search(question)
 
291
 
292
+ processed_results = []
293
+ for index, result in enumerate(search_results, start=1):
294
+ if result["text"] is not None:
295
+ try:
296
+ summary = summarize_content(result["text"], model)
297
+ processed_results.append({
298
+ "title": result.get("title", f"Result {index}"),
299
+ "content": result["text"],
300
+ "summary": summary,
301
+ "index": index
302
+ })
303
+ except Exception as e:
304
+ print(f"Error processing search result {index}: {str(e)}")
305
+ else:
306
+ print(f"Skipping result {index} due to None content")
307
 
308
+ if not processed_results:
309
+ return "No valid search results found."
310
+
311
+ # Rank the results
312
+ titles = [r["title"] for r in processed_results]
313
+ summaries = [r["summary"] for r in processed_results]
 
 
 
314
  ranks = rank_search_results(titles, summaries, model)
315
 
316
+ # Update Vector DB
317
+ current_date = datetime.now().strftime("%Y-%m-%d")
318
+ update_vector_db_with_search_results(processed_results, ranks, current_date)
319
 
320
+ # Prepare context for the question
321
+ context_str = "\n\n".join([f"Title: {r['title']}\nSummary: {r['summary']}\nRank: {ranks[i]}"
322
+ for i, r in enumerate(processed_results)])
323
 
324
  prompt_template = """
325
  Answer the question based on the following web search results: