Shreyas094 committed on
Commit
898ed76
·
verified ·
1 Parent(s): ab3adb5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -10
app.py CHANGED
@@ -224,19 +224,31 @@ def google_news_search(term, num_results=5, lang="en", timeout=5, safe="active",
224
  return all_results
225
 
226
  def summarize_webpage(url, content, query, instructions, max_chars=1000):
227
- # Preprocess the content
228
- preprocessed_text = preprocess_web_content(content, query.split())
229
-
 
 
 
 
 
 
 
 
 
 
 
 
230
  # Format a prompt for this specific webpage
231
  webpage_prompt = f"""
232
  Instructions: {instructions}
233
  Query: {query}
234
  URL: {url}
235
 
236
- Webpage content:
237
- {preprocessed_text}
238
 
239
- Summarize the above content in relation to the query. Focus on relevant information and include any specific data or facts mentioned. Keep the summary concise, ideally under 200 words.
240
 
241
  Summary:
242
  """
@@ -248,11 +260,14 @@ def summarize_webpage(url, content, query, instructions, max_chars=1000):
248
  if summary and len(summary) > max_chars:
249
  summary = summary[:max_chars] + "..."
250
 
251
- return summary
252
 
253
  def preprocess_text(text):
 
 
 
254
  # Remove HTML tags
255
- text = BeautifulSoup(text, "html.parser").get_text()
256
 
257
  # Remove URLs
258
  text = re.sub(r'http\S+|www.\S+', '', text)
@@ -489,8 +504,12 @@ def scrape_and_display(query, num_results, instructions, web_search=True, use_ne
489
  # Summarize each result
490
  summarized_results = []
491
  for result in search_results:
492
- summary = summarize_webpage(result['link'], result['text'], query, instructions)
493
- summarized_results.append({"link": result['link'], "text": summary})
 
 
 
 
494
 
495
  formatted_prompt = format_prompt(query, summarized_results, instructions)
496
  generated_summary = generate_text(formatted_prompt, temperature=temperature, repetition_penalty=repetition_penalty, top_p=top_p)
 
224
  return all_results
225
 
226
  def summarize_webpage(url, content, query, instructions, max_chars=1000):
227
+ if content is None:
228
+ return f"Unable to fetch or process content from {url}"
229
+
230
+ # Extract keywords from the query
231
+ keywords = query.split()
232
+
233
+ # Apply full preprocessing pipeline
234
+ preprocessed_text = preprocess_text(content)
235
+ preprocessed_text = remove_boilerplate(preprocessed_text)
236
+ filtered_text = keyword_filter(preprocessed_text, keywords)
237
+ summarized_text = summarize_text(filtered_text, num_sentences=5) # Adjust num_sentences as needed
238
+
239
+ if not summarized_text:
240
+ return f"No relevant content found for the query in {url}"
241
+
242
  # Format a prompt for this specific webpage
243
  webpage_prompt = f"""
244
  Instructions: {instructions}
245
  Query: {query}
246
  URL: {url}
247
 
248
+ Filtered and summarized webpage content:
249
+ {summarized_text}
250
 
251
+ Based on the above filtered and summarized content, provide a concise summary that's directly relevant to the query. Focus on specific data, facts, or insights mentioned. Keep the summary under 200 words.
252
 
253
  Summary:
254
  """
 
260
  if summary and len(summary) > max_chars:
261
  summary = summary[:max_chars] + "..."
262
 
263
+ return summary if summary else f"Unable to generate summary for {url}"
264
 
265
  def preprocess_text(text):
266
+ if text is None:
267
+ return "" # Return an empty string if input is None
268
+
269
  # Remove HTML tags
270
+ text = BeautifulSoup(str(text), "html.parser").get_text()
271
 
272
  # Remove URLs
273
  text = re.sub(r'http\S+|www.\S+', '', text)
 
504
  # Summarize each result
505
  summarized_results = []
506
  for result in search_results:
507
+ try:
508
+ summary = summarize_webpage(result['link'], result.get('text'), query, instructions)
509
+ summarized_results.append({"link": result['link'], "text": summary})
510
+ except Exception as e:
511
+ print(f"Error summarizing {result['link']}: {e}")
512
+ summarized_results.append({"link": result['link'], "text": f"Error summarizing content: {str(e)}"})
513
 
514
  formatted_prompt = format_prompt(query, summarized_results, instructions)
515
  generated_summary = generate_text(formatted_prompt, temperature=temperature, repetition_penalty=repetition_penalty, top_p=top_p)