Update app.py
app.py
CHANGED
@@ -224,19 +224,31 @@ def google_news_search(term, num_results=5, lang="en", timeout=5, safe="active",
     return all_results
 
 def summarize_webpage(url, content, query, instructions, max_chars=1000):
-
-
-
+    if content is None:
+        return f"Unable to fetch or process content from {url}"
+
+    # Extract keywords from the query
+    keywords = query.split()
+
+    # Apply full preprocessing pipeline
+    preprocessed_text = preprocess_text(content)
+    preprocessed_text = remove_boilerplate(preprocessed_text)
+    filtered_text = keyword_filter(preprocessed_text, keywords)
+    summarized_text = summarize_text(filtered_text, num_sentences=5)  # Adjust num_sentences as needed
+
+    if not summarized_text:
+        return f"No relevant content found for the query in {url}"
+
     # Format a prompt for this specific webpage
     webpage_prompt = f"""
     Instructions: {instructions}
     Query: {query}
     URL: {url}
 
-
-    {
+    Filtered and summarized webpage content:
+    {summarized_text}
 
-
+    Based on the above filtered and summarized content, provide a concise summary that's directly relevant to the query. Focus on specific data, facts, or insights mentioned. Keep the summary under 200 words.
 
     Summary:
     """
@@ -248,11 +260,14 @@ def summarize_webpage(url, content, query, instructions, max_chars=1000):
     if summary and len(summary) > max_chars:
         summary = summary[:max_chars] + "..."
 
-    return summary
+    return summary if summary else f"Unable to generate summary for {url}"
 
 def preprocess_text(text):
+    if text is None:
+        return ""  # Return an empty string if input is None
+
     # Remove HTML tags
-    text = BeautifulSoup(text, "html.parser").get_text()
+    text = BeautifulSoup(str(text), "html.parser").get_text()
 
     # Remove URLs
     text = re.sub(r'http\S+|www.\S+', '', text)
@@ -489,8 +504,12 @@ def scrape_and_display(query, num_results, instructions, web_search=True, use_ne
     # Summarize each result
     summarized_results = []
     for result in search_results:
+        try:
+            summary = summarize_webpage(result['link'], result.get('text'), query, instructions)
+            summarized_results.append({"link": result['link'], "text": summary})
+        except Exception as e:
+            print(f"Error summarizing {result['link']}: {e}")
+            summarized_results.append({"link": result['link'], "text": f"Error summarizing content: {str(e)}"})
 
     formatted_prompt = format_prompt(query, summarized_results, instructions)
     generated_summary = generate_text(formatted_prompt, temperature=temperature, repetition_penalty=repetition_penalty, top_p=top_p)
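
Note: the new summarize_webpage body calls several helpers (remove_boilerplate, keyword_filter, summarize_text) that are defined elsewhere in app.py and not shown in this commit. A minimal sketch of what the two filtering/summarizing helpers might look like, assuming a simple frequency-based extractive approach (hypothetical implementations, not the app's actual code):

import re

def keyword_filter(text, keywords):
    # Keep only the sentences that mention at least one query keyword.
    sentences = re.split(r'(?<=[.!?])\s+', text)
    lowered = [kw.lower() for kw in keywords]
    kept = [s for s in sentences if any(kw in s.lower() for kw in lowered)]
    return " ".join(kept)

def summarize_text(text, num_sentences=5):
    # Naive extractive summary: score each sentence by the total
    # frequency of its words across the text, then return the top
    # sentences in their original order.
    sentences = re.split(r'(?<=[.!?])\s+', text)
    words = re.findall(r'\w+', text.lower())
    freq = {}
    for word in words:
        freq[word] = freq.get(word, 0) + 1
    def score(sentence):
        return sum(freq.get(w, 0) for w in re.findall(r'\w+', sentence.lower()))
    top = set(sorted(sentences, key=score, reverse=True)[:num_sentences])
    return " ".join(s for s in sentences if s in top)

The real helpers could equally use a library such as nltk or sumy; the point is only that keyword_filter narrows the page to query-relevant sentences before summarize_text trims the result to num_sentences, which keeps the webpage_prompt above within the model's context budget.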