Update app.py
app.py
CHANGED
@@ -154,6 +154,75 @@ def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_
     print(f"Total results fetched: {len(all_results)}")
     return all_results
 
+def google_news_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_verify=None):
+    print(f"Searching Google News for term: {term}")
+    escaped_term = urllib.parse.quote_plus(term)
+    start = 0
+    all_results = []
+
+    with requests.Session() as session:
+        while len(all_results) < num_results:
+            try:
+                user_agent = random.choice(_useragent_list)
+                headers = {
+                    'User-Agent': user_agent
+                }
+                print(f"Using User-Agent: {headers['User-Agent']}")
+
+                resp = session.get(
+                    url="https://news.google.com/search",
+                    headers=headers,
+                    params={
+                        "q": term,
+                        "hl": lang,
+                        "gl": "US",  # You can change this to target a specific country
+                        "ceid": "US:en"  # Change this according to your language and country
+                    },
+                    timeout=timeout,
+                    verify=ssl_verify,
+                )
+                resp.raise_for_status()
+            except requests.exceptions.RequestException as e:
+                print(f"Error fetching search results: {e}")
+                break
+
+            soup = BeautifulSoup(resp.text, "html.parser")
+            articles = soup.find_all("article")
+
+            for article in articles:
+                if len(all_results) >= num_results:
+                    break
+
+                link_element = article.find("a", class_="VDXfz")
+                if link_element:
+                    # Google News uses relative URLs, so we need to construct the full URL
+                    relative_link = link_element['href']
+                    full_link = f"https://news.google.com{relative_link[1:]}"  # Remove the leading '.'
+
+                    title = link_element.text
+
+                    try:
+                        # Fetch the actual article
+                        article_page = session.get(full_link, headers=headers, timeout=timeout)
+                        article_page.raise_for_status()
+                        article_content = extract_text_from_webpage(article_page.text)
+
+                        all_results.append({"link": full_link, "title": title, "text": article_content})
+                    except requests.exceptions.RequestException as e:
+                        print(f"Error fetching or processing {full_link}: {e}")
+                        all_results.append({"link": full_link, "title": title, "text": None})
+                else:
+                    print("No link found in article.")
+
+            if len(articles) == 0:
+                print("No more results found.")
+                break
+
+            start += len(articles)
+
+    print(f"Total news results fetched: {len(all_results)}")
+    return all_results
+
 def summarize_webpage(url, content, query, instructions, max_chars=1000):
     # Preprocess the content
     preprocessed_text = preprocess_web_content(content, query.split())
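
A minimal smoke test for the new helper, as a sketch: it assumes app.py is importable as a module named app, that its dependencies (requests, bs4) are installed, and that news.google.com is reachable; the query string is a placeholder.

# Hypothetical usage sketch, not part of the commit.
from app import google_news_search

results = google_news_search("NVIDIA earnings", num_results=3)
for r in results:
    # Each result is a dict with "link", "title", and "text";
    # "text" is None when fetching the article body failed.
    print(r["title"], "->", r["link"])
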
@@ -409,28 +478,38 @@ def save_text_to_pdf(text, output_path):
     print("PDF saved successfully.")
 
 # Integrated function to perform web scraping, formatting, and text generation
-def scrape_and_display(query, num_results, instructions, web_search=True, temperature=0.7, repetition_penalty=1.0, top_p=0.9):
+def scrape_and_display(query, num_results, instructions, web_search=True, use_news=False, temperature=0.7, repetition_penalty=1.0, top_p=0.9):
     print(f"Scraping and displaying results for query: {query} with num_results: {num_results}")
     if web_search:
-
-
+        if use_news:
+            search_results = google_news_search(query, num_results)
+        else:
+            search_results = google_search(query, num_results, instructions=instructions)
+
+        # Summarize each result
+        summarized_results = []
+        for result in search_results:
+            summary = summarize_webpage(result['link'], result['text'], query, instructions)
+            summarized_results.append({"link": result['link'], "text": summary})
+
+        formatted_prompt = format_prompt(query, summarized_results, instructions)
         generated_summary = generate_text(formatted_prompt, temperature=temperature, repetition_penalty=repetition_penalty, top_p=top_p)
     else:
         formatted_prompt = format_prompt_with_instructions(query, instructions)
         generated_summary = generate_text(formatted_prompt, temperature=temperature, repetition_penalty=repetition_penalty, top_p=top_p)
+
     print("Scraping and display complete.")
     if generated_summary:
-        # Extract and return text starting from "Assistant:"
         assistant_index = generated_summary.find("Assistant:")
         if assistant_index != -1:
             generated_summary = generated_summary[assistant_index:]
         else:
             generated_summary = "Assistant: No response generated."
-        print(f"Generated summary: {generated_summary}")
+    print(f"Generated summary: {generated_summary}")
     return generated_summary
 
 # Main Gradio interface function
-def gradio_interface(query, use_dashboard, use_pdf, pdf, num_results, custom_instructions, temperature, repetition_penalty, top_p, clear_cache_flag):
+def gradio_interface(query, use_dashboard, use_news, use_pdf, pdf, num_results, custom_instructions, temperature, repetition_penalty, top_p, clear_cache_flag):
     if clear_cache_flag:
         return clear_cache()
 
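
One behavior worth knowing when reading the summarize loop above: google_news_search appends entries with "text": None when an article fetch fails, and those entries flow straight into summarize_webpage. A self-contained sketch of a caller-side guard for that case; the stub and sample data are illustrative, and the skip logic is a suggestion rather than part of the commit.

# Illustrative guard only; stub_summarize stands in for app.py's summarize_webpage.
def stub_summarize(url, content, query, instructions):
    return content[:80]

search_results = [
    {"link": "https://news.example.com/a", "text": "Article body text..."},
    {"link": "https://news.example.com/b", "text": None},  # failed fetch
]

summarized_results = []
for result in search_results:
    if not result.get("text"):
        continue  # skip articles whose body could not be fetched
    summary = stub_summarize(result["link"], result["text"], "query", "instructions")
    summarized_results.append({"link": result["link"], "text": summary})

print(summarized_results)  # only the first entry survives
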
@@ -439,26 +518,27 @@ def gradio_interface(query, use_dashboard, use_pdf, pdf, num_results, custom_ins
         for query_type, query_info in PREDEFINED_QUERIES.items():
             formatted_query = query_info['query'].format(company=query)
             formatted_instructions = query_info['instructions'].format(company=query)
-            result = scrape_and_display(formatted_query, num_results=num_results, instructions=formatted_instructions, web_search=True, temperature=temperature, repetition_penalty=repetition_penalty, top_p=top_p)
+            result = scrape_and_display(formatted_query, num_results=num_results, instructions=formatted_instructions, web_search=True, use_news=(query_type == "Recent News"), temperature=temperature, repetition_penalty=repetition_penalty, top_p=top_p)
             results.append(f"**{query_type}**\n\n{result}\n\n")
         generated_summary = "\n".join(results)
     elif use_pdf and pdf is not None:
         pdf_text = read_pdf(pdf)
         generated_summary = scrape_and_display(pdf_text, num_results=0, instructions=custom_instructions, web_search=False, temperature=temperature, repetition_penalty=repetition_penalty, top_p=top_p)
     else:
-        generated_summary = scrape_and_display(query, num_results=num_results, instructions=custom_instructions, web_search=True, temperature=temperature, repetition_penalty=repetition_penalty, top_p=top_p)
+        generated_summary = scrape_and_display(query, num_results=num_results, instructions=custom_instructions, web_search=True, use_news=use_news, temperature=temperature, repetition_penalty=repetition_penalty, top_p=top_p)
 
     output_pdf_path = "output_summary.pdf"
     save_text_to_pdf(generated_summary, output_pdf_path)
 
     return generated_summary, output_pdf_path
 
-#
+# Update the Gradio Interface
 gr.Interface(
     fn=gradio_interface,
     inputs=[
         gr.Textbox(label="Company Name or Query"),
         gr.Checkbox(label="Use Dashboard"),
+        gr.Checkbox(label="Use News Search"),  # New checkbox for news search
         gr.Checkbox(label="Use PDF"),
         gr.File(label="Upload PDF"),
         gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Number of Results"),
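
The use_news=(query_type == "Recent News") gate above implies that PREDEFINED_QUERIES carries a "Recent News" entry. A hypothetical sketch of the shape that loop expects; the actual dict is defined elsewhere in app.py and may differ, so everything here except the "Recent News" key is a placeholder.

# Hypothetical shape only, inferred from the dashboard loop.
PREDEFINED_QUERIES = {
    "Recent News": {
        "query": "{company} recent news",
        "instructions": "Summarize the latest news about {company}.",
    },
    "Financial Overview": {
        "query": "{company} financial results",
        "instructions": "Summarize {company}'s recent financial performance.",
    },
}

for query_type, query_info in PREDEFINED_QUERIES.items():
    print(query_type, "-> news search:", query_type == "Recent News")
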
@@ -470,6 +550,6 @@ gr.Interface(
     ],
     outputs=["text", gr.File(label="Generated PDF")],
     title="Financial Analyst AI Assistant",
-    description="Enter a company name to get a financial dashboard, or enter a custom query. Optionally, upload a PDF for analysis. Adjust parameters as needed for optimal results.",
+    description="Enter a company name to get a financial dashboard, or enter a custom query. Use the news search option for recent articles. Optionally, upload a PDF for analysis. Adjust parameters as needed for optimal results.",
     allow_flagging="never"
 ).launch(share=True)
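
Since gr.Interface maps the inputs list to the handler's parameters positionally, the new "Use News Search" checkbox has to occupy the same slot as use_news does in gradio_interface (third input, third parameter, as in the hunks above). A trimmed, runnable sketch of that correspondence; the handler and labels are illustrative, not the full app.py wiring.

# Positional correspondence between gr.Interface inputs and the handler signature.
import gradio as gr  # assumes gradio is installed

def handler(query, use_dashboard, use_news, use_pdf):
    return f"news={use_news}"

demo = gr.Interface(
    fn=handler,
    inputs=[
        gr.Textbox(label="Company Name or Query"),
        gr.Checkbox(label="Use Dashboard"),
        gr.Checkbox(label="Use News Search"),
        gr.Checkbox(label="Use PDF"),
    ],
    outputs="text",
)
# demo.launch()  # uncomment to run locally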