Shreyas094 committed on
Commit
ab3adb5
·
verified ·
1 Parent(s): 328806f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +90 -10
app.py CHANGED
@@ -154,6 +154,75 @@ def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_
154
  print(f"Total results fetched: {len(all_results)}")
155
  return all_results
156
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  def summarize_webpage(url, content, query, instructions, max_chars=1000):
158
  # Preprocess the content
159
  preprocessed_text = preprocess_web_content(content, query.split())
@@ -409,28 +478,38 @@ def save_text_to_pdf(text, output_path):
409
  print("PDF saved successfully.")
410
 
411
  # Integrated function to perform web scraping, formatting, and text generation
412
- def scrape_and_display(query, num_results, instructions, web_search=True, temperature=0.7, repetition_penalty=1.0, top_p=0.9):
413
  print(f"Scraping and displaying results for query: {query} with num_results: {num_results}")
414
  if web_search:
415
- search_results = google_search(query, num_results, instructions=instructions)
416
- formatted_prompt = format_prompt(query, search_results, instructions)
 
 
 
 
 
 
 
 
 
 
417
  generated_summary = generate_text(formatted_prompt, temperature=temperature, repetition_penalty=repetition_penalty, top_p=top_p)
418
  else:
419
  formatted_prompt = format_prompt_with_instructions(query, instructions)
420
  generated_summary = generate_text(formatted_prompt, temperature=temperature, repetition_penalty=repetition_penalty, top_p=top_p)
 
421
  print("Scraping and display complete.")
422
  if generated_summary:
423
- # Extract and return text starting from "Assistant:"
424
  assistant_index = generated_summary.find("Assistant:")
425
  if assistant_index != -1:
426
  generated_summary = generated_summary[assistant_index:]
427
  else:
428
  generated_summary = "Assistant: No response generated."
429
- print(f"Generated summary: {generated_summary}") # Debugging line
430
  return generated_summary
431
 
432
  # Main Gradio interface function
433
- def gradio_interface(query, use_dashboard, use_pdf, pdf, num_results, custom_instructions, temperature, repetition_penalty, top_p, clear_cache_flag):
434
  if clear_cache_flag:
435
  return clear_cache()
436
 
@@ -439,26 +518,27 @@ def gradio_interface(query, use_dashboard, use_pdf, pdf, num_results, custom_ins
439
  for query_type, query_info in PREDEFINED_QUERIES.items():
440
  formatted_query = query_info['query'].format(company=query)
441
  formatted_instructions = query_info['instructions'].format(company=query)
442
- result = scrape_and_display(formatted_query, num_results=num_results, instructions=formatted_instructions, web_search=True, temperature=temperature, repetition_penalty=repetition_penalty, top_p=top_p)
443
  results.append(f"**{query_type}**\n\n{result}\n\n")
444
  generated_summary = "\n".join(results)
445
  elif use_pdf and pdf is not None:
446
  pdf_text = read_pdf(pdf)
447
  generated_summary = scrape_and_display(pdf_text, num_results=0, instructions=custom_instructions, web_search=False, temperature=temperature, repetition_penalty=repetition_penalty, top_p=top_p)
448
  else:
449
- generated_summary = scrape_and_display(query, num_results=num_results, instructions=custom_instructions, web_search=True, temperature=temperature, repetition_penalty=repetition_penalty, top_p=top_p)
450
 
451
  output_pdf_path = "output_summary.pdf"
452
  save_text_to_pdf(generated_summary, output_pdf_path)
453
 
454
  return generated_summary, output_pdf_path
455
 
456
- # Deploy Gradio Interface
457
  gr.Interface(
458
  fn=gradio_interface,
459
  inputs=[
460
  gr.Textbox(label="Company Name or Query"),
461
  gr.Checkbox(label="Use Dashboard"),
 
462
  gr.Checkbox(label="Use PDF"),
463
  gr.File(label="Upload PDF"),
464
  gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Number of Results"),
@@ -470,6 +550,6 @@ gr.Interface(
470
  ],
471
  outputs=["text", gr.File(label="Generated PDF")],
472
  title="Financial Analyst AI Assistant",
473
- description="Enter a company name to get a financial dashboard, or enter a custom query. Optionally, upload a PDF for analysis. Adjust parameters as needed for optimal results.",
474
  allow_flagging="never"
475
  ).launch(share=True)
 
154
  print(f"Total results fetched: {len(all_results)}")
155
  return all_results
156
 
157
+ def google_news_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_verify=None):
158
+ print(f"Searching Google News for term: {term}")
159
+ escaped_term = urllib.parse.quote_plus(term)
160
+ start = 0
161
+ all_results = []
162
+
163
+ with requests.Session() as session:
164
+ while len(all_results) < num_results:
165
+ try:
166
+ user_agent = random.choice(_useragent_list)
167
+ headers = {
168
+ 'User-Agent': user_agent
169
+ }
170
+ print(f"Using User-Agent: {headers['User-Agent']}")
171
+
172
+ resp = session.get(
173
+ url="https://news.google.com/search",
174
+ headers=headers,
175
+ params={
176
+ "q": term,
177
+ "hl": lang,
178
+ "gl": "US", # You can change this to target a specific country
179
+ "ceid": "US:en" # Change this according to your language and country
180
+ },
181
+ timeout=timeout,
182
+ verify=ssl_verify,
183
+ )
184
+ resp.raise_for_status()
185
+ except requests.exceptions.RequestException as e:
186
+ print(f"Error fetching search results: {e}")
187
+ break
188
+
189
+ soup = BeautifulSoup(resp.text, "html.parser")
190
+ articles = soup.find_all("article")
191
+
192
+ for article in articles:
193
+ if len(all_results) >= num_results:
194
+ break
195
+
196
+ link_element = article.find("a", class_="VDXfz")
197
+ if link_element:
198
+ # Google News uses relative URLs, so we need to construct the full URL
199
+ relative_link = link_element['href']
200
+ full_link = f"https://news.google.com{relative_link[1:]}" # Remove the leading '.'
201
+
202
+ title = link_element.text
203
+
204
+ try:
205
+ # Fetch the actual article
206
+ article_page = session.get(full_link, headers=headers, timeout=timeout)
207
+ article_page.raise_for_status()
208
+ article_content = extract_text_from_webpage(article_page.text)
209
+
210
+ all_results.append({"link": full_link, "title": title, "text": article_content})
211
+ except requests.exceptions.RequestException as e:
212
+ print(f"Error fetching or processing {full_link}: {e}")
213
+ all_results.append({"link": full_link, "title": title, "text": None})
214
+ else:
215
+ print("No link found in article.")
216
+
217
+ if len(articles) == 0:
218
+ print("No more results found.")
219
+ break
220
+
221
+ start += len(articles)
222
+
223
+ print(f"Total news results fetched: {len(all_results)}")
224
+ return all_results
225
+
226
  def summarize_webpage(url, content, query, instructions, max_chars=1000):
227
  # Preprocess the content
228
  preprocessed_text = preprocess_web_content(content, query.split())
 
478
  print("PDF saved successfully.")
479
 
480
  # Integrated function to perform web scraping, formatting, and text generation
481
+ def scrape_and_display(query, num_results, instructions, web_search=True, use_news=False, temperature=0.7, repetition_penalty=1.0, top_p=0.9):
482
  print(f"Scraping and displaying results for query: {query} with num_results: {num_results}")
483
  if web_search:
484
+ if use_news:
485
+ search_results = google_news_search(query, num_results)
486
+ else:
487
+ search_results = google_search(query, num_results, instructions=instructions)
488
+
489
+ # Summarize each result
490
+ summarized_results = []
491
+ for result in search_results:
492
+ summary = summarize_webpage(result['link'], result['text'], query, instructions)
493
+ summarized_results.append({"link": result['link'], "text": summary})
494
+
495
+ formatted_prompt = format_prompt(query, summarized_results, instructions)
496
  generated_summary = generate_text(formatted_prompt, temperature=temperature, repetition_penalty=repetition_penalty, top_p=top_p)
497
  else:
498
  formatted_prompt = format_prompt_with_instructions(query, instructions)
499
  generated_summary = generate_text(formatted_prompt, temperature=temperature, repetition_penalty=repetition_penalty, top_p=top_p)
500
+
501
  print("Scraping and display complete.")
502
  if generated_summary:
 
503
  assistant_index = generated_summary.find("Assistant:")
504
  if assistant_index != -1:
505
  generated_summary = generated_summary[assistant_index:]
506
  else:
507
  generated_summary = "Assistant: No response generated."
508
+ print(f"Generated summary: {generated_summary}")
509
  return generated_summary
510
 
511
  # Main Gradio interface function
512
+ def gradio_interface(query, use_dashboard, use_news, use_pdf, pdf, num_results, custom_instructions, temperature, repetition_penalty, top_p, clear_cache_flag):
513
  if clear_cache_flag:
514
  return clear_cache()
515
 
 
518
  for query_type, query_info in PREDEFINED_QUERIES.items():
519
  formatted_query = query_info['query'].format(company=query)
520
  formatted_instructions = query_info['instructions'].format(company=query)
521
+ result = scrape_and_display(formatted_query, num_results=num_results, instructions=formatted_instructions, web_search=True, use_news=(query_type == "Recent News"), temperature=temperature, repetition_penalty=repetition_penalty, top_p=top_p)
522
  results.append(f"**{query_type}**\n\n{result}\n\n")
523
  generated_summary = "\n".join(results)
524
  elif use_pdf and pdf is not None:
525
  pdf_text = read_pdf(pdf)
526
  generated_summary = scrape_and_display(pdf_text, num_results=0, instructions=custom_instructions, web_search=False, temperature=temperature, repetition_penalty=repetition_penalty, top_p=top_p)
527
  else:
528
+ generated_summary = scrape_and_display(query, num_results=num_results, instructions=custom_instructions, web_search=True, use_news=use_news, temperature=temperature, repetition_penalty=repetition_penalty, top_p=top_p)
529
 
530
  output_pdf_path = "output_summary.pdf"
531
  save_text_to_pdf(generated_summary, output_pdf_path)
532
 
533
  return generated_summary, output_pdf_path
534
 
535
+ # Update the Gradio Interface
536
  gr.Interface(
537
  fn=gradio_interface,
538
  inputs=[
539
  gr.Textbox(label="Company Name or Query"),
540
  gr.Checkbox(label="Use Dashboard"),
541
+ gr.Checkbox(label="Use News Search"), # New checkbox for news search
542
  gr.Checkbox(label="Use PDF"),
543
  gr.File(label="Upload PDF"),
544
  gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Number of Results"),
 
550
  ],
551
  outputs=["text", gr.File(label="Generated PDF")],
552
  title="Financial Analyst AI Assistant",
553
+ description="Enter a company name to get a financial dashboard, or enter a custom query. Use the news search option for recent articles. Optionally, upload a PDF for analysis. Adjust parameters as needed for optimal results.",
554
  allow_flagging="never"
555
  ).launch(share=True)