Shreyas094 committed
Commit 10e4113 · verified · 1 Parent(s): 76dfeb6

Update app.py

Files changed (1): app.py (+66 -499)
app.py CHANGED
@@ -1,67 +1,13 @@
- import fitz  # PyMuPDF
- import gradio as gr
  import requests
  from bs4 import BeautifulSoup
- import urllib.parse
  import random
- import os
- from dotenv import load_dotenv
- import shutil
- import tempfile
- import re
- import unicodedata
- from nltk.corpus import stopwords
- from nltk.tokenize import sent_tokenize, word_tokenize
- from nltk.probability import FreqDist
- import nltk
  from datetime import datetime, timedelta

- # Download necessary NLTK data
- nltk.download('punkt')
- nltk.download('stopwords')
-
- load_dotenv()  # Load environment variables from .env file
-
- # Now replace the hard-coded token with the environment variable
- HUGGINGFACE_API_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
-
- def clear_cache():
-     try:
-         # Clear Gradio cache
-         cache_dir = tempfile.gettempdir()
-         shutil.rmtree(os.path.join(cache_dir, "gradio"), ignore_errors=True)
-
-         # Clear any custom cache you might have
-         # For example, if you're caching PDF files or search results:
-         if os.path.exists("output_summary.pdf"):
-             os.remove("output_summary.pdf")
-
-         # Add any other cache clearing operations here
-
-         print("Cache cleared successfully.")
-         return "Cache cleared successfully."
-     except Exception as e:
-         print(f"Error clearing cache: {e}")
-         return f"Error clearing cache: {e}"
-
- PREDEFINED_QUERIES = {
-     "Recent Earnings": {
-         "query": "{company} recent quarterly earnings",
-         "instructions": "Provide the most recent quarterly earnings data for {company}. Include revenue, net income, loan growth, deposit growth if any, EPS and asset quality. Specify the exact quarter and year."
-     },
-     "Recent News": {
-         "query": "{company} recent news",
-         "instructions": "Summarize the most recent significant news about {company}. Focus on events that could impact the company's financial performance or stock price."
-     },
-     "Credit Rating": {
-         "query": "{company} current credit rating",
-         "instructions": "Provide the most recent credit rating for {company}. Include the rating agency, the exact rating, and the date it was issued or last confirmed."
-     },
-     "Earnings Call Transcript": {
-         "query": "{company} most recent earnings call transcript",
-         "instructions": "Summarize key points from {company}'s most recent earnings call. Include date of the call, major financial highlights, and any significant forward-looking statements."
-     }
- }
  _useragent_list = [
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
@@ -71,21 +17,16 @@ _useragent_list = [
      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
  ]

- # Function to extract visible text from HTML content of a webpage
- def extract_text_from_webpage(html):
-     print("Extracting text from webpage...")
-     soup = BeautifulSoup(html, 'html.parser')
-     for script in soup(["script", "style"]):
-         script.extract()  # Remove scripts and styles
-     text = soup.get_text()
-     lines = (line.strip() for line in text.splitlines())
-     chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
-     text = '\n'.join(chunk for chunk in chunks if chunk)
-     print(f"Extracted text length: {len(text)}")
-     return text

- # Function to perform a Google search and retrieve results
- def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_verify=None, instructions="", days_back=365):
      print(f"Searching for term: {term}")

      # Calculate the date range
@@ -97,7 +38,7 @@ def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_
      end_date_str = end_date.strftime("%Y-%m-%d")

      # Add the date range to the search term
-     search_term = f"{term} after:{start_date_str} before:{end_date_str}"

      escaped_term = urllib.parse.quote_plus(search_term)
      start = 0
@@ -105,14 +46,10 @@ def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_

      with requests.Session() as session:
          while len(all_results) < num_results:
-             print(f"Fetching search results starting from: {start}")
              try:
                  # Choose a random user agent
                  user_agent = random.choice(_useragent_list)
-                 headers = {
-                     'User-Agent': user_agent
-                 }
-                 print(f"Using User-Agent: {headers['User-Agent']}")

                  resp = session.get(
                      url="https://www.google.com/search",
@@ -137,7 +74,6 @@ def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_
              if not result_block:
                  print("No more results found.")
                  break
-             keywords = term.split()  # Use the search term as keywords for filtering

              for result in result_block:
                  if len(all_results) >= num_results:
@@ -151,10 +87,7 @@ def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_
                          webpage.raise_for_status()
                          visible_text = extract_text_from_webpage(webpage.text)

-                         # Summarize the webpage content
-                         summary = summarize_webpage(link, visible_text, term, instructions)
-
-                         all_results.append({"link": link, "text": summary})
                      except requests.exceptions.RequestException as e:
                          print(f"Error fetching or processing {link}: {e}")
                          all_results.append({"link": link, "text": None})
@@ -167,440 +100,74 @@ def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_
      print(f"Total results fetched: {len(all_results)}")
      return all_results

- def google_news_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_verify=None, days_back=30):
-     print(f"Searching Google News for term: {term}")
-
-     # Calculate the date range
-     end_date = datetime.now()
-     start_date = end_date - timedelta(days=days_back)
-
-     # Format dates as strings
-     start_date_str = start_date.strftime("%Y-%m-%d")
-     end_date_str = end_date.strftime("%Y-%m-%d")
-
-     # Add the date range to the search term
-     search_term = f"{term} after:{start_date_str} before:{end_date_str}"
-
-     escaped_term = urllib.parse.quote_plus(search_term)
-     start = 0
-     all_results = []
-
-     with requests.Session() as session:
-         while len(all_results) < num_results:
-             try:
-                 user_agent = random.choice(_useragent_list)
-                 headers = {
-                     'User-Agent': user_agent
-                 }
-                 print(f"Using User-Agent: {headers['User-Agent']}")
-
-                 resp = session.get(
-                     url="https://news.google.com/search",
-                     headers=headers,
-                     params={
-                         "q": search_term,
-                         "hl": lang,
-                         "gl": "US",
-                         "ceid": "US:en"
-                     },
-                     timeout=timeout,
-                     verify=ssl_verify,
-                 )
-                 resp.raise_for_status()
-             except requests.exceptions.RequestException as e:
-                 print(f"Error fetching search results: {e}")
-                 break
-
-             soup = BeautifulSoup(resp.text, "html.parser")
-             articles = soup.find_all("article")
-
-             for article in articles:
-                 if len(all_results) >= num_results:
-                     break
-                 link_element = article.find("a", attrs={"class": "WwrzSb"}) or article.find("a", href=True)
-                 # link_element = article.find("a", class_="WwrzSb")
-                 if link_element:
-                     # Google News uses relative URLs, so we need to construct the full URL
-                     relative_link = link_element['href']
-                     full_link = f"https://news.google.com{relative_link[1:]}"  # Remove the leading '.'
-
-                     title = link_element.text
-
-                     try:
-                         # Fetch the actual article
-                         article_page = session.get(full_link, headers=headers, timeout=timeout)
-                         article_page.raise_for_status()
-                         article_content = extract_text_from_webpage(article_page.text)
-
-                         all_results.append({"link": full_link, "title": title, "text": article_content})
-                     except requests.exceptions.RequestException as e:
-                         print(f"Error fetching or processing {full_link}: {e}")
-                         all_results.append({"link": full_link, "title": title, "text": None})
-                 else:
-                     print("No link found in article.")
-
-             if len(articles) == 0:
-                 print("No more results found.")
-                 break
-
-             start += len(articles)
-
-     print(f"Total news results fetched: {len(all_results)}")
-     return all_results
-
- def summarize_webpage(url, content, query, instructions, max_chars=1000):
-     if content is None:
-         return f"Unable to fetch or process content from {url}"
-
-     # Extract keywords from the query
-     keywords = query.split()
-
-     # Apply full preprocessing pipeline
-     preprocessed_text = preprocess_text(content)
-     preprocessed_text = remove_boilerplate(preprocessed_text)
-     filtered_text = keyword_filter(preprocessed_text, keywords)
-     summarized_text = summarize_text(filtered_text, num_sentences=5)  # Adjust num_sentences as needed
-
-     if not summarized_text:
-         return f"No relevant content found for the query in {url}"
-
-     # Format a prompt for this specific webpage
-     webpage_prompt = f"""
-     Instructions: {instructions}
-     Query: {query}
-     URL: {url}
-
-     Filtered and summarized webpage content:
-     {summarized_text}
-
-     Based on the above filtered and summarized content, provide a concise summary that's directly relevant to the query. Focus on specific data, facts, or insights mentioned. Keep the summary under 200 words.
-
-     Summary:
-     """
-
-     # Generate summary using the AI model
-     summary = generate_text(webpage_prompt, temperature=0.3, repetition_penalty=1.2, top_p=0.9)
-
-     # Truncate if necessary
-     if summary and len(summary) > max_chars:
-         summary = summary[:max_chars] + "..."
-
-     return summary if summary else f"Unable to generate summary for {url}"
-
- def preprocess_text(text):
-     if text is None:
-         return ""  # Return an empty string if input is None
-
-     # Remove HTML tags
-     text = BeautifulSoup(str(text), "html.parser").get_text()
-
-     # Remove URLs
-     text = re.sub(r'http\S+|www.\S+', '', text)
-
-     # Remove special characters and digits
-     text = re.sub(r'[^a-zA-Z\s]', '', text)
-
-     # Remove extra whitespace
-     text = ' '.join(text.split())
-
-     # Convert to lowercase
-     text = text.lower()
-
-     return text
-
- def remove_boilerplate(text):
-     # List of common boilerplate phrases to remove
-     boilerplate = [
-         "all rights reserved",
-         "terms of service",
-         "privacy policy",
-         "cookie policy",
-         "copyright ©",
-         "follow us on social media"
-     ]
-
-     for phrase in boilerplate:
-         text = text.replace(phrase, '')

      return text

- def keyword_filter(text, keywords):
-     sentences = sent_tokenize(text)
-     filtered_sentences = [sentence for sentence in sentences if any(keyword.lower() in sentence.lower() for keyword in keywords)]
-     return ' '.join(filtered_sentences)
-
- def summarize_text(text, num_sentences=3):
-     # Tokenize the text into words
-     words = word_tokenize(text)
-
-     # Remove stopwords
-     stop_words = set(stopwords.words('english'))
-     words = [word for word in words if word.lower() not in stop_words]
-
-     # Calculate word frequencies
-     freq_dist = FreqDist(words)
-
-     # Score sentences based on word frequencies
-     sentences = sent_tokenize(text)
-     sentence_scores = {}
-     for sentence in sentences:
-         for word in word_tokenize(sentence.lower()):
-             if word in freq_dist:
-                 if sentence not in sentence_scores:
-                     sentence_scores[sentence] = freq_dist[word]
-                 else:
-                     sentence_scores[sentence] += freq_dist[word]
-
-     # Get the top N sentences with highest scores
-     summary_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:num_sentences]
-
-     # Sort the selected sentences in the order they appear in the original text
-     summary_sentences = sorted(summary_sentences, key=text.index)
-
-     return ' '.join(summary_sentences)
-
- def preprocess_web_content(content, keywords):
-     # Apply basic preprocessing
-     preprocessed_text = preprocess_text(content)
-
-     # Remove boilerplate
-     preprocessed_text = remove_boilerplate(preprocessed_text)
-
-     # Apply keyword filtering
-     filtered_text = keyword_filter(preprocessed_text, keywords)
-
-     # Summarize the text
-     summarized_text = summarize_text(filtered_text)
-
-     return summarized_text
-
-
- # Function to format the prompt for the Hugging Face API
- def format_prompt(query, search_results, instructions):
-     formatted_results = ""
-     for result in search_results:
-         link = result["link"]
-         summary = result["text"]
-         if link and summary:
-             formatted_results += f"URL: {link}\nSummary: {summary}\n{'-' * 80}\n"
-         else:
-             formatted_results += "No relevant information found.\n" + '-' * 80 + '\n'
-
-     prompt = f"""Instructions: {instructions}
- User Query: {query}
-
- Summarized Web Search Results:
- {formatted_results}
-
- Based on the above summarized information from multiple sources, provide a comprehensive and factual response to the user's query. Include specific dates, numbers, and sources where available. If information is conflicting or unclear, mention this in your response. Do not make assumptions or provide information that is not supported by the summaries.
-
- Assistant:"""
-     return prompt
-
- # Function to generate text using Hugging Face API
- def generate_text(input_text, temperature=0.3, repetition_penalty=1.2, top_p=0.9):
-     print("Generating text using Hugging Face API...")
-     endpoint = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
-     headers = {
-         "Authorization": f"Bearer {HUGGINGFACE_API_TOKEN}",
-         "Content-Type": "application/json"
-     }
-     data = {
-         "inputs": input_text,
-         "parameters": {
-             "max_new_tokens": 1000,  # Reduced to focus on more concise answers
-             "temperature": temperature,
-             "repetition_penalty": repetition_penalty,
-             "top_p": top_p,
-             "do_sample": True
-         }
-     }
-
-     try:
-         response = requests.post(endpoint, headers=headers, json=data)
-         response.raise_for_status()
-
-         # Check if response is JSON
-         try:
-             json_data = response.json()
-         except ValueError:
-             print("Response is not JSON.")
-             return None
-
-         # Extract generated text from response JSON
-         if isinstance(json_data, list):
-             # Handle list response (if applicable for your use case)
-             generated_text = json_data[0].get("generated_text") if json_data else None
-         elif isinstance(json_data, dict):
-             # Handle dictionary response
-             generated_text = json_data.get("generated_text")
-         else:
-             print("Unexpected response format.")
-             return None
-
-         if generated_text is not None:
-             print("Text generation complete using Hugging Face API.")
-             print(f"Generated text: {generated_text}")  # Debugging line
-             return generated_text
-         else:
-             print("Generated text not found in response.")
-             return None
-
-     except requests.exceptions.RequestException as e:
-         print(f"Error generating text using Hugging Face API: {e}")
-         return None

- # Function to read and extract text from a PDF
- def read_pdf(file_obj):
-     with fitz.open(file_obj.name) as document:
-         text = ""
-         for page_num in range(document.page_count):
-             page = document.load_page(page_num)
-             text += page.get_text()
-     return text

- # Function to format the prompt with instructions for text generation
- def format_prompt_with_instructions(text, instructions):
-     prompt = f"{instructions}{text}\n\nAssistant:"
-     return prompt

- # Function to save text to a PDF
- def save_text_to_pdf(text, output_path):
-     print(f"Saving text to PDF at {output_path}...")
-     doc = fitz.open()  # Create a new PDF document
-     page = doc.new_page()  # Create a new page
-
-     # Set the page margins
-     margin = 50  # 50 points margin
-     page_width = page.rect.width
-     page_height = page.rect.height
-     text_width = page_width - 2 * margin
-     text_height = page_height - 2 * margin
-
-     # Define font size and line spacing
-     font_size = 9
-     line_spacing = 1 * font_size
-     fontname = "times-roman"  # Use a supported font name
-
-     # Process the text to handle line breaks and paragraphs
-     paragraphs = text.split("\n")  # Split text into paragraphs
-     y_position = margin
-
-     for paragraph in paragraphs:
-         words = paragraph.split()
-         current_line = ""
-
-         for word in words:
-             word = str(word)  # Ensure word is treated as string
-             # Calculate the length of the current line plus the new word
-             current_line_length = fitz.get_text_length(current_line + " " + word, fontsize=font_size, fontname=fontname)
-             if current_line_length <= text_width:
-                 current_line += " " + word
-             else:
-                 page.insert_text(fitz.Point(margin, y_position), current_line.strip(), fontsize=font_size, fontname=fontname)
-                 y_position += line_spacing
-                 if y_position + line_spacing > page_height - margin:
-                     page = doc.new_page()  # Add a new page if text exceeds page height
-                     y_position = margin
-                 current_line = word
-
-         # Add the last line of the paragraph
-         page.insert_text(fitz.Point(margin, y_position), current_line.strip(), fontsize=font_size, fontname=fontname)
-         y_position += line_spacing
-
-         # Add extra space for new paragraph
-         y_position += line_spacing
-         if y_position + line_spacing > page_height - margin:
-             page = doc.new_page()  # Add a new page if text exceeds page height
-             y_position = margin
-
-     doc.save(output_path)  # Save the PDF to the specified path
-     print("PDF saved successfully.")
-
- # Integrated function to perform web scraping, formatting, and text generation
- def scrape_and_display(query, num_results, instructions, web_search=True, use_news=False, days_back=None, temperature=0.7, repetition_penalty=1.0, top_p=0.9):
-     print(f"Scraping and displaying results for query: {query} with num_results: {num_results}")
-     if web_search:
-         if days_back is None:
-             current_year = datetime.now().year
-             days_back = 365 if current_year % 4 != 0 else 366  # Account for leap years
-
-         if use_news:
-             # For news, we might want to use a shorter time frame by default
-             news_days_back = min(days_back, 30)  # Use at most 30 days for news
-             search_results = google_news_search(query, num_results, days_back=news_days_back)
-         else:
-             search_results = google_search(query, num_results=num_results, instructions=instructions, days_back=days_back)
-
-
-         # Summarize each result
-         summarized_results = []
-         for result in search_results:
-             try:
-                 summary = summarize_webpage(result['link'], result.get('text'), query, instructions)
-                 summarized_results.append({"link": result['link'], "text": summary})
-             except Exception as e:
-                 print(f"Error summarizing {result['link']}: {e}")
-                 summarized_results.append({"link": result['link'], "text": f"Error summarizing content: {str(e)}"})
-
-         formatted_prompt = format_prompt(query, summarized_results, instructions)
-         generated_summary = generate_text(formatted_prompt, temperature=temperature, repetition_penalty=repetition_penalty, top_p=top_p)
-     else:
-         formatted_prompt = format_prompt_with_instructions(query, instructions)
-         generated_summary = generate_text(formatted_prompt, temperature=temperature, repetition_penalty=repetition_penalty, top_p=top_p)

-     print("Scraping and display complete.")
-     if generated_summary:
-         assistant_index = generated_summary.find("Assistant:")
-         if assistant_index != -1:
-             generated_summary = generated_summary[assistant_index:]
-     else:
-         generated_summary = "Assistant: No response generated."
-     print(f"Generated summary: {generated_summary}")
-     return generated_summary

- # Main Gradio interface function
- def gradio_interface(query, use_dashboard, use_news, use_pdf, pdf, num_results, custom_instructions, temperature, repetition_penalty, top_p, clear_cache_flag):
-     if clear_cache_flag:
-         return clear_cache()
-
-     if use_dashboard:
-         results = []
-         for query_type, query_info in PREDEFINED_QUERIES.items():
-             formatted_query = query_info['query'].format(company=query)
-             formatted_instructions = query_info['instructions'].format(company=query)
-             result = scrape_and_display(formatted_query, num_results=num_results, instructions=formatted_instructions, web_search=True, use_news=(query_type == "Recent News"), temperature=temperature, repetition_penalty=repetition_penalty, top_p=top_p)
-             results.append(f"**{query_type}**\n\n{result}\n\n")
-         generated_summary = "\n".join(results)
-     elif use_pdf and pdf is not None:
-         pdf_text = read_pdf(pdf)
-         generated_summary = scrape_and_display(pdf_text, num_results=0, instructions=custom_instructions, web_search=False, temperature=temperature, repetition_penalty=repetition_penalty, top_p=top_p)
-     else:
-         generated_summary = scrape_and_display(query, num_results=num_results, instructions=custom_instructions, web_search=True, use_news=use_news, temperature=temperature, repetition_penalty=repetition_penalty, top_p=top_p)
-
-     output_pdf_path = "output_summary.pdf"
-     save_text_to_pdf(generated_summary, output_pdf_path)
-
-     return generated_summary, output_pdf_path

- # Update the Gradio Interface
- gr.Interface(
-     fn=gradio_interface,
-     inputs=[
-         gr.Textbox(label="Company Name or Query"),
-         gr.Checkbox(label="Use Dashboard"),
-         gr.Checkbox(label="Use News Search"),  # New checkbox for news search
-         gr.Checkbox(label="Use PDF"),
-         gr.File(label="Upload PDF"),
-         gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Number of Results"),
-         gr.Textbox(label="Custom Instructions"),
-         gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature"),
-         gr.Slider(minimum=0.1, maximum=2.0, value=1.0, step=0.1, label="Repetition Penalty"),
-         gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.1, label="Top p"),
-         gr.Checkbox(label="Clear Cache", visible=False)
-     ],
-     outputs=["text", gr.File(label="Generated PDF")],
-     title="Financial Analyst AI Assistant",
-     description="Enter a company name to get a financial dashboard, or enter a custom query. Use the news search option for recent articles. Optionally, upload a PDF for analysis. Adjust parameters as needed for optimal results.",
-     allow_flagging="never"
- ).launch(share=True)
 
 
 
import requests
from bs4 import BeautifulSoup
+ import gradio as gr
+ from huggingface_hub import InferenceClient
import random
+ import urllib.parse
from datetime import datetime, timedelta
+ import re

+ # List of user agents to rotate through
_useragent_list = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
]

+ API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
+ headers = {"Authorization": "Bearer YOUR_HUGGING_FACE_API_KEY"}
+
+ def query_llama(payload):
+     """Send a query to the Llama model via the Hugging Face Inference API"""
+     response = requests.post(API_URL, headers=headers, json=payload)
+     return response.json()
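Note that the new imports bring in InferenceClient from huggingface_hub, but query_llama still posts to the raw Inference API with a hard-coded placeholder token. A minimal sketch of the same call routed through InferenceClient, assuming the same model and a token supplied via an HF_TOKEN environment variable (neither of which is part of this commit), would be:

import os
from huggingface_hub import InferenceClient

# Hypothetical helper, not part of the commit: reads the token from the environment
# instead of hard-coding it and lets huggingface_hub handle the HTTP details.
_client = InferenceClient(model="meta-llama/Meta-Llama-3-8B-Instruct", token=os.getenv("HF_TOKEN"))

def query_llama_via_client(prompt: str) -> str:
    # text_generation returns the generated string directly
    return _client.text_generation(prompt, max_new_tokens=500)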
+ def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_verify=None, days_back=90):
+     """Perform a Google search and return results"""
      print(f"Searching for term: {term}")

      # Calculate the date range

      end_date_str = end_date.strftime("%Y-%m-%d")

      # Add the date range to the search term
+     search_term = f"{term} financial earnings report after:{start_date_str} before:{end_date_str}"

      escaped_term = urllib.parse.quote_plus(search_term)
      start = 0

      with requests.Session() as session:
          while len(all_results) < num_results:
              try:
                  # Choose a random user agent
                  user_agent = random.choice(_useragent_list)
+                 headers = {'User-Agent': user_agent}

                  resp = session.get(
                      url="https://www.google.com/search",

              if not result_block:
                  print("No more results found.")
                  break

              for result in result_block:
                  if len(all_results) >= num_results:

                          webpage.raise_for_status()
                          visible_text = extract_text_from_webpage(webpage.text)

+                         all_results.append({"link": link, "text": visible_text})
                      except requests.exceptions.RequestException as e:
                          print(f"Error fetching or processing {link}: {e}")
                          all_results.append({"link": link, "text": None})

      print(f"Total results fetched: {len(all_results)}")
      return all_results

+ def extract_text_from_webpage(html_content):
+     """Extract visible text from HTML content"""
+     soup = BeautifulSoup(html_content, 'html.parser')
+
+     # Remove script and style elements
+     for script in soup(["script", "style"]):
+         script.decompose()
+
+     # Get text
+     text = soup.get_text()
+
+     # Break into lines and remove leading and trailing space on each
+     lines = (line.strip() for line in text.splitlines())
+
+     # Break multi-headlines into a line each
+     chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+
+     # Drop blank lines
+     text = '\n'.join(chunk for chunk in chunks if chunk)

      return text

+ def filter_relevant_content(text):
+     """Filter out irrelevant content"""
+     # List of keywords related to financial reports
+     keywords = ['revenue', 'profit', 'earnings', 'financial', 'quarter', 'fiscal', 'growth', 'income', 'loss', 'dividend']
+
+     # Split the text into sentences
+     sentences = re.split(r'(?<=[.!?])\s+', text)
+
+     # Filter sentences containing at least one keyword
+     relevant_sentences = [sentence for sentence in sentences if any(keyword in sentence.lower() for keyword in keywords)]
+
+     # Join the relevant sentences back into a single string
+     filtered_text = ' '.join(relevant_sentences)
+
+     return filtered_text

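For illustration only (not part of the commit), the keyword filter keeps just the sentences that mention one of the financial terms:

# Hypothetical usage example
sample = "The CEO visited Paris. Quarterly revenue grew 12% year over year."
print(filter_relevant_content(sample))
# prints: Quarterly revenue grew 12% year over year.
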
+ def summarize_financial_news(query):
+     """Search for financial news, extract relevant content, and summarize"""
+     search_results = google_search(query, num_results=3)
+
+     all_filtered_text = ""
+     for result in search_results:
+         if result['text']:
+             filtered_text = filter_relevant_content(result['text'])
+             all_filtered_text += filtered_text + "\n\n"
+
+     if not all_filtered_text:
+         return "No relevant financial information found."
+
+     prompt = f"""You are a financial analyst. Summarize the following text from a financial perspective:
+
+ {all_filtered_text}
+
+ Provide a detailed, coherent summary focusing on financial implications and analysis."""
+
+     summary = query_llama({"inputs": prompt, "parameters": {"max_length": 500}})
+
+     return summary[0]['generated_text']
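One caveat, sketched here rather than asserted as part of the commit: when the hosted model errors out or is still loading, the Inference API returns a JSON object with an "error" field instead of a list, so summary[0] would raise. The removed generate_text handled both shapes; a small guard in the same spirit could be:

# Hypothetical guard, not part of the commit
result = query_llama({"inputs": prompt, "parameters": {"max_length": 500}})
if isinstance(result, dict) and "error" in result:
    return f"Model request failed: {result['error']}"
return result[0]["generated_text"]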
+ # Gradio Interface
+ iface = gr.Interface(
+     fn=summarize_financial_news,
+     inputs=gr.Textbox(lines=2, placeholder="Enter a company name or financial topic..."),
+     outputs="text",
+     title="Financial News Summarizer",
+     description="Enter a company name or financial topic to get a summary of recent financial news."
+ )
+ iface.launch()