Shreyas094 committed on
Commit 302823e · verified · 1 Parent(s): 14fbe41

Update app.py

Files changed (1)
  1. app.py +193 -217
app.py CHANGED
@@ -7,236 +7,212 @@ import urllib.parse
  from datetime import datetime, timedelta
  import re
  import os
-
  # List of user agents to rotate through
  _useragent_list = [
-     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
-     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
-     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
-     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
-     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
-     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
  ]
-
  API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
  headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_TOKEN')}"}
-
  def query_llama(payload):
-     """Send a query to the Llama model via Hugging Face API"""
-     try:
-         print(f"Payload: {payload}") # Debug: Print payload
-         response = requests.post(API_URL, headers=headers, json=payload)
-         response.raise_for_status()
-         return response.json()
-     except requests.exceptions.RequestException as e:
-         print(f"Error querying Llama model: {e}")
-         return None
  def google_search(term, num_results=1, lang="en", timeout=30, safe="active", ssl_verify=None, days_back=90):
-     """Perform a Google search and return results"""
-     print(f"Searching for term: {term}")
-
-     # Calculate the date range
-     end_date = datetime.now()
-     start_date = end_date - timedelta(days=days_back)
-
-     # Format dates as strings
-     start_date_str = start_date.strftime("%Y-%m-%d")
-     end_date_str = end_date.strftime("%Y-%m-%d")
-
-     # Add the date range to the search term
-     search_term = f"{term} financial earnings report after:{start_date_str} before:{end_date_str}"
-
-     escaped_term = urllib.parse.quote_plus(search_term)
-     start = 0
-     all_results = []
-     max_attempts = num_results * 2 # Allow for some failed attempts
-
-     with requests.Session() as session:
-         attempts = 0
-         while len(all_results) < num_results and attempts < max_attempts:
-             try:
-                 # Choose a random user agent
-                 user_agent = random.choice(_useragent_list)
-                 headers = {'User-Agent': user_agent}
-
-                 resp = session.get(
-                     url="https://www.google.com/search",
-                     headers=headers,
-                     params={
-                         "q": search_term,
-                         "num": num_results - len(all_results),
-                         "hl": lang,
-                         "start": start,
-                         "safe": safe,
-                     },
-                     timeout=timeout,
-                     verify=ssl_verify,
-                 )
-                 resp.raise_for_status()
-
-                 soup = BeautifulSoup(resp.text, "html.parser")
-                 result_block = soup.find_all("div", attrs={"class": "g"})
-
-                 if not result_block:
-                     print("No more results found.")
-                     break
-
-                 for result in result_block:
-                     if len(all_results) >= num_results:
-                         break
-                     link = result.find("a", href=True)
-                     if link:
-                         link = link["href"]
-                         print(f"Found link: {link}")
-                         try:
-                             webpage = session.get(link, headers=headers, timeout=timeout)
-                             webpage.raise_for_status()
-                             visible_text = extract_text_from_webpage(webpage.text)
-                             all_results.append({"link": link, "text": visible_text})
-                         except requests.exceptions.HTTPError as e:
-                             if e.response.status_code == 403:
-                                 print(f"403 Forbidden error for {link}, skipping...")
-                             else:
-                                 print(f"HTTP error {e.response.status_code} for {link}, skipping...")
-                         except requests.exceptions.RequestException as e:
-                             print(f"Error fetching or processing {link}: {e}")
-                     else:
-                         print("No link found in result.")
-
-                 start += len(result_block)
-                 attempts += 1
-             except requests.exceptions.RequestException as e:
-                 print(f"Error fetching search results: {e}")
-                 attempts += 1
-
-     print(f"Total results fetched: {len(all_results)}")
-     return all_results
-
  def extract_text_from_webpage(html_content):
-     """Extract visible text from HTML content"""
-     soup = BeautifulSoup(html_content, 'html.parser')
-
-     # Remove script and style elements
-     for script in soup(["script", "style"]):
-         script.decompose()
-
-     # Get text
-     text = soup.get_text()
-
-     # Break into lines and remove leading and trailing space on each
-     lines = (line.strip() for line in text.splitlines())
-
-     # Break multi-headlines into a line each
-     chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
-
-     # Drop blank lines
-     text = '\n'.join(chunk for chunk in chunks if chunk)
-
-     return text
-
  def filter_relevant_content(text):
-     """Filter out irrelevant content"""
-     # List of keywords related to financial reports
-     keywords = ['revenue', 'profit', 'earnings', 'financial', 'quarter', 'fiscal', 'growth', 'income', 'loss', 'dividend']
-
-     # Split the text into sentences
-     sentences = re.split(r'(?<=[.!?])\s+', text)
-
-     # Filter sentences containing at least one keyword
-     relevant_sentences = [sentence for sentence in sentences if any(keyword in sentence.lower() for keyword in keywords)]
-
-     # Join the relevant sentences back into a single string
-     filtered_text = ' '.join(relevant_sentences)
-
-     return filtered_text
-
  def chunk_text(text, max_chunk_size=1000, overlap=100):
-     # List of keywords that might indicate new sections
-     section_keywords = ["revenue", "income", "profit", "loss", "expenses", "outlook", "forecast", "quarter", "year"]
-
-     # Split text into sentences
-     sentences = re.split(r'(?<=[.!?])\s+', text)
-
-     chunks = []
-     current_chunk = ""
-
-     for sentence in sentences:
-         if len(current_chunk) + len(sentence) > max_chunk_size:
-             # If adding this sentence exceeds max_chunk_size, start a new chunk
-             chunks.append(current_chunk.strip())
-             current_chunk = sentence + " "
-         elif any(keyword in sentence.lower() for keyword in section_keywords):
-             # If sentence contains a section keyword, start a new chunk
-             if current_chunk:
-                 chunks.append(current_chunk.strip())
-             current_chunk = sentence + " "
-         else:
-             current_chunk += sentence + " "
-
-     # Add the last chunk if it's not empty
-     if current_chunk:
-         chunks.append(current_chunk.strip())
-
-     # Add overlap
-     overlapped_chunks = []
-     for i, chunk in enumerate(chunks):
-         if i > 0:
-             chunk = chunks[i-1][-overlap:] + chunk
-         if i < len(chunks) - 1:
-             chunk = chunk + chunks[i+1][:overlap]
-         overlapped_chunks.append(chunk)
-
-     return overlapped_chunks
-
- def summarize_financial_news(query):
-     """Search for financial news, extract relevant content, and summarize"""
-     search_results = google_search(query, num_results=1)
-
-     all_filtered_text = ""
-     for result in search_results:
-         if result['text']:
-             filtered_text = filter_relevant_content(result['text'])
-             all_filtered_text += filtered_text + "\n\n"
-
-     if not all_filtered_text:
-         return "No relevant financial information found."
-
-     # Chunk the filtered text
-     chunks = chunk_text(all_filtered_text, max_chunk_size=3000, overlap=200)
-
-     summaries = []
-     for chunk in chunks:
-         prompt = f"""You are a financial analyst. Summarize the following text from a financial perspective:
  {chunk}
- Provide a detailed, coherent summary focusing on financial implications and analysis."""
-
-         summary = query_llama({"inputs": prompt, "parameters": {"max_length": 1000}})
-         if summary and isinstance(summary, list) and 'generated_text' in summary[0]:
-             summaries.append(summary[0]['generated_text'])
-
-     # Combine summaries
-     combined_summary = "\n\n".join(summaries)
-
-     # Final summarization of combined summaries
-     final_prompt = f"""As a financial analyst, provide a coherent and comprehensive summary of the following financial information:
  {combined_summary}
  Focus on the most important financial implications and analysis."""
-
-     final_summary = query_llama({"inputs": final_prompt, "parameters": {"max_length": 3000}})
-
-     if final_summary and isinstance(final_summary, list) and 'generated_text' in final_summary[0]:
-         return final_summary[0]['generated_text']
-     else:
-         return "Unable to generate summary due to an error."
-

  # Gradio Interface
  iface = gr.Interface(
-     fn=summarize_financial_news,
-     inputs=gr.Textbox(lines=2, placeholder="Enter a company name or financial topic..."),
-     outputs="text",
-     title="Financial News Summarizer",
-     description="Enter a company name or financial topic to get a summary of recent financial news."
  )
-
- iface.launch()

  from datetime import datetime, timedelta
  import re
  import os
+ import PyPDF2
  # List of user agents to rotate through
  _useragent_list = [
+     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
+     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
+     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
+     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
  ]
  API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
  headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_TOKEN')}"}
  def query_llama(payload):
+     """Send a query to the Llama model via Hugging Face API"""
+     try:
+         print(f"Payload: {payload}") # Debug: Print payload
+         response = requests.post(API_URL, headers=headers, json=payload)
+         response.raise_for_status()
+         return response.json()
+     except requests.exceptions.RequestException as e:
+         print(f"Error querying Llama model: {e}")
+         return None
  def google_search(term, num_results=1, lang="en", timeout=30, safe="active", ssl_verify=None, days_back=90):
+     """Perform a Google search and return results"""
+     print(f"Searching for term: {term}")
+     # Calculate the date range
+     end_date = datetime.now()
+     start_date = end_date - timedelta(days=days_back)
+     # Format dates as strings
+     start_date_str = start_date.strftime("%Y-%m-%d")
+     end_date_str = end_date.strftime("%Y-%m-%d")
+     # Add the date range to the search term
+     search_term = f"{term} financial earnings report after:{start_date_str} before:{end_date_str}"
+     escaped_term = urllib.parse.quote_plus(search_term)
+     start = 0
+     all_results = []
+     max_attempts = num_results * 2 # Allow for some failed attempts
+     with requests.Session() as session:
+         attempts = 0
+         while len(all_results) < num_results and attempts < max_attempts:
+             try:
+                 # Choose a random user agent
+                 user_agent = random.choice(_useragent_list)
+                 headers = {'User-Agent': user_agent}
+                 resp = session.get(
+                     url="https://www.google.com/search",
+                     headers=headers,
+                     params={
+                         "q": search_term,
+                         "num": num_results - len(all_results),
+                         "hl": lang,
+                         "start": start,
+                         "safe": safe,
+                     },
+                     timeout=timeout,
+                     verify=ssl_verify,
+                 )
+                 resp.raise_for_status()
+                 soup = BeautifulSoup(resp.text, "html.parser")
+                 result_block = soup.find_all("div", attrs={"class": "g"})
+                 if not result_block:
+                     print("No more results found.")
+                     break
+                 for result in result_block:
+                     if len(all_results) >= num_results:
+                         break
+                     link = result.find("a", href=True)
+                     if link:
+                         link = link["href"]
+                         print(f"Found link: {link}")
+                         try:
+                             webpage = session.get(link, headers=headers, timeout=timeout)
+                             webpage.raise_for_status()
+                             visible_text = extract_text_from_webpage(webpage.text)
+                             all_results.append({"link": link, "text": visible_text})
+                         except requests.exceptions.HTTPError as e:
+                             if e.response.status_code == 403:
+                                 print(f"403 Forbidden error for {link}, skipping...")
+                             else:
+                                 print(f"HTTP error {e.response.status_code} for {link}, skipping...")
+                         except requests.exceptions.RequestException as e:
+                             print(f"Error fetching or processing {link}: {e}")
+                     else:
+                         print("No link found in result.")
+                 start += len(result_block)
+                 attempts += 1
+             except requests.exceptions.RequestException as e:
+                 print(f"Error fetching search results: {e}")
+                 attempts += 1
+     print(f"Total results fetched: {len(all_results)}")
+     return all_results
  def extract_text_from_webpage(html_content):
+     """Extract visible text from HTML content"""
+     soup = BeautifulSoup(html_content, 'html.parser')
+     # Remove script and style elements
+     for script in soup(["script", "style"]):
+         script.decompose()
+     # Get text
+     text = soup.get_text()
+     # Break into lines and remove leading and trailing space on each
+     lines = (line.strip() for line in text.splitlines())
+     # Break multi-headlines into a line each
+     chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
+     # Drop blank lines
+     text = '\n'.join(chunk for chunk in chunks if chunk)
+     return text
  def filter_relevant_content(text):
+     """Filter out irrelevant content"""
+     # List of keywords related to financial reports
+     keywords = ['revenue', 'profit', 'earnings', 'financial', 'quarter', 'fiscal', 'growth', 'income', 'loss', 'dividend']
+     # Split the text into sentences
+     sentences = re.split(r'(?<=[.!?])\s+', text)
+     # Filter sentences containing at least one keyword
+     relevant_sentences = [sentence for sentence in sentences if any(keyword in sentence.lower() for keyword in keywords)]
+     # Join the relevant sentences back into a single string
+     filtered_text = ' '.join(relevant_sentences)
+     return filtered_text
  def chunk_text(text, max_chunk_size=1000, overlap=100):
+     # List of keywords that might indicate new sections
+     section_keywords = ["revenue", "income", "profit", "loss", "expenses", "outlook", "forecast", "quarter", "year"]
+     # Split text into sentences
+     sentences = re.split(r'(?<=[.!?])\s+', text)
+     chunks = []
+     current_chunk = ""
+     for sentence in sentences:
+         if len(current_chunk) + len(sentence) > max_chunk_size:
+             # If adding this sentence exceeds max_chunk_size, start a new chunk
+             chunks.append(current_chunk.strip())
+             current_chunk = sentence + " "
+         elif any(keyword in sentence.lower() for keyword in section_keywords):
+             # If sentence contains a section keyword, start a new chunk
+             if current_chunk:
+                 chunks.append(current_chunk.strip())
+             current_chunk = sentence + " "
+         else:
+             current_chunk += sentence + " "
+     # Add the last chunk if it's not empty
+     if current_chunk:
+         chunks.append(current_chunk.strip())
+     # Add overlap
+     overlapped_chunks = []
+     for i, chunk in enumerate(chunks):
+         if i > 0:
+             chunk = chunks[i-1][-overlap:] + chunk
+         if i < len(chunks) - 1:
+             chunk = chunk + chunks[i+1][:overlap]
+         overlapped_chunks.append(chunk)
+     return overlapped_chunks
+ def summarize_text(text, context_instructions):
+     chunks = chunk_text(text, max_chunk_size=3000, overlap=200)
+     summaries = []
+     for chunk in chunks:
+         prompt = f"""You are a financial analyst. Summarize the following text from a financial perspective:
  {chunk}
+ {context_instructions}"""
+         summary = query_llama({"inputs": prompt, "parameters": {"max_length": 1000}})
+         if summary and isinstance(summary, list) and 'generated_text' in summary[0]:
+             summaries.append(summary[0]['generated_text'])
+     # Combine summaries
+     combined_summary = "\n\n".join(summaries)
+     # Final summarization of combined summaries
+     final_prompt = f"""As a financial analyst, provide a coherent and comprehensive summary of the following financial information:
  {combined_summary}
  Focus on the most important financial implications and analysis."""
+     final_summary = query_llama({"inputs": final_prompt, "parameters": {"max_length": 3000}})
+     if final_summary and isinstance(final_summary, list) and 'generated_text' in final_summary[0]:
+         return final_summary[0]['generated_text']
+     else:
+         return "Unable to generate summary due to an error."
+ def summarize_financial_news(query, read_pdf=False, pdf=None):
+     """Search for financial news, extract relevant content, and summarize"""
+     all_filtered_text = ""
+     if read_pdf and pdf is not None:
+         pdf_text = extract_text_from_pdf(pdf)
+         all_filtered_text += pdf_text + "\n\n"
+     else:
+         search_results = google_search(query, num_results=1)
+         for result in search_results:
+             if result['text']:
+                 filtered_text = filter_relevant_content(result['text'])
+                 all_filtered_text += filtered_text + "\n\n"
+     if not all_filtered_text:
+         return "No relevant financial information found."
+     context_instructions = "Provide a detailed, coherent summary focusing on financial implications and analysis."
+     return summarize_text(all_filtered_text, context_instructions)
+ def extract_text_from_pdf(pdf):
+     """Extract text from each page of the PDF"""
+     reader = PyPDF2.PdfFileReader(pdf)
+     text = ""
+     for page_num in range(reader.getNumPages()):
+         page = reader.getPage(page_num)
+         text += page.extract_text() + "\n"
+     return text
  # Gradio Interface
+ def interface_function(query, read_pdf, pdf):
+     return summarize_financial_news(query, read_pdf, pdf)
  iface = gr.Interface(
+     fn=interface_function,
+     inputs=[
+         gr.Textbox(lines=2, placeholder="Enter a company name or financial topic..."),
+         gr.Checkbox(label="Read PDF"),
+         gr.File(label="Upload PDF", type="file")
+     ],
+     outputs="text",
+     title="Financial News Summarizer",
+     description="Enter a company name or financial topic to get a summary of recent financial news. Optionally, upload a PDF to summarize its content."
  )
+ iface.launch()
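
Note on the added PDF helper: extract_text_from_pdf mixes PyPDF2's legacy reader names (PdfFileReader, getNumPages, getPage) with the newer extract_text() call; PyPDF2 3.x and its successor pypdf removed the legacy names. The commit does not pin a PyPDF2 version, so if the Space happens to run a recent release, a minimal sketch of the same helper against the current PdfReader API might look like this (the PyPDF2 >= 3.0 requirement is an assumption, not something stated in this commit):

    import PyPDF2

    def extract_text_from_pdf(pdf):
        """Extract text from each page of the PDF (assumes PyPDF2 >= 3.0)."""
        reader = PyPDF2.PdfReader(pdf)  # PdfFileReader/getNumPages/getPage were removed in 3.0
        text = ""
        for page in reader.pages:  # reader.pages replaces getNumPages()/getPage()
            text += (page.extract_text() or "") + "\n"  # guard in case a page yields no extractable text
        return text

The rest of the commit would work unchanged, since summarize_financial_news only consumes the returned string.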