Shreyas094's picture
Update app.py
a7932b8 verified
raw
history blame
6.82 kB
import requests
from bs4 import BeautifulSoup
import gradio as gr
from huggingface_hub import InferenceClient
import random
import urllib.parse
from datetime import datetime, timedelta
import re
import os
# List of user agents to rotate through
# google_search picks one entry at random for each search request, so that
# consecutive requests do not all carry the same User-Agent string.
_useragent_list = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
]
# Hugging Face Inference API endpoint for the Meta-Llama-3-8B-Instruct model.
API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
# Authorization header used by query_llama; the token is read from the
# HUGGINGFACE_TOKEN environment variable when this module is loaded.
# NOTE(review): if the variable is unset the header value becomes
# "Bearer None" -- confirm the deployment always provides the token.
headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_TOKEN')}"}
def query_llama(payload, timeout=60):
    """Send a query to the Llama model via the Hugging Face Inference API.

    Args:
        payload: JSON-serializable request body, e.g.
            ``{"inputs": "...", "parameters": {...}}``.
        timeout: Seconds to wait for the API before giving up. ``requests``
            applies no timeout by default, so without this a stalled
            connection would block the caller indefinitely.

    Returns:
        The decoded JSON body of the response. The HTTP status is not
        checked here, so error payloads are returned to the caller as-is.

    Raises:
        requests.exceptions.RequestException: On connection failure or
            when the timeout expires.
        ValueError: If the response body is not valid JSON.
    """
    response = requests.post(
        API_URL,
        headers=headers,
        json=payload,
        timeout=timeout,
    )
    return response.json()
def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_verify=None, days_back=90):
    """Perform a Google search and return the visible text of each hit.

    The query is ``term`` plus the words "financial earnings report",
    restricted to the last ``days_back`` days with Google's ``after:`` /
    ``before:`` operators.

    Args:
        term: Company name or topic to search for.
        num_results: Maximum number of result entries to return.
        lang: Value sent as Google's ``hl`` (interface language) parameter.
        timeout: Per-request timeout in seconds, used for the search request
            and for every result-page fetch.
        safe: Value sent as Google's ``safe`` (SafeSearch) parameter.
        ssl_verify: Passed to ``requests`` as ``verify`` for the search
            request and the result-page fetches; ``None`` keeps the session
            default.
        days_back: Size of the date window, in days, ending today.

    Returns:
        A list of at most ``num_results`` dicts with keys ``"link"`` and
        ``"text"``. ``"text"`` is ``None`` when the page could not be
        fetched; both are ``None`` when a result block had no link.
    """
    print(f"Searching for term: {term}")

    # Calculate the date range and format it for the search operators.
    end_date = datetime.now()
    start_date = end_date - timedelta(days=days_back)
    start_date_str = start_date.strftime("%Y-%m-%d")
    end_date_str = end_date.strftime("%Y-%m-%d")

    # No manual URL-escaping here: requests encodes ``params`` itself, and
    # pre-escaping the term would double-encode it.
    search_term = f"{term} financial earnings report after:{start_date_str} before:{end_date_str}"

    start = 0
    all_results = []
    with requests.Session() as session:
        while len(all_results) < num_results:
            # Choose a random user agent. Named ``request_headers`` so it
            # does not shadow the module-level API ``headers``.
            request_headers = {'User-Agent': random.choice(_useragent_list)}
            try:
                resp = session.get(
                    url="https://www.google.com/search",
                    headers=request_headers,
                    params={
                        "q": search_term,
                        "num": num_results - start,
                        "hl": lang,
                        "start": start,
                        "safe": safe,
                    },
                    timeout=timeout,
                    verify=ssl_verify,
                )
                resp.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"Error fetching search results: {e}")
                break

            soup = BeautifulSoup(resp.text, "html.parser")
            result_block = soup.find_all("div", attrs={"class": "g"})
            if not result_block:
                print("No more results found.")
                break

            for result in result_block:
                if len(all_results) >= num_results:
                    break

                anchor = result.find("a", href=True)
                if not anchor:
                    print("No link found in result.")
                    all_results.append({"link": None, "text": None})
                    continue

                link = anchor["href"]
                print(f"Found link: {link}")
                try:
                    # Honour ssl_verify here too, so page fetches behave
                    # consistently with the search request above.
                    webpage = session.get(
                        link,
                        headers=request_headers,
                        timeout=timeout,
                        verify=ssl_verify,
                    )
                    webpage.raise_for_status()
                    visible_text = extract_text_from_webpage(webpage.text)
                    all_results.append({"link": link, "text": visible_text})
                except requests.exceptions.RequestException as e:
                    # Best effort: keep the link so the caller can see what
                    # was found, even though its text is unavailable.
                    print(f"Error fetching or processing {link}: {e}")
                    all_results.append({"link": link, "text": None})

            # Every result block yields exactly one entry, so this advances
            # the paging offset in step with len(all_results).
            start += len(result_block)

    print(f"Total results fetched: {len(all_results)}")
    return all_results
def extract_text_from_webpage(html_content):
    """Extract visible text from HTML content.

    Args:
        html_content: HTML markup as a string.

    Returns:
        The page text with ``<script>`` and ``<style>`` contents removed,
        one non-empty phrase per line.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Remove script and style elements so their source is not returned as text.
    for tag in soup(["script", "style"]):
        tag.decompose()

    text = soup.get_text()

    # Break into lines and remove leading and trailing space on each.
    lines = (line.strip() for line in text.splitlines())

    # Break multi-headlines into a line each. The delimiter is a DOUBLE
    # space: splitting on a single space would put every word on its own
    # line and destroy the sentences that later filtering relies on.
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))

    # Drop blank lines.
    return '\n'.join(chunk for chunk in chunks if chunk)
# Default keywords that mark a sentence as related to financial reports.
_DEFAULT_FINANCIAL_KEYWORDS = (
    'revenue', 'profit', 'earnings', 'financial', 'quarter',
    'fiscal', 'growth', 'income', 'loss', 'dividend',
)

# A sentence ends at '.', '!' or '?' followed by whitespace.
_SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')


def filter_relevant_content(text, keywords=None):
    """Keep only the sentences of ``text`` that mention a keyword.

    Args:
        text: Text to filter. ``None`` or an empty string yields ``""``.
        keywords: Optional iterable of keywords to look for. Defaults to a
            built-in list of financial-report terms. Matching is a
            case-insensitive substring test, so ``'profit'`` also matches
            ``'Profits'``.

    Returns:
        The matching sentences, in their original order, joined by single
        spaces; ``""`` when nothing matches.
    """
    if not text:
        return ""
    if keywords is None:
        keywords = _DEFAULT_FINANCIAL_KEYWORDS

    # Lower-case the keywords once so the comparison below is
    # case-insensitive on both sides.
    needles = [keyword.lower() for keyword in keywords]

    relevant_sentences = []
    for sentence in _SENTENCE_BOUNDARY.split(text):
        lowered = sentence.lower()
        if any(needle in lowered for needle in needles):
            relevant_sentences.append(sentence)

    return ' '.join(relevant_sentences)
def summarize_financial_news(query):
    """Search for financial news, extract relevant content, and summarize.

    Args:
        query: Company name or financial topic entered by the user.

    Returns:
        The model-generated summary, or a human-readable message when no
        relevant content was found or the model could not be queried.
    """
    search_results = google_search(query, num_results=3)

    # Collect only non-empty filtered sections. Appending a separator for
    # empty ones would make the combined text truthy and send the model a
    # prompt with no content instead of reporting "nothing found".
    relevant_sections = []
    for result in search_results:
        if not result['text']:
            continue
        filtered_text = filter_relevant_content(result['text'])
        if filtered_text:
            relevant_sections.append(filtered_text)

    if not relevant_sections:
        return "No relevant financial information found."

    all_filtered_text = "\n\n".join(relevant_sections)
    prompt = (
        "You are a financial analyst. Summarize the following text from a financial perspective:\n"
        f"{all_filtered_text}\n"
        "Provide a detailed, coherent summary focusing on financial implications and analysis."
    )

    # For text-generation models the Inference API's length control is
    # ``max_new_tokens`` (``max_length`` belongs to other tasks), and
    # ``return_full_text=False`` stops the prompt from being echoed back
    # in front of the summary.
    payload = {
        "inputs": prompt,
        "parameters": {"max_new_tokens": 500, "return_full_text": False},
    }
    try:
        summary = query_llama(payload)
    except (requests.exceptions.RequestException, ValueError) as e:
        # Network failure, timeout, or a non-JSON response body.
        return f"Error contacting the summarization model: {e}"

    # Success is a list like [{"generated_text": ...}]; failures (for
    # example while the model is loading) come back as {"error": ...}.
    if isinstance(summary, list) and summary:
        first = summary[0]
        if isinstance(first, dict) and 'generated_text' in first:
            return first['generated_text']
    if isinstance(summary, dict) and 'error' in summary:
        return f"Error from the summarization model: {summary['error']}"
    return "Unexpected response from the summarization model."
# Gradio Interface
# One two-line text box as input and plain text as output; each submission
# is passed to summarize_financial_news and its return value is displayed.
iface = gr.Interface(
fn=summarize_financial_news,
inputs=gr.Textbox(lines=2, placeholder="Enter a company name or financial topic..."),
outputs="text",
title="Financial News Summarizer",
description="Enter a company name or financial topic to get a summary of recent financial news."
)
# Start the web app.
# NOTE(review): this runs whenever the module is loaded (there is no
# ``if __name__ == "__main__"`` guard), so importing this file also launches it.
iface.launch()