import requests
from bs4 import BeautifulSoup
import gradio as gr
import random
import urllib.parse
from datetime import datetime, timedelta
import re
import os
# List of user agents to rotate through so repeated requests are less likely
# to be blocked as automated traffic
_useragent_list = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
]
API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_TOKEN')}"}
def query_llama(payload):
    """Send a query to the Llama model via the Hugging Face Inference API"""
    response = requests.post(API_URL, headers=headers, json=payload)
    response.raise_for_status()  # surface auth/rate-limit errors instead of silently returning an error body
    return response.json()
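
# A minimal usage sketch for query_llama (illustrative only; nothing in the
# app calls this). The indexing assumes the standard text-generation response,
# a list like [{"generated_text": "..."}]; while the model is still loading,
# the API instead returns a dict such as {"error": "...", "estimated_time": ...}.
def _example_query_llama():
    result = query_llama({"inputs": "Briefly define revenue.", "parameters": {"max_new_tokens": 50}})
    return result[0]["generated_text"]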
def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_verify=None, days_back=90):
    """Perform a Google search and return results"""
    print(f"Searching for term: {term}")

    # Calculate the date range
    end_date = datetime.now()
    start_date = end_date - timedelta(days=days_back)

    # Format dates as strings
    start_date_str = start_date.strftime("%Y-%m-%d")
    end_date_str = end_date.strftime("%Y-%m-%d")

    # Add the date range to the search term; requests URL-encodes the query
    # via the params argument, so no manual quoting is needed
    search_term = f"{term} financial earnings report after:{start_date_str} before:{end_date_str}"

    start = 0
    all_results = []

    with requests.Session() as session:
        while len(all_results) < num_results:
            try:
                # Choose a random user agent
                user_agent = random.choice(_useragent_list)
                headers = {'User-Agent': user_agent}
                resp = session.get(
                    url="https://www.google.com/search",
                    headers=headers,
                    params={
                        "q": search_term,
                        "num": num_results - len(all_results),  # only request the remaining results
                        "hl": lang,
                        "start": start,
                        "safe": safe,
                    },
                    timeout=timeout,
                    verify=ssl_verify,
                )
                resp.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"Error fetching search results: {e}")
                break

            soup = BeautifulSoup(resp.text, "html.parser")
            result_block = soup.find_all("div", attrs={"class": "g"})
            if not result_block:
                print("No more results found.")
                break

            for result in result_block:
                if len(all_results) >= num_results:
                    break
                link = result.find("a", href=True)
                if link:
                    link = link["href"]
                    print(f"Found link: {link}")
                    try:
                        webpage = session.get(link, headers=headers, timeout=timeout)
                        webpage.raise_for_status()
                        visible_text = extract_text_from_webpage(webpage.text)
                        all_results.append({"link": link, "text": visible_text})
                    except requests.exceptions.RequestException as e:
                        print(f"Error fetching or processing {link}: {e}")
                        all_results.append({"link": link, "text": None})
                else:
                    print("No link found in result.")
                    all_results.append({"link": None, "text": None})

            start += len(result_block)

    print(f"Total results fetched: {len(all_results)}")
    return all_results
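
# Quick standalone check for google_search (hypothetical query; not part of
# the app flow). Note that scraping Google's HTML is inherently brittle: the
# "div.g" result selector can change without notice, and repeated automated
# requests may be answered with a CAPTCHA page that contains no result blocks.
def _example_google_search():
    for r in google_search("NVIDIA", num_results=2):
        print(r["link"], "->", (r["text"] or "")[:80])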
def extract_text_from_webpage(html_content):
    """Extract visible text from HTML content"""
    soup = BeautifulSoup(html_content, 'html.parser')

    # Remove script and style elements
    for script in soup(["script", "style"]):
        script.decompose()

    # Get text
    text = soup.get_text()

    # Break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())

    # Break multi-headlines (separated by runs of spaces) into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))

    # Drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)
    return text
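
# Small self-check for extract_text_from_webpage on an inline snippet
# (hypothetical input; not called by the app): the script body is dropped,
# while the visible headline and paragraph text survive, one chunk per line.
def _example_extract_text():
    html = "<html><body><script>var x = 1;</script>\n<h1>Q3 Results</h1>\n<p>Revenue grew 10%.</p></body></html>"
    return extract_text_from_webpage(html)  # -> "Q3 Results\nRevenue grew 10%."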
def filter_relevant_content(text):
    """Filter out irrelevant content"""
    # List of keywords related to financial reports
    keywords = ['revenue', 'profit', 'earnings', 'financial', 'quarter', 'fiscal', 'growth', 'income', 'loss', 'dividend']

    # Split the text into sentences
    sentences = re.split(r'(?<=[.!?])\s+', text)

    # Filter sentences containing at least one keyword
    relevant_sentences = [sentence for sentence in sentences if any(keyword in sentence.lower() for keyword in keywords)]

    # Join the relevant sentences back into a single string
    filtered_text = ' '.join(relevant_sentences)
    return filtered_text
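
# A tiny illustration of filter_relevant_content (hypothetical input; not
# called by the app): only sentences mentioning a finance keyword survive.
def _example_filter():
    text = "Revenue rose 12% in the quarter. The CEO enjoys sailing. Net income doubled."
    return filter_relevant_content(text)  # -> "Revenue rose 12% in the quarter. Net income doubled."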
def summarize_financial_news(query):
    """Search for financial news, extract relevant content, and summarize"""
    search_results = google_search(query, num_results=3)

    all_filtered_text = ""
    for result in search_results:
        if result['text']:
            filtered_text = filter_relevant_content(result['text'])
            all_filtered_text += filtered_text + "\n\n"

    if not all_filtered_text:
        return "No relevant financial information found."

    prompt = f"""You are a financial analyst. Summarize the following text from a financial perspective:

{all_filtered_text}

Provide a detailed, coherent summary focusing on financial implications and analysis."""

    # The text-generation task expects max_new_tokens; max_length is not a
    # recognized parameter for this endpoint
    summary = query_llama({"inputs": prompt, "parameters": {"max_new_tokens": 500}})

    # Guard against error payloads, which are dicts rather than the expected list
    if isinstance(summary, list) and summary and "generated_text" in summary[0]:
        return summary[0]["generated_text"]
    return f"Unexpected API response: {summary}"
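
# End-to-end sketch (hypothetical topic; defined but never executed at import
# time). A real run needs network access and a valid HUGGINGFACE_TOKEN.
def _example_summarize():
    return summarize_financial_news("Apple quarterly earnings")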
# Gradio Interface
iface = gr.Interface(
    fn=summarize_financial_news,
    inputs=gr.Textbox(lines=2, placeholder="Enter a company name or financial topic..."),
    outputs="text",
    title="Financial News Summarizer",
    description="Enter a company name or financial topic to get a summary of recent financial news."
)

iface.launch()