import requests
from bs4 import BeautifulSoup
import gradio as gr
import random
from datetime import datetime, timedelta
import re
import os

# List of user agents to rotate through
_useragent_list = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
]

API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_TOKEN')}"}


def query_llama(payload):
    """Send a query to the Llama model via the Hugging Face Inference API."""
    response = requests.post(API_URL, headers=headers, json=payload)
    # On success this is a list of generations; on failure, a dict with an "error" key.
    return response.json()


def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_verify=None, days_back=90):
    """Perform a Google search and return results with extracted page text."""
    print(f"Searching for term: {term}")

    # Restrict results to the last `days_back` days using Google's
    # after:/before: date operators
    end_date = datetime.now()
    start_date = end_date - timedelta(days=days_back)
    start_date_str = start_date.strftime("%Y-%m-%d")
    end_date_str = end_date.strftime("%Y-%m-%d")
    search_term = f"{term} financial earnings report after:{start_date_str} before:{end_date_str}"

    start = 0
    all_results = []

    with requests.Session() as session:
        while len(all_results) < num_results:
            try:
                # Choose a random user agent to reduce the chance of being blocked
                user_agent = random.choice(_useragent_list)
                headers = {"User-Agent": user_agent}
                resp = session.get(
                    url="https://www.google.com/search",
                    headers=headers,
                    params={
                        "q": search_term,
                        "num": num_results - start,
                        "hl": lang,
                        "start": start,
                        "safe": safe,
                    },
                    timeout=timeout,
                    verify=ssl_verify,
                )
                resp.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"Error fetching search results: {e}")
                break

            soup = BeautifulSoup(resp.text, "html.parser")
            result_block = soup.find_all("div", attrs={"class": "g"})
            if not result_block:
                print("No more results found.")
                break

            for result in result_block:
                if len(all_results) >= num_results:
                    break
                link = result.find("a", href=True)
                if link:
                    link = link["href"]
                    print(f"Found link: {link}")
                    try:
                        webpage = session.get(link, headers=headers, timeout=timeout)
                        webpage.raise_for_status()
                        visible_text = extract_text_from_webpage(webpage.text)
                        all_results.append({"link": link, "text": visible_text})
                    except requests.exceptions.RequestException as e:
                        print(f"Error fetching or processing {link}: {e}")
                        all_results.append({"link": link, "text": None})
                else:
                    print("No link found in result.")
                    all_results.append({"link": None, "text": None})

            start += len(result_block)

    print(f"Total results fetched: {len(all_results)}")
    return all_results


def extract_text_from_webpage(html_content):
    """Extract visible text from HTML content."""
    soup = BeautifulSoup(html_content, "html.parser")
    # Remove script and style elements
    for script in soup(["script", "style"]):
        script.decompose()
    # Get text
    text = soup.get_text()
    # Break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # Break multi-headlines into a line each (split on runs of two spaces)
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # Drop blank lines
    text = "\n".join(chunk for chunk in chunks if chunk)
    return text


def filter_relevant_content(text):
    """Filter out irrelevant content, keeping only sentences with financial keywords."""
    # Keywords related to financial reports
    keywords = ['revenue', 'profit', 'earnings', 'financial', 'quarter',
                'fiscal', 'growth', 'income', 'loss', 'dividend']
    # Split the text into sentences
    sentences = re.split(r'(?<=[.!?])\s+', text)
    # Keep sentences containing at least one keyword
    relevant_sentences = [
        sentence for sentence in sentences
        if any(keyword in sentence.lower() for keyword in keywords)
    ]
    # Join the relevant sentences back into a single string
    return ' '.join(relevant_sentences)


def summarize_financial_news(query):
    """Search for financial news, extract relevant content, and summarize."""
    search_results = google_search(query, num_results=3)

    all_filtered_text = ""
    for result in search_results:
        if result['text']:
            filtered_text = filter_relevant_content(result['text'])
            all_filtered_text += filtered_text + "\n\n"

    if not all_filtered_text.strip():
        return "No relevant financial information found."

    prompt = f"""You are a financial analyst. Summarize the following text from a financial perspective:

{all_filtered_text}

Provide a detailed, coherent summary focusing on financial implications and analysis."""

    # max_new_tokens caps the length of the generated summary;
    # return_full_text=False keeps the prompt out of the returned text.
    response = query_llama({
        "inputs": prompt,
        "parameters": {"max_new_tokens": 500, "return_full_text": False},
    })

    # Guard against API errors (e.g., while the model is still loading),
    # which come back as a dict rather than a list of generations.
    if isinstance(response, dict) and "error" in response:
        return f"Model error: {response['error']}"
    return response[0]['generated_text']


# Gradio interface
iface = gr.Interface(
    fn=summarize_financial_news,
    inputs=gr.Textbox(lines=2, placeholder="Enter a company name or financial topic..."),
    outputs="text",
    title="Financial News Summarizer",
    description="Enter a company name or financial topic to get a summary of recent financial news.",
)

iface.launch()