import requests
from bs4 import BeautifulSoup
import gradio as gr
from huggingface_hub import InferenceClient
import random
import urllib.parse
from datetime import datetime, timedelta
import re
import os
import PyPDF2

# List of user agents to rotate through
_useragent_list = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
]

API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_TOKEN')}"}


def query_llama(payload):
    """Send a query to the Llama model via Hugging Face API"""
    try:
        print(f"Payload: {payload}")  # Debug: Print payload
        response = requests.post(API_URL, headers=headers, json=payload)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error querying Llama model: {e}")
        return None


def google_search(term, num_results=1, lang="en", timeout=30, safe="active", ssl_verify=None, days_back=90):
    """Perform a Google search and return results"""
    print(f"Searching for term: {term}")

    # Calculate the date range
    end_date = datetime.now()
    start_date = end_date - timedelta(days=days_back)

    # Format dates as strings
    start_date_str = start_date.strftime("%Y-%m-%d")
    end_date_str = end_date.strftime("%Y-%m-%d")

    # Add the date range to the search term
    search_term = f"{term} financial earnings report after:{start_date_str} before:{end_date_str}"
    escaped_term = urllib.parse.quote_plus(search_term)
    start = 0
    all_results = []
    max_attempts = num_results * 2  # Allow for some failed attempts

    with requests.Session() as session:
        attempts = 0
        while len(all_results) < num_results and attempts < max_attempts:
            try:
                # Choose a random user agent
                user_agent = random.choice(_useragent_list)
                headers = {'User-Agent': user_agent}

                resp = session.get(
                    url="https://www.google.com/search",
                    headers=headers,
                    params={
                        "q": search_term,
                        "num": num_results - len(all_results),
                        "hl": lang,
                        "start": start,
                        "safe": safe,
                    },
                    timeout=timeout,
                    verify=ssl_verify,
                )
                resp.raise_for_status()

                soup = BeautifulSoup(resp.text, "html.parser")
                result_block = soup.find_all("div", attrs={"class": "g"})

                if not result_block:
                    print("No more results found.")
                    break

                for result in result_block:
                    if len(all_results) >= num_results:
                        break
                    link = result.find("a", href=True)
                    if link:
                        link = link["href"]
                        print(f"Found link: {link}")
                        try:
                            webpage = session.get(link, headers=headers, timeout=timeout)
                            webpage.raise_for_status()
                            visible_text = extract_text_from_webpage(webpage.text)
                            all_results.append({"link": link, "text": visible_text})
                        except requests.exceptions.HTTPError as e:
                            if e.response.status_code == 403:
                                print(f"403 Forbidden error for {link}, skipping...")
                            else:
                                print(f"HTTP error {e.response.status_code} for {link}, skipping...")
                        except requests.exceptions.RequestException as e:
                            print(f"Error fetching or processing {link}: {e}")
                    else:
                        print("No link found in result.")

                start += len(result_block)
                attempts += 1

            except requests.exceptions.RequestException as e:
                print(f"Error fetching search results: {e}")
                attempts += 1

    print(f"Total results fetched: {len(all_results)}")
    return all_results
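
# A minimal sketch of exercising the search helper on its own (this assumes network
# access and that Google still renders results inside "div.g" blocks, which can change
# without notice; the query string below is only an illustrative placeholder):
#
#     results = google_search("NVIDIA quarterly results", num_results=2, days_back=30)
#     for r in results:
#         print(r["link"], len(r["text"]))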

def extract_text_from_webpage(html_content):
    """Extract visible text from HTML content"""
    soup = BeautifulSoup(html_content, 'html.parser')

    # Remove script and style elements
    for script in soup(["script", "style"]):
        script.decompose()

    # Get text
    text = soup.get_text()

    # Break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())

    # Break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))

    # Drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)

    return text


def filter_relevant_content(text):
    """Filter out irrelevant content"""
    # List of keywords related to financial reports
    keywords = ['revenue', 'profit', 'earnings', 'financial', 'quarter', 'fiscal', 'growth',
                'income', 'loss', 'dividend']

    # Split the text into sentences
    sentences = re.split(r'(?<=[.!?])\s+', text)

    # Filter sentences containing at least one keyword
    relevant_sentences = [sentence for sentence in sentences
                          if any(keyword in sentence.lower() for keyword in keywords)]

    # Join the relevant sentences back into a single string
    filtered_text = ' '.join(relevant_sentences)

    return filtered_text


def chunk_text(text, max_chunk_size=1000, overlap=100):
    # List of keywords that might indicate new sections
    section_keywords = ["revenue", "income", "profit", "loss", "expenses", "outlook",
                        "forecast", "quarter", "year"]

    # Split text into sentences
    sentences = re.split(r'(?<=[.!?])\s+', text)

    chunks = []
    current_chunk = ""

    for sentence in sentences:
        if len(current_chunk) + len(sentence) > max_chunk_size:
            # If adding this sentence exceeds max_chunk_size, start a new chunk
            chunks.append(current_chunk.strip())
            current_chunk = sentence + " "
        elif any(keyword in sentence.lower() for keyword in section_keywords):
            # If sentence contains a section keyword, start a new chunk
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence + " "
        else:
            current_chunk += sentence + " "

    # Add the last chunk if it's not empty
    if current_chunk:
        chunks.append(current_chunk.strip())

    # Add overlap between neighbouring chunks
    overlapped_chunks = []
    for i, chunk in enumerate(chunks):
        if i > 0:
            chunk = chunks[i - 1][-overlap:] + chunk
        if i < len(chunks) - 1:
            chunk = chunk + chunks[i + 1][:overlap]
        overlapped_chunks.append(chunk)

    return overlapped_chunks
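
# Rough illustration of the chunker: a chunk closes when it would exceed max_chunk_size
# or when a sentence mentions a section keyword, and neighbouring chunks then share
# roughly `overlap` characters. The sample text below is a made-up placeholder.
#
#     pieces = chunk_text("Revenue rose 10 percent. Margins were stable. " * 50)
#     print(len(pieces), [len(p) for p in pieces[:3]])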

def summarize_text(text, context_instructions):
    chunks = chunk_text(text, max_chunk_size=3000, overlap=200)

    summaries = []
    for chunk in chunks:
        prompt = f"""You are a financial analyst.
Summarize the following text from a financial perspective:

{chunk}

{context_instructions}"""
        # Use the text-generation parameters recognized by the HF Inference API
        summary = query_llama({"inputs": prompt,
                               "parameters": {"max_new_tokens": 1000, "return_full_text": False}})
        if summary and isinstance(summary, list) and 'generated_text' in summary[0]:
            summaries.append(summary[0]['generated_text'])

    # Combine summaries
    combined_summary = "\n\n".join(summaries)

    # Final summarization of combined summaries
    final_prompt = f"""As a financial analyst, provide a coherent and comprehensive summary of the following financial information:

{combined_summary}

Focus on the most important financial implications and analysis."""
    final_summary = query_llama({"inputs": final_prompt,
                                 "parameters": {"max_new_tokens": 3000, "return_full_text": False}})

    if final_summary and isinstance(final_summary, list) and 'generated_text' in final_summary[0]:
        return final_summary[0]['generated_text']
    else:
        return "Unable to generate summary due to an error."


def summarize_financial_news(query, read_pdf=False, pdf=None):
    """Search for financial news, extract relevant content, and summarize"""
    all_filtered_text = ""

    if read_pdf and pdf is not None:
        pdf_text = extract_text_from_pdf(pdf)
        all_filtered_text += pdf_text + "\n\n"
    else:
        search_results = google_search(query, num_results=1)
        for result in search_results:
            if result['text']:
                filtered_text = filter_relevant_content(result['text'])
                all_filtered_text += filtered_text + "\n\n"

    if not all_filtered_text:
        return "No relevant financial information found."

    context_instructions = "Provide a detailed, coherent summary focusing on financial implications and analysis."
    return summarize_text(all_filtered_text, context_instructions)


def extract_text_from_pdf(pdf):
    """Extract text from each page of the PDF"""
    reader = PyPDF2.PdfReader(pdf)
    text = ""
    for page in reader.pages:
        text += (page.extract_text() or "") + "\n"
    return text


# Gradio Interface
def interface_function(query, read_pdf, pdf):
    return summarize_financial_news(query, read_pdf, pdf)


iface = gr.Interface(
    fn=interface_function,
    inputs=[
        gr.Textbox(lines=2, placeholder="Enter a company name or financial topic..."),
        gr.Checkbox(label="Read PDF"),
        # "filepath" hands the handler a path string that PyPDF2 can open directly
        gr.File(label="Upload PDF", type="filepath")
    ],
    outputs="text",
    title="Financial News Summarizer",
    description="Enter a company name or financial topic to get a summary of recent financial news. Optionally, upload a PDF to summarize its content."
)

iface.launch()
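
# To sanity-check the pipeline without the web UI (assumes HUGGINGFACE_TOKEN is set
# and the model endpoint is available; "report.pdf" is a placeholder path):
#
#     print(summarize_financial_news("Apple quarterly earnings"))
#     print(summarize_financial_news("", read_pdf=True, pdf="report.pdf"))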