import requests
from bs4 import BeautifulSoup
import gradio as gr
from huggingface_hub import InferenceClient
import random
from datetime import datetime, timedelta
import re
import os
# List of user agents to rotate through
_useragent_list = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
]
API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_TOKEN')}"}


def query_llama(payload):
    """Send a query to the Llama model via the Hugging Face Inference API."""
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()
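

# The serverless Inference API typically responds with a JSON body such as
# {"error": "...", "estimated_time": ...} while the model container is still
# loading. The wrapper below is a minimal retry sketch for that case; it is
# defined here for convenience but not wired into the flow further down.
import time  # imported here to keep this sketch self-contained


def query_llama_with_retry(payload, retries=3, wait_seconds=10):
    """Retry query_llama while the API reports that the model is loading."""
    result = query_llama(payload)
    for _ in range(retries):
        if not (isinstance(result, dict) and "error" in result):
            break
        time.sleep(wait_seconds)
        result = query_llama(payload)
    return result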


def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_verify=None, days_back=90):
    """Perform a Google search and return results."""
    print(f"Searching for term: {term}")
    # Restrict results to the last `days_back` days
    end_date = datetime.now()
    start_date = end_date - timedelta(days=days_back)
    # Format dates as strings
    start_date_str = start_date.strftime("%Y-%m-%d")
    end_date_str = end_date.strftime("%Y-%m-%d")
    # Add the date range to the search term; requests URL-encodes query
    # parameters itself, so no manual quoting is needed
    search_term = f"{term} financial earnings report after:{start_date_str} before:{end_date_str}"
    start = 0
    all_results = []
    with requests.Session() as session:
        while len(all_results) < num_results:
            try:
                # Choose a random user agent
                user_agent = random.choice(_useragent_list)
                headers = {'User-Agent': user_agent}
                resp = session.get(
                    url="https://www.google.com/search",
                    headers=headers,
                    params={
                        "q": search_term,
                        "num": num_results - start,
                        "hl": lang,
                        "start": start,
                        "safe": safe,
                    },
                    timeout=timeout,
                    verify=ssl_verify,
                )
                resp.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"Error fetching search results: {e}")
                break
            soup = BeautifulSoup(resp.text, "html.parser")
            result_block = soup.find_all("div", attrs={"class": "g"})
            if not result_block:
                print("No more results found.")
                break
            for result in result_block:
                if len(all_results) >= num_results:
                    break
                link = result.find("a", href=True)
                if link:
                    link = link["href"]
                    print(f"Found link: {link}")
                    try:
                        webpage = session.get(link, headers=headers, timeout=timeout)
                        webpage.raise_for_status()
                        visible_text = extract_text_from_webpage(webpage.text)
                        all_results.append({"link": link, "text": visible_text})
                    except requests.exceptions.RequestException as e:
                        print(f"Error fetching or processing {link}: {e}")
                        all_results.append({"link": link, "text": None})
                else:
                    print("No link found in result.")
                    all_results.append({"link": None, "text": None})
            start += len(result_block)
    print(f"Total results fetched: {len(all_results)}")
    return all_results
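
# NOTE: the parsing above depends on Google's "g" result class, which Google
# changes periodically and may rate-limit; each returned item is a dict of the
# form {"link": <url or None>, "text": <visible page text or None>}.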


def extract_text_from_webpage(html_content):
    """Extract visible text from HTML content."""
    soup = BeautifulSoup(html_content, 'html.parser')
    # Remove script and style elements
    for script in soup(["script", "style"]):
        script.decompose()
    # Get text
    text = soup.get_text()
    # Break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # Break multi-headlines (separated by runs of double spaces) into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # Drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)
    return text


def filter_relevant_content(text):
    """Filter out irrelevant content"""
    # List of keywords related to financial reports
    keywords = ['revenue', 'profit', 'earnings', 'financial', 'quarter', 'fiscal', 'growth', 'income', 'loss', 'dividend']
    # Split the text into sentences
    sentences = re.split(r'(?<=[.!?])\s+', text)
    # Filter sentences containing at least one keyword
    relevant_sentences = [sentence for sentence in sentences if any(keyword in sentence.lower() for keyword in keywords)]
    # Join the relevant sentences back into a single string
    filtered_text = ' '.join(relevant_sentences)
    return filtered_text
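
# Example: filter_relevant_content("Shares traded sideways. Revenue grew 12%.")
# returns "Revenue grew 12%." because only that sentence contains a keyword.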


def summarize_financial_news(query):
    """Search for financial news, extract relevant content, and summarize."""
    search_results = google_search(query, num_results=3)
    all_filtered_text = ""
    for result in search_results:
        if result['text']:
            filtered_text = filter_relevant_content(result['text'])
            all_filtered_text += filtered_text + "\n\n"
    if not all_filtered_text:
        return "No relevant financial information found."
    prompt = f"""You are a financial analyst. Summarize the following text from a financial perspective:
{all_filtered_text}
Provide a detailed, coherent summary focusing on financial implications and analysis."""
    response = query_llama({"inputs": prompt, "parameters": {"max_new_tokens": 500, "return_full_text": False}})
    # On success the API returns [{"generated_text": ...}]; anything else
    # (such as an {"error": ...} dict) is surfaced instead of raising a KeyError.
    if isinstance(response, list) and response and "generated_text" in response[0]:
        return response[0]["generated_text"]
    return f"Summarization failed: {response}"


# Gradio Interface
iface = gr.Interface(
    fn=summarize_financial_news,
    inputs=gr.Textbox(lines=2, placeholder="Enter a company name or financial topic..."),
    outputs="text",
    title="Financial News Summarizer",
    description="Enter a company name or financial topic to get a summary of recent financial news.",
)

iface.launch()