SearchGPT

Running

App Files Community

Shreyas094 committed on Jul 1, 2024

Commit

302823e

verified ·

1 Parent(s): 14fbe41

Update app.py

Browse files

Files changed (1) hide show

app.py +193 -217

app.py CHANGED Viewed

@@ -7,236 +7,212 @@ import urllib.parse
 from datetime import datetime, timedelta
 import re
 import os
 # List of user agents to rotate through
 _useragent_list = [
-    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
-    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
-    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
-    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
-    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
-    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
 ]
 API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
 headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_TOKEN')}"}
 def query_llama(payload):
-    """Send a query to the Llama model via Hugging Face API"""
-    try:
-        print(f"Payload: {payload}")  # Debug: Print payload
-        response = requests.post(API_URL, headers=headers, json=payload)
-        response.raise_for_status()
-        return response.json()
-    except requests.exceptions.RequestException as e:
-        print(f"Error querying Llama model: {e}")
-        return None
 def google_search(term, num_results=1, lang="en", timeout=30, safe="active", ssl_verify=None, days_back=90):
-    """Perform a Google search and return results"""
-    print(f"Searching for term: {term}")
-    # Calculate the date range
-    end_date = datetime.now()
-    start_date = end_date - timedelta(days=days_back)
-    # Format dates as strings
-    start_date_str = start_date.strftime("%Y-%m-%d")
-    end_date_str = end_date.strftime("%Y-%m-%d")
-    # Add the date range to the search term
-    search_term = f"{term} financial earnings report after:{start_date_str} before:{end_date_str}"
-    escaped_term = urllib.parse.quote_plus(search_term)
-    start = 0
-    all_results = []
-    max_attempts = num_results * 2  # Allow for some failed attempts
-    with requests.Session() as session:
-        attempts = 0
-        while len(all_results) < num_results and attempts < max_attempts:
-            try:
-                # Choose a random user agent
-                user_agent = random.choice(_useragent_list)
-                headers = {'User-Agent': user_agent}
-                resp = session.get(
-                    url="https://www.google.com/search",
-                    headers=headers,
-                    params={
-                        "q": search_term,
-                        "num": num_results - len(all_results),
-                        "hl": lang,
-                        "start": start,
-                        "safe": safe,
-                    },
-                    timeout=timeout,
-                    verify=ssl_verify,
-                )
-                resp.raise_for_status()
-                soup = BeautifulSoup(resp.text, "html.parser")
-                result_block = soup.find_all("div", attrs={"class": "g"})
-                if not result_block:
-                    print("No more results found.")
-                    break
-                for result in result_block:
-                    if len(all_results) >= num_results:
-                        break
-                    link = result.find("a", href=True)
-                    if link:
-                        link = link["href"]
-                        print(f"Found link: {link}")
-                        try:
-                            webpage = session.get(link, headers=headers, timeout=timeout)
-                            webpage.raise_for_status()
-                            visible_text = extract_text_from_webpage(webpage.text)
-                            all_results.append({"link": link, "text": visible_text})
-                        except requests.exceptions.HTTPError as e:
-                            if e.response.status_code == 403:
-                                print(f"403 Forbidden error for {link}, skipping...")
-                            else:
-                                print(f"HTTP error {e.response.status_code} for {link}, skipping...")
-                        except requests.exceptions.RequestException as e:
-                            print(f"Error fetching or processing {link}: {e}")
-                    else:
-                        print("No link found in result.")
-                start += len(result_block)
-                attempts += 1
-            except requests.exceptions.RequestException as e:
-                print(f"Error fetching search results: {e}")
-                attempts += 1
-    print(f"Total results fetched: {len(all_results)}")
-    return all_results
 def extract_text_from_webpage(html_content):
-    """Extract visible text from HTML content"""
-    soup = BeautifulSoup(html_content, 'html.parser')
-    # Remove script and style elements
-    for script in soup(["script", "style"]):
-        script.decompose()
-    # Get text
-    text = soup.get_text()
-    # Break into lines and remove leading and trailing space on each
-    lines = (line.strip() for line in text.splitlines())
-    # Break multi-headlines into a line each
-    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
-    # Drop blank lines
-    text = '\n'.join(chunk for chunk in chunks if chunk)
-    return text
 def filter_relevant_content(text):
-    """Filter out irrelevant content"""
-    # List of keywords related to financial reports
-    keywords = ['revenue', 'profit', 'earnings', 'financial', 'quarter', 'fiscal', 'growth', 'income', 'loss', 'dividend']
-    # Split the text into sentences
-    sentences = re.split(r'(?<=[.!?])\s+', text)
-    # Filter sentences containing at least one keyword
-    relevant_sentences = [sentence for sentence in sentences if any(keyword in sentence.lower() for keyword in keywords)]
-    # Join the relevant sentences back into a single string
-    filtered_text = ' '.join(relevant_sentences)
-    return filtered_text
 def chunk_text(text, max_chunk_size=1000, overlap=100):
-    # List of keywords that might indicate new sections
-    section_keywords = ["revenue", "income", "profit", "loss", "expenses", "outlook", "forecast", "quarter", "year"]
-    # Split text into sentences
-    sentences = re.split(r'(?<=[.!?])\s+', text)
-    chunks = []
-    current_chunk = ""
-    for sentence in sentences:
-        if len(current_chunk) + len(sentence) > max_chunk_size:
-            # If adding this sentence exceeds max_chunk_size, start a new chunk
-            chunks.append(current_chunk.strip())
-            current_chunk = sentence + " "
-        elif any(keyword in sentence.lower() for keyword in section_keywords):
-            # If sentence contains a section keyword, start a new chunk
-            if current_chunk:
-                chunks.append(current_chunk.strip())
-            current_chunk = sentence + " "
-        else:
-            current_chunk += sentence + " "
-    # Add the last chunk if it's not empty
-    if current_chunk:
-        chunks.append(current_chunk.strip())
-    # Add overlap
-    overlapped_chunks = []
-    for i, chunk in enumerate(chunks):
-        if i > 0:
-            chunk = chunks[i-1][-overlap:] + chunk
-        if i < len(chunks) - 1:
-            chunk = chunk + chunks[i+1][:overlap]
-        overlapped_chunks.append(chunk)
-    return overlapped_chunks
-def summarize_financial_news(query):
-    """Search for financial news, extract relevant content, and summarize"""
-    search_results = google_search(query, num_results=1)
-    all_filtered_text = ""
-    for result in search_results:
-        if result['text']:
-            filtered_text = filter_relevant_content(result['text'])
-            all_filtered_text += filtered_text + "\n\n"
-    if not all_filtered_text:
-        return "No relevant financial information found."
-    # Chunk the filtered text
-    chunks = chunk_text(all_filtered_text, max_chunk_size=3000, overlap=200)
-    summaries = []
-    for chunk in chunks:
-        prompt = f"""You are a financial analyst. Summarize the following text from a financial perspective:
 {chunk}
-Provide a detailed, coherent summary focusing on financial implications and analysis."""
-        summary = query_llama({"inputs": prompt, "parameters": {"max_length": 1000}})
-        if summary and isinstance(summary, list) and 'generated_text' in summary[0]:
-            summaries.append(summary[0]['generated_text'])
-    # Combine summaries
-    combined_summary = "\n\n".join(summaries)
-    # Final summarization of combined summaries
-    final_prompt = f"""As a financial analyst, provide a coherent and comprehensive summary of the following financial information:
 {combined_summary}
 Focus on the most important financial implications and analysis."""
-    final_summary = query_llama({"inputs": final_prompt, "parameters": {"max_length": 3000}})
-    if final_summary and isinstance(final_summary, list) and 'generated_text' in final_summary[0]:
-        return final_summary[0]['generated_text']
-    else:
-        return "Unable to generate summary due to an error."
 # Gradio Interface
 iface = gr.Interface(
-    fn=summarize_financial_news,
-    inputs=gr.Textbox(lines=2, placeholder="Enter a company name or financial topic..."),
-    outputs="text",
-    title="Financial News Summarizer",
-    description="Enter a company name or financial topic to get a summary of recent financial news."
 )
-iface.launch()

 from datetime import datetime, timedelta
 import re
 import os
+import PyPDF2
 # List of user agents to rotate through
 _useragent_list = [
+   "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+   "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+   "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
+   "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
+   "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
+   "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
 ]
 API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
 headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_TOKEN')}"}
 def query_llama(payload):
+   """Send a query to the Llama model via Hugging Face API"""
+   try:
+       print(f"Payload: {payload}")  # Debug: Print payload
+       response = requests.post(API_URL, headers=headers, json=payload)
+       response.raise_for_status()
+       return response.json()
+   except requests.exceptions.RequestException as e:
+       print(f"Error querying Llama model: {e}")
+       return None
 def google_search(term, num_results=1, lang="en", timeout=30, safe="active", ssl_verify=None, days_back=90):
+   """Perform a Google search and return results"""
+   print(f"Searching for term: {term}")
+   # Calculate the date range
+   end_date = datetime.now()
+   start_date = end_date - timedelta(days=days_back)
+   # Format dates as strings
+   start_date_str = start_date.strftime("%Y-%m-%d")
+   end_date_str = end_date.strftime("%Y-%m-%d")
+   # Add the date range to the search term
+   search_term = f"{term} financial earnings report after:{start_date_str} before:{end_date_str}"
+   escaped_term = urllib.parse.quote_plus(search_term)
+   start = 0
+   all_results = []
+   max_attempts = num_results * 2  # Allow for some failed attempts
+   with requests.Session() as session:
+       attempts = 0
+       while len(all_results) < num_results and attempts < max_attempts:
+           try:
+               # Choose a random user agent
+               user_agent = random.choice(_useragent_list)
+               headers = {'User-Agent': user_agent}
+               resp = session.get(
+                   url="https://www.google.com/search",
+                   headers=headers,
+                   params={
+                       "q": search_term,
+                       "num": num_results - len(all_results),
+                       "hl": lang,
+                       "start": start,
+                       "safe": safe,
+                   },
+                   timeout=timeout,
+                   verify=ssl_verify,
+               )
+               resp.raise_for_status()
+               soup = BeautifulSoup(resp.text, "html.parser")
+               result_block = soup.find_all("div", attrs={"class": "g"})
+               if not result_block:
+                   print("No more results found.")
+                   break
+               for result in result_block:
+                   if len(all_results) >= num_results:
+                       break
+                   link = result.find("a", href=True)
+                   if link:
+                       link = link["href"]
+                       print(f"Found link: {link}")
+                       try:
+                           webpage = session.get(link, headers=headers, timeout=timeout)
+                           webpage.raise_for_status()
+                           visible_text = extract_text_from_webpage(webpage.text)
+                           all_results.append({"link": link, "text": visible_text})
+                       except requests.exceptions.HTTPError as e:
+                           if e.response.status_code == 403:
+                               print(f"403 Forbidden error for {link}, skipping...")
+                           else:
+                               print(f"HTTP error {e.response.status_code} for {link}, skipping...")
+                       except requests.exceptions.RequestException as e:
+                           print(f"Error fetching or processing {link}: {e}")
+                   else:
+                       print("No link found in result.")
+               start += len(result_block)
+               attempts += 1
+           except requests.exceptions.RequestException as e:
+               print(f"Error fetching search results: {e}")
+               attempts += 1
+   print(f"Total results fetched: {len(all_results)}")
+   return all_results
 def extract_text_from_webpage(html_content):
+   """Extract visible text from HTML content"""
+   soup = BeautifulSoup(html_content, 'html.parser')
+   # Remove script and style elements
+   for script in soup(["script", "style"]):
+       script.decompose()
+   # Get text
+   text = soup.get_text()
+   # Break into lines and remove leading and trailing space on each
+   lines = (line.strip() for line in text.splitlines())
+   # Break multi-headlines into a line each
+   chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+   # Drop blank lines
+   text = '\n'.join(chunk for chunk in chunks if chunk)
+   return text
 def filter_relevant_content(text):
+   """Filter out irrelevant content"""
+   # List of keywords related to financial reports
+   keywords = ['revenue', 'profit', 'earnings', 'financial', 'quarter', 'fiscal', 'growth', 'income', 'loss', 'dividend']
+   # Split the text into sentences
+   sentences = re.split(r'(?<=[.!?])\s+', text)
+   # Filter sentences containing at least one keyword
+   relevant_sentences = [sentence for sentence in sentences if any(keyword in sentence.lower() for keyword in keywords)]
+   # Join the relevant sentences back into a single string
+   filtered_text = ' '.join(relevant_sentences)
+   return filtered_text
 def chunk_text(text, max_chunk_size=1000, overlap=100):
+   # List of keywords that might indicate new sections
+   section_keywords = ["revenue", "income", "profit", "loss", "expenses", "outlook", "forecast", "quarter", "year"]
+   # Split text into sentences
+   sentences = re.split(r'(?<=[.!?])\s+', text)
+   chunks = []
+   current_chunk = ""
+   for sentence in sentences:
+       if len(current_chunk) + len(sentence) > max_chunk_size:
+           # If adding this sentence exceeds max_chunk_size, start a new chunk
+           chunks.append(current_chunk.strip())
+           current_chunk = sentence + " "
+       elif any(keyword in sentence.lower() for keyword in section_keywords):
+           # If sentence contains a section keyword, start a new chunk
+           if current_chunk:
+               chunks.append(current_chunk.strip())
+           current_chunk = sentence + " "
+       else:
+           current_chunk += sentence + " "
+   # Add the last chunk if it's not empty
+   if current_chunk:
+       chunks.append(current_chunk.strip())
+   # Add overlap
+   overlapped_chunks = []
+   for i, chunk in enumerate(chunks):
+       if i > 0:
+           chunk = chunks[i-1][-overlap:] + chunk
+       if i < len(chunks) - 1:
+           chunk = chunk + chunks[i+1][:overlap]
+       overlapped_chunks.append(chunk)
+   return overlapped_chunks
def summarize_text(text, context_instructions):
    """Summarise ``text`` with the Llama model in two passes.

    The text is chunked with ``chunk_text``; each chunk is summarised
    separately, then the partial summaries are joined and summarised again.

    Args:
        text: Source text to summarise.
        context_instructions: Extra instruction line appended to every
            per-chunk prompt.

    Returns:
        The final summary text, or the string
        "Unable to generate summary due to an error." when no chunk could be
        summarised or the final request fails.
    """
    # NOTE(review): the hosted text-generation task documents
    # `max_new_tokens` / `return_full_text`; confirm `max_length` is honoured
    # and whether `generated_text` echoes the prompt.
    chunks = chunk_text(text, max_chunk_size=3000, overlap=200)
    summaries = []
    for chunk in chunks:
        prompt = (
            "You are a financial analyst. Summarize the following text from a financial perspective:\n"
            f"{chunk}\n"
            f"{context_instructions}"
        )
        response = query_llama({"inputs": prompt, "parameters": {"max_length": 1000}})
        generated = _generated_text(response)
        if generated is not None:
            summaries.append(generated)

    # If every chunk request failed there is nothing to condense; do not ask
    # the model to summarise an empty prompt.
    if not summaries:
        return "Unable to generate summary due to an error."

    combined_summary = "\n\n".join(summaries)
    final_prompt = (
        "As a financial analyst, provide a coherent and comprehensive summary of the following financial information:\n"
        f"{combined_summary}\n"
        "Focus on the most important financial implications and analysis."
    )
    final_response = query_llama({"inputs": final_prompt, "parameters": {"max_length": 3000}})
    final_summary = _generated_text(final_response)
    if final_summary is None:
        return "Unable to generate summary due to an error."
    return final_summary


def _generated_text(response):
    """Return ``response[0]['generated_text']`` if the response has that shape, else ``None``."""
    if not response or not isinstance(response, list):
        return None
    first = response[0]
    if isinstance(first, dict) and 'generated_text' in first:
        return first['generated_text']
    return None
def summarize_financial_news(query, read_pdf=False, pdf=None):
    """Search for financial news, extract relevant content, and summarize.

    Args:
        query: Company name or financial topic; used for the web search.
        read_pdf: When true and ``pdf`` is provided, summarise the PDF
            instead of searching the web.
        pdf: Uploaded PDF handed to ``extract_text_from_pdf``, or ``None``.

    Returns:
        The summary produced by ``summarize_text``, or
        "No relevant financial information found." when no usable text was
        gathered.
    """
    if read_pdf and pdf is not None:
        sections = [extract_text_from_pdf(pdf)]
    else:
        search_results = google_search(query, num_results=1)
        sections = [
            filter_relevant_content(result['text'])
            for result in search_results
            if result['text']
        ]

    all_filtered_text = "".join(f"{section}\n\n" for section in sections)

    # Each section is followed by "\n\n", so a plain truthiness test would
    # pass even when nothing relevant was found; test the stripped text.
    if not all_filtered_text.strip():
        return "No relevant financial information found."

    context_instructions = "Provide a detailed, coherent summary focusing on financial implications and analysis."
    return summarize_text(all_filtered_text, context_instructions)
def extract_text_from_pdf(pdf):
    """Extract text from each page of the PDF.

    Args:
        pdf: Path or binary file-like object accepted by ``PyPDF2.PdfReader``.

    Returns:
        The text of every page, each followed by a newline, concatenated in
        page order.
    """
    # PdfReader / .pages replace PdfFileReader / getNumPages / getPage, which
    # were deprecated and then removed in PyPDF2 3.x.
    reader = PyPDF2.PdfReader(pdf)
    # Guard against a page yielding no text so concatenation cannot fail.
    return "".join(f"{page.extract_text() or ''}\n" for page in reader.pages)
 # Gradio Interface
def interface_function(query, read_pdf, pdf):
    """Gradio callback: forward the three UI inputs to ``summarize_financial_news``."""
    return summarize_financial_news(query, read_pdf=read_pdf, pdf=pdf)
 iface = gr.Interface(
+   fn=interface_function,
+   inputs=[
+       gr.Textbox(lines=2, placeholder="Enter a company name or financial topic..."),
+       gr.Checkbox(label="Read PDF"),
+       gr.File(label="Upload PDF", type="file")
+   ],
+   outputs="text",
+   title="Financial News Summarizer",
+   description="Enter a company name or financial topic to get a summary of recent financial news. Optionally, upload a PDF to summarize its content."
 )
+iface.launch()