SearchGPT

Running

App Files Files Community

Shreyas094 committed on Jul 1, 2024

Commit

5090140

verified ·

1 Parent(s): f92e80c

Update app.py

Browse files

Files changed (1) hide show

app.py +237 -112

app.py CHANGED Viewed

@@ -1,118 +1,243 @@
-import os
 import gradio as gr
-from PyPDF2 import PdfReader
 import requests
 from dotenv import load_dotenv
-from transformers import AutoTokenizer
-# Load environment variables
-load_dotenv()
-# Get the Hugging Face API token
 HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
-def summarize_text(text, instructions, agent_name, max_length, temperature, repetition_penalty, top_p):
-   print(f"{agent_name}: Starting summarization")
-   API_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.3"
-   headers = {"Authorization": f"Bearer {HUGGINGFACE_TOKEN}"}
-   summaries = []
-   current_text = text
-   while len(current_text) > 0:
-       payload = {
-           "inputs": f"{instructions}\n\nText to summarize:\n{current_text}",
-           "parameters": {
-               "max_length": max_length,
-               "temperature": temperature,
-               "repetition_penalty": repetition_penalty,
-               "top_p": top_p
-           }
-       }
-       print(f"{agent_name}: Sending request to API")
-       response = requests.post(API_URL, headers=headers, json=payload)
-       print(f"{agent_name}: Received response from API")
-       generated_text = response.json()[0]["generated_text"]
-       # Split the generated text by the delimiter "\n\n" and take the last part as the summary
-       summary = generated_text.split("\n\n")[-1]
-       summaries.append(summary)
-       # Remove the summarized part from the current text
-       current_text = current_text[len(summary):].strip()
-   # Join all partial summaries into a final summary
-   final_summary = "\n\n".join(summaries)
-   return final_summary
-def process_pdf(pdf_file, chunk_instructions, window_instructions, final_instructions, max_length, temperature, repetition_penalty, top_p):
-   print("Starting PDF processing")
-   # Read PDF
-   reader = PdfReader(pdf_file)
-   text = ""
-   for page in reader.pages:
-       text += page.extract_text() + "\n\n"
-   print(f"Extracted {len(reader.pages)} pages from PDF")
-   # Chunk the text (simple splitting by pages for this example)
-   chunks = text.split("\n\n")
-   print(f"Split text into {len(chunks)} chunks")
-   # Agent 1: Summarize each chunk
-   agent1_summaries = []
-   for i, chunk in enumerate(chunks):
-       print(f"Agent 1: Processing chunk {i+1}/{len(chunks)}")
-       summary = summarize_text(chunk, chunk_instructions, "Agent 1", max_length, temperature, repetition_penalty, top_p)
-       agent1_summaries.append(summary)
-   print("Agent 1: Finished processing all chunks")
-   # Concatenate Agent 1 summaries
-   concatenated_summary = "\n\n".join(agent1_summaries)
-   print(f"Concatenated Agent 1 summaries (length: {count_tokens(concatenated_summary)} tokens)")
-   print(f"Concatenated Summary: {concatenated_summary}")
-   # Sliding window approach
-   window_size = 3500  # in tokens
-   step_size = 3000  # overlap of 500 tokens
-   windows = []
-   current_position = 0
-   while current_position < len(concatenated_summary):
-       window_end = current_position
-       window_text = ""
-       while count_tokens(window_text) < window_size and window_end < len(concatenated_summary):
-           window_text += concatenated_summary[window_end]
-           window_end += 1
-       windows.append(window_text)
-       current_position += step_size
-   print(f"Created {len(windows)} windows for intermediate summarization")
-   # Intermediate summarization
-   intermediate_summaries = []
-   for i, window in enumerate(windows):
-       print(f"Processing window {i+1}/{len(windows)}")
-       summary = summarize_text(window, window_instructions, f"Window {i+1}", max_length, temperature, repetition_penalty, top_p)
-       intermediate_summaries.append(summary)
-   # Final summarization
-   final_input = "\n\n".join(intermediate_summaries)
-   print(f"Final input length: {count_tokens(final_input)} tokens")
-   final_summary = summarize_text(final_input, final_instructions, "Agent 2", max_length, temperature, repetition_penalty, top_p)
-   print("Agent 2: Finished final summarization")
-   return final_summary
-def pdf_summarizer(pdf_file, chunk_instructions, window_instructions, final_instructions, max_length, temperature, repetition_penalty, top_p):
-   if pdf_file is None:
-       print("Error: No PDF file uploaded")
-       return "Please upload a PDF file."
    try:
-       print(f"Starting summarization process for file: {pdf_file.name}")
-       summary = process_pdf(pdf_file.name, chunk_instructions, window_instructions, final_instructions, max_length, temperature, repetition_penalty, top_p)
-       print("Summarization process completed successfully")
-       return summary
    except Exception as e:
-       print(f"An error occurred: {str(e)}")
-       return f"An error occurred: {str(e)}"
-# Gradio interface
-iface = gr.Interface(
-   fn=pdf_summarizer,
-   inputs=[
-       gr.File(label="Upload PDF"),
-       gr.Textbox(label="Chunk Instructions", placeholder="Instructions for summarizing each chunk"),
-       gr.Textbox(label="Window Instructions", placeholder="Instructions for summarizing each window"),
-       gr.Textbox(label="Final Instructions", placeholder="Instructions for final summarization"),
-       gr.Slider(label="Max Length", minimum=500, maximum=3500, step=100, value=2000),
-       gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, step=0.1, value=0.7),
-       gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, step=0.1, value=1.1),
-       gr.Slider(label="Top P", minimum=0.1, maximum=1.0, step=0.1, value=0.9)
-   ],
-   outputs=gr.Textbox(label="Summary"),
-   title="PDF Earnings Summary Generator",
-   description="Upload a PDF of an earnings summary or transcript to generate a concise summary."
-)
-print("Launching Gradio interface")
-iface.launch()

+import fitz  # PyMuPDF
 import gradio as gr
 import requests
+from bs4 import BeautifulSoup
+import urllib.parse
+import random
+import os
 from dotenv import load_dotenv
+import shutil
+import tempfile
load_dotenv()  # Load environment variables from .env file
# Read the Hugging Face API token from the environment instead of hard-coding it.
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
def clear_cache():
    """Delete the app's temporary files and report the outcome.

    Removes the ``gradio`` directory under the system temp dir and the
    ``output_summary.pdf`` file in the current working directory.

    Returns:
        A status message: a success string, or an error string when the
        filesystem operation failed.
    """
    try:
        # Clear Gradio cache
        gradio_cache = os.path.join(tempfile.gettempdir(), "gradio")
        shutil.rmtree(gradio_cache, ignore_errors=True)
        # Clear the generated summary. Attempt the removal directly instead of
        # checking for existence first, so a file that disappears between the
        # check and the delete cannot raise.
        try:
            os.remove("output_summary.pdf")
        except FileNotFoundError:
            pass
        # Add any other cache clearing operations here
        print("Cache cleared successfully.")
        return "Cache cleared successfully."
    except OSError as e:
        print(f"Error clearing cache: {e}")
        return f"Error clearing cache: {e}"
# Browser User-Agent strings; google_search picks one at random for each
# search request it sends.
_useragent_list = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
]
# Function to extract visible text from HTML content of a webpage
def extract_text_from_webpage(html):
    """Return the text of an HTML document with scripts and styles removed.

    Every line of the extracted text is stripped and split on double spaces;
    the resulting non-empty phrases are joined with newlines.
    """
    print("Extracting text from webpage...")
    parsed = BeautifulSoup(html, 'html.parser')
    for tag in parsed(["script", "style"]):
        tag.extract()  # Drop script and style elements from the tree

    phrases = []
    for raw_line in parsed.get_text().splitlines():
        for piece in raw_line.strip().split("  "):
            piece = piece.strip()
            if piece:
                phrases.append(piece)

    text = '\n'.join(phrases)
    print(f"Extracted text length: {len(text)}")
    return text
# Function to perform a Google search and retrieve results
def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_verify=None):
    """Search Google for *term* and fetch the visible text of each result page.

    Args:
        term: Search query. It is passed through ``params`` so ``requests``
            handles the URL encoding.
        num_results: Maximum number of result entries to return.
        lang: Value sent as the ``hl`` query parameter.
        timeout: Timeout in seconds applied to every HTTP request.
        safe: Value sent as the ``safe`` query parameter.
        ssl_verify: Passed as ``verify`` to every request.

    Returns:
        A list of ``{"link": ..., "text": ...}`` dicts. ``text`` is ``None``
        when the page could not be fetched; both values are ``None`` when a
        result block contained no link.
    """
    print(f"Searching for term: {term}")
    start = 0
    all_results = []
    max_chars_per_page = 8000  # Limit the number of characters from each webpage to stay under the token limit
    with requests.Session() as session:
        while start < num_results:
            print(f"Fetching search results starting from: {start}")
            try:
                # Choose a random user agent
                headers = {'User-Agent': random.choice(_useragent_list)}
                print(f"Using User-Agent: {headers['User-Agent']}")
                resp = session.get(
                    url="https://www.google.com/search",
                    headers=headers,
                    params={
                        "q": term,
                        "num": num_results - start,
                        "hl": lang,
                        "start": start,
                        "safe": safe,
                    },
                    timeout=timeout,
                    verify=ssl_verify,
                )
                resp.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"Error fetching search results: {e}")
                break

            soup = BeautifulSoup(resp.text, "html.parser")
            result_block = soup.find_all("div", attrs={"class": "g"})
            if not result_block:
                print("No more results found.")
                break

            for result in result_block:
                # A results page can hold more blocks than were asked for;
                # stop before fetching pages the caller did not request.
                if len(all_results) >= num_results:
                    break
                anchor = result.find("a", href=True)
                if not anchor:
                    print("No link found in result.")
                    all_results.append({"link": None, "text": None})
                    continue
                link = anchor["href"]
                print(f"Found link: {link}")
                try:
                    webpage = session.get(link, headers=headers, timeout=timeout, verify=ssl_verify)
                    webpage.raise_for_status()
                except requests.exceptions.RequestException as e:
                    print(f"Error fetching or processing {link}: {e}")
                    all_results.append({"link": link, "text": None})
                    continue
                visible_text = extract_text_from_webpage(webpage.text)
                if len(visible_text) > max_chars_per_page:
                    visible_text = visible_text[:max_chars_per_page] + "..."
                all_results.append({"link": link, "text": visible_text})
            start += len(result_block)
    print(f"Total results fetched: {len(all_results)}")
    return all_results
# Function to format the prompt for the Hugging Face API
def format_prompt(query, search_results, instructions):
    """Build the model prompt from a query and its web search results.

    Args:
        query: The user's query text.
        search_results: Iterable of dicts with ``"link"`` and ``"text"`` keys.
        instructions: Text placed at the very start of the prompt.

    Returns:
        The prompt string: instructions, the query, one section per search
        result (each followed by an 80-dash separator), then ``Assistant:``.
    """
    separator = '-' * 80
    sections = []
    for result in search_results:
        link = result["link"]
        text = result["text"]
        if link:
            sections.append(f"URL: {link}\nContent: {text}\n{separator}\n")
        else:
            sections.append(f"No link found.\n{separator}\n")
    # Join once instead of growing a string inside the loop.
    formatted_results = "".join(sections)
    return f"{instructions}User Query: {query}\n\nWeb Search Results:\n{formatted_results}\n\nAssistant:"
# Function to generate text using Hugging Face API
def generate_text(input_text, temperature=0.7, repetition_penalty=1.0, top_p=0.9, timeout=120):
    """Request a text generation from the Hugging Face Inference API.

    Args:
        input_text: Prompt sent as the ``inputs`` field.
        temperature: Sampling temperature forwarded to the API.
        repetition_penalty: Repetition penalty forwarded to the API.
        top_p: Nucleus-sampling value forwarded to the API.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``generated_text`` value from the response, or ``None`` when the
        request fails, the response is not JSON, or the field is missing.
    """
    print("Generating text using Hugging Face API...")
    endpoint = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.3"
    headers = {
        # The module loads the token as HUGGINGFACE_TOKEN; the previous name
        # HUGGINGFACE_API_TOKEN was never defined and raised NameError.
        "Authorization": f"Bearer {HUGGINGFACE_TOKEN}",
        "Content-Type": "application/json",
    }
    data = {
        "inputs": input_text,
        "parameters": {
            "max_new_tokens": 8000,  # Adjust as needed
            "temperature": temperature,
            "repetition_penalty": repetition_penalty,
            "top_p": top_p,
        },
    }
    try:
        # Without a timeout a stalled connection would block the UI forever.
        response = requests.post(endpoint, headers=headers, json=data, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error generating text using Hugging Face API: {e}")
        return None

    # Check if response is JSON
    try:
        json_data = response.json()
    except ValueError:
        print("Response is not JSON.")
        return None

    # Extract generated text from response JSON
    if isinstance(json_data, list):
        first = json_data[0] if json_data else None
        # Guard against list entries that are not objects.
        generated_text = first.get("generated_text") if isinstance(first, dict) else None
    elif isinstance(json_data, dict):
        generated_text = json_data.get("generated_text")
    else:
        print("Unexpected response format.")
        return None

    if generated_text is None:
        print("Generated text not found in response.")
        return None
    print("Text generation complete using Hugging Face API.")
    print(f"Generated text: {generated_text}")  # Debugging line
    return generated_text
# Function to read and extract text from a PDF
def read_pdf(file_obj):
    """Return the concatenated text of every page of a PDF.

    Args:
        file_obj: Either a filesystem path, or an object whose ``name``
            attribute holds the path (such as an uploaded-file wrapper).
            Plain paths are accepted because ``handle_query`` passes the
            value of a text box.

    Returns:
        The text of all pages joined in page order.
    """
    path = getattr(file_obj, "name", file_obj)
    with fitz.open(path) as document:
        return "".join(
            document.load_page(page_num).get_text()
            for page_num in range(document.page_count)
        )
# Function to format the prompt with instructions for text generation
def format_prompt_with_instructions(text, instructions):
    """Prefix *text* with *instructions* and append the ``Assistant:`` cue."""
    return "{}{}\n\nAssistant:".format(instructions, text)
def _wrap_lines(text, max_width, font, font_size):
    """Yield *text* line by line, word-wrapped to *max_width* points."""
    for paragraph in text.split("\n"):
        current = ""
        for word in paragraph.split(" "):
            candidate = f"{current} {word}" if current else word
            too_wide = fitz.get_text_length(candidate, fontname=font, fontsize=font_size) > max_width
            if current and too_wide:
                yield current
                current = word
            else:
                # A single word wider than the page is kept on its own line.
                current = candidate
        yield current


# Function to save text to a PDF
def save_text_to_pdf(text, output_path):
    """Write *text* to a new PDF file at *output_path*.

    The text is laid out in 9 pt Helvetica inside 50-point margins. Lines are
    word-wrapped to the printable width and continue on new pages as needed.

    Args:
        text: The text to write; ``"\\n"`` starts a new line.
        output_path: Destination path of the PDF file.
    """
    print(f"Saving text to PDF at {output_path}...")
    margin = 50  # 50 points margin
    font_size = 9
    font = "helv"  # built-in Helvetica
    # Leave room between baselines; a box exactly font_size high is too
    # tight for the glyphs' ascenders and descenders.
    line_spacing = 1.2 * font_size

    with fitz.open() as doc:  # Create a new PDF document and close it when done
        page = doc.new_page()
        text_width = page.rect.width - 2 * margin
        bottom = page.rect.height - margin
        baseline = margin + font_size
        for line in _wrap_lines(text, text_width, font, font_size):
            if baseline > bottom:
                page = doc.new_page()  # Add a new page
                baseline = margin + font_size
            if line:
                page.insert_text(fitz.Point(margin, baseline), line, fontsize=font_size, fontname=font)
            baseline += line_spacing
        doc.save(output_path)
    print(f"Text saved to PDF at {output_path}.")
def _chunk_text(text, size):
    """Split *text* into consecutive pieces of at most *size* characters."""
    return [text[i:i + size] for i in range(0, len(text), size)]


# Function to handle user queries
def handle_query(query, is_read_pdf, instructions):
    """Summarise a PDF or the web search results for a query.

    Args:
        query: The PDF to read when *is_read_pdf* is true, otherwise the
            search term.
        is_read_pdf: Whether *query* refers to a PDF rather than a search.
        instructions: Text prepended to every chunk sent to the model.

    Returns:
        The chunk summaries joined with spaces, or an explanatory message
        when no summary could be produced. A successful summary is also
        written to ``output_summary.pdf``.
    """
    print("Handling user query...")
    max_chars_per_chunk = 1000  # Adjust this value as needed to control chunk size

    if is_read_pdf:
        text_chunks = _chunk_text(read_pdf(query), max_chars_per_chunk)
    else:
        text_chunks = []
        for result in google_search(query):
            # Results whose page could not be fetched carry no text.
            if result["text"]:
                text_chunks.extend(_chunk_text(result["text"], max_chars_per_chunk))

    summaries = []
    for chunk in text_chunks:
        summary = generate_text(format_prompt_with_instructions(chunk, instructions))
        if summary:
            summaries.append(summary)

    if not summaries:
        # Report the failure instead of writing an empty PDF and returning "".
        print("No summary could be generated.")
        return "No summary could be generated."

    combined_summary = " ".join(summaries)
    save_text_to_pdf(combined_summary, "output_summary.pdf")
    return combined_summary
def run_app():
    """Build the Gradio Blocks interface and launch it."""
    with gr.Blocks() as app:
        gr.Markdown("# Web and PDF Summarizer")

        # Inputs
        query_box = gr.Textbox(
            label="Enter your query or upload a PDF",
            placeholder="Enter query here",
        )
        pdf_toggle = gr.Checkbox(label="Read PDF", value=False)
        instructions_box = gr.Textbox(
            label="Enter instructions",
            placeholder="Enter instructions here",
        )

        # Shared output area for both buttons
        summary_box = gr.Textbox(label="Summary")

        gr.Button("Clear Cache").click(fn=clear_cache, outputs=summary_box)
        gr.Button("Generate Summary").click(
            fn=handle_query,
            inputs=[query_box, pdf_toggle, instructions_box],
            outputs=summary_box,
        )
    app.launch()


run_app()