Shreyas094 committed
Commit 5090140 · verified · 1 Parent(s): f92e80c

Update app.py

Files changed (1)
  1. app.py +237 -112
app.py CHANGED
@@ -1,118 +1,243 @@
- import os
  import gradio as gr
- from PyPDF2 import PdfReader
  import requests
  from dotenv import load_dotenv
- from transformers import AutoTokenizer
- # Load environment variables
- load_dotenv()
- # Get the Hugging Face API token
  HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
-
- def summarize_text(text, instructions, agent_name, max_length, temperature, repetition_penalty, top_p):
-     print(f"{agent_name}: Starting summarization")
-     API_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.3"
-     headers = {"Authorization": f"Bearer {HUGGINGFACE_TOKEN}"}
-     summaries = []
-     current_text = text
-     while len(current_text) > 0:
-         payload = {
-             "inputs": f"{instructions}\n\nText to summarize:\n{current_text}",
-             "parameters": {
-                 "max_length": max_length,
-                 "temperature": temperature,
-                 "repetition_penalty": repetition_penalty,
-                 "top_p": top_p
-             }
-         }
-         print(f"{agent_name}: Sending request to API")
-         response = requests.post(API_URL, headers=headers, json=payload)
-         print(f"{agent_name}: Received response from API")
-         generated_text = response.json()[0]["generated_text"]
-         # Split the generated text by the delimiter "\n\n" and take the last part as the summary
-         summary = generated_text.split("\n\n")[-1]
-         summaries.append(summary)
-         # Remove the summarized part from the current text
-         current_text = current_text[len(summary):].strip()
-     # Join all partial summaries into a final summary
-     final_summary = "\n\n".join(summaries)
-     return final_summary
- def process_pdf(pdf_file, chunk_instructions, window_instructions, final_instructions, max_length, temperature, repetition_penalty, top_p):
-     print("Starting PDF processing")
-     # Read PDF
-     reader = PdfReader(pdf_file)
-     text = ""
-     for page in reader.pages:
-         text += page.extract_text() + "\n\n"
-     print(f"Extracted {len(reader.pages)} pages from PDF")
-     # Chunk the text (simple splitting by pages for this example)
-     chunks = text.split("\n\n")
-     print(f"Split text into {len(chunks)} chunks")
-     # Agent 1: Summarize each chunk
-     agent1_summaries = []
-     for i, chunk in enumerate(chunks):
-         print(f"Agent 1: Processing chunk {i+1}/{len(chunks)}")
-         summary = summarize_text(chunk, chunk_instructions, "Agent 1", max_length, temperature, repetition_penalty, top_p)
-         agent1_summaries.append(summary)
-     print("Agent 1: Finished processing all chunks")
-     # Concatenate Agent 1 summaries
-     concatenated_summary = "\n\n".join(agent1_summaries)
-     print(f"Concatenated Agent 1 summaries (length: {count_tokens(concatenated_summary)} tokens)")
-     print(f"Concatenated Summary: {concatenated_summary}")
-     # Sliding window approach
-     window_size = 3500  # in tokens
-     step_size = 3000  # overlap of 500 tokens
-     windows = []
-     current_position = 0
-     while current_position < len(concatenated_summary):
-         window_end = current_position
-         window_text = ""
-         while count_tokens(window_text) < window_size and window_end < len(concatenated_summary):
-             window_text += concatenated_summary[window_end]
-             window_end += 1
-         windows.append(window_text)
-         current_position += step_size
-     print(f"Created {len(windows)} windows for intermediate summarization")
-     # Intermediate summarization
-     intermediate_summaries = []
-     for i, window in enumerate(windows):
-         print(f"Processing window {i+1}/{len(windows)}")
-         summary = summarize_text(window, window_instructions, f"Window {i+1}", max_length, temperature, repetition_penalty, top_p)
-         intermediate_summaries.append(summary)
-     # Final summarization
-     final_input = "\n\n".join(intermediate_summaries)
-     print(f"Final input length: {count_tokens(final_input)} tokens")
-     final_summary = summarize_text(final_input, final_instructions, "Agent 2", max_length, temperature, repetition_penalty, top_p)
-     print("Agent 2: Finished final summarization")
-     return final_summary
- def pdf_summarizer(pdf_file, chunk_instructions, window_instructions, final_instructions, max_length, temperature, repetition_penalty, top_p):
-     if pdf_file is None:
-         print("Error: No PDF file uploaded")
-         return "Please upload a PDF file."
      try:
-         print(f"Starting summarization process for file: {pdf_file.name}")
-         summary = process_pdf(pdf_file.name, chunk_instructions, window_instructions, final_instructions, max_length, temperature, repetition_penalty, top_p)
-         print("Summarization process completed successfully")
-         return summary
      except Exception as e:
-         print(f"An error occurred: {str(e)}")
-         return f"An error occurred: {str(e)}"
- # Gradio interface
- iface = gr.Interface(
-     fn=pdf_summarizer,
-     inputs=[
-         gr.File(label="Upload PDF"),
-         gr.Textbox(label="Chunk Instructions", placeholder="Instructions for summarizing each chunk"),
-         gr.Textbox(label="Window Instructions", placeholder="Instructions for summarizing each window"),
-         gr.Textbox(label="Final Instructions", placeholder="Instructions for final summarization"),
-         gr.Slider(label="Max Length", minimum=500, maximum=3500, step=100, value=2000),
-         gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, step=0.1, value=0.7),
-         gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, step=0.1, value=1.1),
-         gr.Slider(label="Top P", minimum=0.1, maximum=1.0, step=0.1, value=0.9)
-     ],
-     outputs=gr.Textbox(label="Summary"),
-     title="PDF Earnings Summary Generator",
-     description="Upload a PDF of an earnings summary or transcript to generate a concise summary."
- )
- print("Launching Gradio interface")
- iface.launch()
+ import fitz  # PyMuPDF
  import gradio as gr
  import requests
+ from bs4 import BeautifulSoup
+ import urllib.parse
+ import random
+ import os
  from dotenv import load_dotenv
+ import shutil
+ import tempfile
+ load_dotenv()  # Load environment variables from .env file
+ # Now replace the hard-coded token with the environment variable
  HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
+ def clear_cache():
      try:
+         # Clear Gradio cache
+         cache_dir = tempfile.gettempdir()
+         shutil.rmtree(os.path.join(cache_dir, "gradio"), ignore_errors=True)
+         # Clear any custom cache you might have
+         # For example, if you're caching PDF files or search results:
+         if os.path.exists("output_summary.pdf"):
+             os.remove("output_summary.pdf")
+         # Add any other cache clearing operations here
+         print("Cache cleared successfully.")
+         return "Cache cleared successfully."
      except Exception as e:
+         print(f"Error clearing cache: {e}")
+         return f"Error clearing cache: {e}"
+ _useragent_list = [
+     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
+     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
+     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
+     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
+ ]
+ # Function to extract visible text from HTML content of a webpage
+ def extract_text_from_webpage(html):
+     print("Extracting text from webpage...")
+     soup = BeautifulSoup(html, 'html.parser')
+     for script in soup(["script", "style"]):
+         script.extract()  # Remove scripts and styles
+     text = soup.get_text()
+     lines = (line.strip() for line in text.splitlines())
+     chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+     text = '\n'.join(chunk for chunk in chunks if chunk)
+     print(f"Extracted text length: {len(text)}")
+     return text
+ # Function to perform a Google search and retrieve results
+ def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_verify=None):
+     """Performs a Google search and returns the results."""
+     print(f"Searching for term: {term}")
+     escaped_term = urllib.parse.quote_plus(term)
+     start = 0
+     all_results = []
+     max_chars_per_page = 8000  # Limit the number of characters from each webpage to stay under the token limit
+     with requests.Session() as session:
+         while start < num_results:
+             print(f"Fetching search results starting from: {start}")
+             try:
+                 # Choose a random user agent
+                 user_agent = random.choice(_useragent_list)
+                 headers = {
+                     'User-Agent': user_agent
+                 }
+                 print(f"Using User-Agent: {headers['User-Agent']}")
+                 resp = session.get(
+                     url="https://www.google.com/search",
+                     headers=headers,
+                     params={
+                         "q": term,
+                         "num": num_results - start,
+                         "hl": lang,
+                         "start": start,
+                         "safe": safe,
+                     },
+                     timeout=timeout,
+                     verify=ssl_verify,
+                 )
+                 resp.raise_for_status()
+             except requests.exceptions.RequestException as e:
+                 print(f"Error fetching search results: {e}")
+                 break
+             soup = BeautifulSoup(resp.text, "html.parser")
+             result_block = soup.find_all("div", attrs={"class": "g"})
+             if not result_block:
+                 print("No more results found.")
+                 break
+             for result in result_block:
+                 link = result.find("a", href=True)
+                 if link:
+                     link = link["href"]
+                     print(f"Found link: {link}")
+                     try:
+                         webpage = session.get(link, headers=headers, timeout=timeout)
+                         webpage.raise_for_status()
+                         visible_text = extract_text_from_webpage(webpage.text)
+                         if len(visible_text) > max_chars_per_page:
+                             visible_text = visible_text[:max_chars_per_page] + "..."
+                         all_results.append({"link": link, "text": visible_text})
+                     except requests.exceptions.RequestException as e:
+                         print(f"Error fetching or processing {link}: {e}")
+                         all_results.append({"link": link, "text": None})
+                 else:
+                     print("No link found in result.")
+                     all_results.append({"link": None, "text": None})
+             start += len(result_block)
+     print(f"Total results fetched: {len(all_results)}")
+     return all_results
+ # Function to format the prompt for the Hugging Face API
+ def format_prompt(query, search_results, instructions):
+     formatted_results = ""
+     for result in search_results:
+         link = result["link"]
+         text = result["text"]
+         if link:
+             formatted_results += f"URL: {link}\nContent: {text}\n{'-' * 80}\n"
+         else:
+             formatted_results += "No link found.\n" + '-' * 80 + '\n'
+     prompt = f"{instructions}User Query: {query}\n\nWeb Search Results:\n{formatted_results}\n\nAssistant:"
+     return prompt
+ # Function to generate text using Hugging Face API
+ def generate_text(input_text, temperature=0.7, repetition_penalty=1.0, top_p=0.9):
+     print("Generating text using Hugging Face API...")
+     endpoint = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.3"
+     headers = {
+         "Authorization": f"Bearer {HUGGINGFACE_TOKEN}",  # Use the environment variable defined above
+         "Content-Type": "application/json"
+     }
+     data = {
+         "inputs": input_text,
+         "parameters": {
+             "max_new_tokens": 8000,  # Adjust as needed
+             "temperature": temperature,
+             "repetition_penalty": repetition_penalty,
+             "top_p": top_p
+         }
+     }
+     try:
+         response = requests.post(endpoint, headers=headers, json=data)
+         response.raise_for_status()
+         # Check if response is JSON
+         try:
+             json_data = response.json()
+         except ValueError:
+             print("Response is not JSON.")
+             return None
+         # Extract generated text from response JSON
+         if isinstance(json_data, list):
+             # Handle list response (if applicable for your use case)
+             generated_text = json_data[0].get("generated_text") if json_data else None
+         elif isinstance(json_data, dict):
+             # Handle dictionary response
+             generated_text = json_data.get("generated_text")
+         else:
+             print("Unexpected response format.")
+             return None
+         if generated_text is not None:
+             print("Text generation complete using Hugging Face API.")
+             print(f"Generated text: {generated_text}")  # Debugging line
+             return generated_text
+         else:
+             print("Generated text not found in response.")
+             return None
+     except requests.exceptions.RequestException as e:
+         print(f"Error generating text using Hugging Face API: {e}")
+         return None
+ # Function to read and extract text from a PDF
+ def read_pdf(file_obj):
+     with fitz.open(file_obj if isinstance(file_obj, str) else file_obj.name) as document:  # accept a path string or an uploaded file object
+         text = ""
+         for page_num in range(document.page_count):
+             page = document.load_page(page_num)
+             text += page.get_text()
+     return text
+ # Function to format the prompt with instructions for text generation
+ def format_prompt_with_instructions(text, instructions):
+     prompt = f"{instructions}{text}\n\nAssistant:"
+     return prompt
+ # Function to save text to a PDF
+ def save_text_to_pdf(text, output_path):
+     print(f"Saving text to PDF at {output_path}...")
+     doc = fitz.open()  # Create a new PDF document
+     page = doc.new_page()  # Create a new page
+     # Set the page margins
+     margin = 50  # 50 points margin
+     page_width = page.rect.width
+     page_height = page.rect.height
+     text_width = page_width - 2 * margin
+     text_height = page_height - 2 * margin
+     # Define font size and line spacing
+     font_size = 9
+     line_spacing = 1 * font_size
+     max_lines_per_page = int(text_height // line_spacing)
+     # Load a built-in font
+     font = "helv"
+     # Split the text into lines
+     lines = text.split("\n")
+     current_line = 0
+     for line in lines:
+         if current_line >= max_lines_per_page:
+             page = doc.new_page()  # Add a new page
+             current_line = 0
+         rect = fitz.Rect(margin, margin + current_line * line_spacing, text_width, margin + (current_line + 1) * line_spacing)
+         page.insert_textbox(rect, line, fontsize=font_size, fontname=font, align=fitz.TEXT_ALIGN_LEFT)
+         current_line += 1
+     doc.save(output_path)
+     print(f"Text saved to PDF at {output_path}.")
+ # Function to handle user queries
+ def handle_query(query, is_read_pdf, instructions):
+     print("Handling user query...")
+     max_chars_per_chunk = 1000  # Adjust this value as needed to control chunk size
+     if is_read_pdf:
+         pdf_text = read_pdf(query)
+         text_chunks = [pdf_text[i:i+max_chars_per_chunk] for i in range(0, len(pdf_text), max_chars_per_chunk)]
+     else:
+         search_results = google_search(query)
+         text_chunks = []
+         for result in search_results:
+             if result["text"]:
+                 text_chunks.extend([result["text"][i:i+max_chars_per_chunk] for i in range(0, len(result["text"]), max_chars_per_chunk)])
+     summaries = []
+     for chunk in text_chunks:
+         formatted_prompt = format_prompt_with_instructions(chunk, instructions)
+         summary = generate_text(formatted_prompt)
+         if summary:
+             summaries.append(summary)
+     combined_summary = " ".join(summaries)
+     save_text_to_pdf(combined_summary, "output_summary.pdf")
+     return combined_summary
+ def run_app():
+     with gr.Blocks() as demo:
+         gr.Markdown("# Web and PDF Summarizer")
+         query = gr.Textbox(label="Enter your query or upload a PDF", placeholder="Enter query here")
+         is_read_pdf = gr.Checkbox(label="Read PDF", value=False)
+         instructions = gr.Textbox(label="Enter instructions", placeholder="Enter instructions here")
+         output = gr.Textbox(label="Summary")
+         clear_cache_btn = gr.Button("Clear Cache")
+         clear_cache_btn.click(fn=clear_cache, outputs=output)
+         generate_btn = gr.Button("Generate Summary")
+         generate_btn.click(fn=handle_query, inputs=[query, is_read_pdf, instructions], outputs=output)
+     demo.launch()
+ run_app()
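
A minimal smoke-test sketch for the new search-and-summarize path (not part of the committed app.py): it assumes the functions above are available in the current interpreter session (the module ends with an unguarded run_app() call, so importing it directly would also launch the Gradio UI), that a valid HUGGINGFACE_TOKEN is loaded from .env, and that the example query and instructions below are purely illustrative.

# Hypothetical usage sketch; assumes handle_query() and its helpers are defined
# in this session and HUGGINGFACE_TOKEN has been read from .env.
example_query = "ACME Corp Q2 2024 earnings call highlights"  # illustrative query only
example_instructions = "Summarize the key financial points in plain language.\n\n"
summary = handle_query(example_query, is_read_pdf=False, instructions=example_instructions)
print(summary[:500])  # preview the combined summary; the full text is also written to output_summary.pdf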