Shreyas094 committed on
Commit 28ed44f
1 Parent(s): 6228a67

Update app.py

Files changed (1)
  1. app.py +89 -342
app.py CHANGED
@@ -1,343 +1,90 @@
- import fitz  # PyMuPDF
- import gradio as gr
- import requests
- from bs4 import BeautifulSoup
- import urllib.parse
- import random
  import os
- from dotenv import load_dotenv
- import shutil
- import tempfile
-
- load_dotenv()  # Load environment variables from .env file
-
- # Now replace the hard-coded token with the environment variable
- HUGGINGFACE_API_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
-
- def clear_cache():
-     try:
-         # Clear Gradio cache
-         cache_dir = tempfile.gettempdir()
-         shutil.rmtree(os.path.join(cache_dir, "gradio"), ignore_errors=True)
-
-         # Clear any custom cache you might have
-         # For example, if you're caching PDF files or search results:
-         if os.path.exists("output_summary.pdf"):
-             os.remove("output_summary.pdf")
-
-         # Add any other cache clearing operations here
-
-         print("Cache cleared successfully.")
-         return "Cache cleared successfully."
-     except Exception as e:
-         print(f"Error clearing cache: {e}")
-         return f"Error clearing cache: {e}"
-
- _useragent_list = [
-     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
-     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
-     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
-     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
-     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
-     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
- ]
-
- # Function to extract visible text from HTML content of a webpage
- def extract_text_from_webpage(html):
-     print("Extracting text from webpage...")
-     soup = BeautifulSoup(html, 'html.parser')
-     for script in soup(["script", "style"]):
-         script.extract()  # Remove scripts and styles
-     text = soup.get_text()
-     lines = (line.strip() for line in text.splitlines())
-     chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
-     text = '\n'.join(chunk for chunk in chunks if chunk)
-     print(f"Extracted text length: {len(text)}")
-     return text
-
- # Function to perform a Google search and retrieve results
- def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_verify=None):
-     """Performs a Google search and returns the results."""
-     print(f"Searching for term: {term}")
-     escaped_term = urllib.parse.quote_plus(term)
-     start = 0
-     all_results = []
-     max_chars_per_page = 8000  # Limit the number of characters from each webpage to stay under the token limit
-
-     with requests.Session() as session:
-         while start < num_results:
-             print(f"Fetching search results starting from: {start}")
-             try:
-                 # Choose a random user agent
-                 user_agent = random.choice(_useragent_list)
-                 headers = {
-                     'User-Agent': user_agent
-                 }
-                 print(f"Using User-Agent: {headers['User-Agent']}")
-
-                 resp = session.get(
-                     url="https://www.google.com/search",
-                     headers=headers,
-                     params={
-                         "q": term,
-                         "num": num_results - start,
-                         "hl": lang,
-                         "start": start,
-                         "safe": safe,
-                     },
-                     timeout=timeout,
-                     verify=ssl_verify,
-                 )
-                 resp.raise_for_status()
-             except requests.exceptions.RequestException as e:
-                 print(f"Error fetching search results: {e}")
-                 break
-
-             soup = BeautifulSoup(resp.text, "html.parser")
-             result_block = soup.find_all("div", attrs={"class": "g"})
-             if not result_block:
-                 print("No more results found.")
-                 break
-             for result in result_block:
-                 link = result.find("a", href=True)
-                 if link:
-                     link = link["href"]
-                     print(f"Found link: {link}")
-                     try:
-                         webpage = session.get(link, headers=headers, timeout=timeout)
-                         webpage.raise_for_status()
-                         visible_text = extract_text_from_webpage(webpage.text)
-                         if len(visible_text) > max_chars_per_page:
-                             visible_text = visible_text[:max_chars_per_page] + "..."
-                         all_results.append({"link": link, "text": visible_text})
-                     except requests.exceptions.RequestException as e:
-                         print(f"Error fetching or processing {link}: {e}")
-                         all_results.append({"link": link, "text": None})
-                 else:
-                     print("No link found in result.")
-                     all_results.append({"link": None, "text": None})
-             start += len(result_block)
-     print(f"Total results fetched: {len(all_results)}")
-     return all_results
-
- # Function to format the prompt for the Hugging Face API
- def format_prompt(query, search_results, instructions):
-     formatted_results = ""
-     for result in search_results:
-         link = result["link"]
-         text = result["text"]
-         if link:
-             formatted_results += f"URL: {link}\nContent: {text}\n{'-' * 80}\n"
-         else:
-             formatted_results += "No link found.\n" + '-' * 80 + '\n'
-
-     prompt = f"{instructions}User Query: {query}\n\nWeb Search Results:\n{formatted_results}\n\nAssistant:"
-     return prompt
-
- # Function to generate text using Hugging Face API
- def generate_text(input_text, temperature=0.7, repetition_penalty=1.0, top_p=0.9):
-     print("Generating text using Hugging Face API...")
-     endpoint = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.3"
-     headers = {
-         "Authorization": f"Bearer {HUGGINGFACE_API_TOKEN}",  # Use the environment variable
-         "Content-Type": "application/json"
-     }
-     data = {
-         "inputs": input_text,
-         "parameters": {
-             "max_new_tokens": 8000,  # Adjust as needed
-             "temperature": temperature,
-             "repetition_penalty": repetition_penalty,
-             "top_p": top_p
-         }
-     }
-
-     try:
-         response = requests.post(endpoint, headers=headers, json=data)
-         response.raise_for_status()
-
-         # Check if response is JSON
-         try:
-             json_data = response.json()
-         except ValueError:
-             print("Response is not JSON.")
-             return None
-
-         # Extract generated text from response JSON
-         if isinstance(json_data, list):
-             # Handle list response (if applicable for your use case)
-             generated_text = json_data[0].get("generated_text") if json_data else None
-         elif isinstance(json_data, dict):
-             # Handle dictionary response
-             generated_text = json_data.get("generated_text")
-         else:
-             print("Unexpected response format.")
-             return None
-
-         if generated_text is not None:
-             print("Text generation complete using Hugging Face API.")
-             print(f"Generated text: {generated_text}")  # Debugging line
-             return generated_text
-         else:
-             print("Generated text not found in response.")
-             return None
-
-     except requests.exceptions.RequestException as e:
-         print(f"Error generating text using Hugging Face API: {e}")
-         return None
-
- # Function to read and extract text from a PDF
- def read_pdf(file_obj):
-     with fitz.open(file_obj.name) as document:
-         text = ""
-         for page_num in range(document.page_count):
-             page = document.load_page(page_num)
-             text += page.get_text()
-     return text
-
- # Function to format the prompt with instructions for text generation
- def format_prompt_with_instructions(text, instructions):
-     prompt = f"{instructions}{text}\n\nAssistant:"
-     return prompt
-
- # Function to save text to a PDF
- def save_text_to_pdf(text, output_path):
-     print(f"Saving text to PDF at {output_path}...")
-     doc = fitz.open()  # Create a new PDF document
-     page = doc.new_page()  # Create a new page
-
-     # Set the page margins
-     margin = 50  # 50 points margin
-     page_width = page.rect.width
-     page_height = page.rect.height
-     text_width = page_width - 2 * margin
-     text_height = page_height - 2 * margin
-
-     # Define font size and line spacing
-     font_size = 9
-     line_spacing = 1 * font_size
-     fontname = "times-roman"  # Use a supported font name
-
-     # Process the text into lines that fit within the text_width
-     lines = []
-     current_line = ""
-     current_line_width = 0
-     words = text.split(" ")
-     for word in words:
-         word_width = fitz.get_text_length(word, fontname, font_size)
-         if current_line_width + word_width <= text_width:
-             current_line += word + " "
-             current_line_width += word_width + fitz.get_text_length(" ", fontname, font_size)
-         else:
-             lines.append(current_line.strip())
-             current_line = word + " "
-             current_line_width = word_width + fitz.get_text_length(" ", fontname, font_size)
-     if current_line:
-         lines.append(current_line.strip())
-
-     # Add the lines to the page with margins
-     x = margin
-     y = margin
-     for line in lines:
-         if y + line_spacing > text_height:
-             # Create a new page if text exceeds the page height
-             page = doc.new_page()
-             y = margin  # Reset y-coordinate for the new page
-         page.insert_text((x, y), line, fontname=fontname, fontsize=font_size)
-         y += line_spacing
-
-     doc.save(output_path)  # Save the PDF to the specified output path
-     print(f"Text saved to PDF at {output_path}")
-
- # Function to process the PDF or search query and generate a summary
- def process_input(query_or_file, is_pdf, instructions, temperature, top_p, repetition_penalty):
-     load_dotenv()  # Load environment variables from .env file
-
-     HUGGINGFACE_API_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
-
-     if is_pdf:
-         print(f"Processing PDF: {query_or_file.name}")
-         input_text = read_pdf(query_or_file)
-     else:
-         print(f"Processing search query: {query_or_file}")
-         search_results = google_search(query_or_file)
-         input_text = "\n\n".join(result["text"] for result in search_results if result["text"])
-
-     # Split the input text into smaller chunks to fit within the token limit
-     chunk_size = 1024  # Adjust as needed to stay within the token limit
-     text_chunks = [input_text[i:i + chunk_size] for i in range(0, len(input_text), chunk_size)]
-     print(f"Total number of chunks: {len(text_chunks)}")
-
-     # Generate summaries for each chunk and concatenate them
-     concatenated_summary = ""
-     for chunk in text_chunks:
-         prompt = format_prompt_with_instructions(chunk, instructions)
-         chunk_summary = generate_text(prompt, temperature, repetition_penalty, top_p)
-         concatenated_summary += f"{chunk_summary}\n\n"
-
-     print("Final concatenated summary generated.")
-     return concatenated_summary
-
- # Function to clear cache
- def clear_cache():
-     try:
-         # Clear Gradio cache
-         cache_dir = tempfile.gettempdir()
-         shutil.rmtree(os.path.join(cache_dir, "gradio"), ignore_errors=True)
-
-         # Clear any custom cache you might have
-         # For example, if you're caching PDF files or search results:
-         if os.path.exists("output_summary.pdf"):
-             os.remove("output_summary.pdf")
-
-         # Add any other cache clearing operations here
-
-         print("Cache cleared successfully.")
-         return "Cache cleared successfully."
-     except Exception as e:
-         print(f"Error clearing cache: {e}")
-         return f"Error clearing cache: {e}"
-
- def summarization_interface():
-     with gr.Blocks() as demo:
-         gr.Markdown("# PDF and Web Summarization Tool")
-
-         with gr.Tab("Summarize PDF"):
-             pdf_file = gr.File(label="Upload PDF", file_types=[".pdf"])
-             pdf_instructions = gr.Textbox(label="Instructions for Summarization", placeholder="Enter instructions for summarization", lines=3)
-             pdf_temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, value=0.7, step=0.01)
-             pdf_top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.9, step=0.01)
-             pdf_repetition_penalty = gr.Slider(label="Repetition Penalty", minimum=0.5, maximum=2.0, value=1.0, step=0.1)
-             pdf_summary_output = gr.Textbox(label="Concatenated Summary Output")
-             pdf_summarize_button = gr.Button("Generate Summary")
-             pdf_clear_cache_button = gr.Button("Clear Cache")
-
-         with gr.Tab("Summarize Web Search"):
-             search_query = gr.Textbox(label="Enter Search Query", placeholder="Enter search query")
-             search_instructions = gr.Textbox(label="Instructions for Summarization", placeholder="Enter instructions for summarization", lines=3)
-             search_temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, value=0.7, step=0.01)
-             search_top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.9, step=0.01)
-             search_repetition_penalty = gr.Slider(label="Repetition Penalty", minimum=0.5, maximum=2.0, value=1.0, step=0.1)
-             search_summary_output = gr.Textbox(label="Concatenated Summary Output")
-             search_summarize_button = gr.Button("Generate Summary")
-             search_clear_cache_button = gr.Button("Clear Cache")
-
-         # Bind functions to button clicks
-         pdf_summarize_button.click(
-             fn=lambda file, instructions, temperature, top_p, repetition_penalty: generate_and_save_summary(file, True, instructions, temperature, top_p, repetition_penalty),
-             inputs=[pdf_file, pdf_instructions, pdf_temperature, pdf_top_p, pdf_repetition_penalty],
-             outputs=[pdf_summary_output]
-         )
-         search_summarize_button.click(
-             fn=lambda query, instructions, temperature, top_p, repetition_penalty: generate_and_save_summary(query, False, instructions, temperature, top_p, repetition_penalty),
-             inputs=[search_query, search_instructions, search_temperature, search_top_p, search_repetition_penalty],
-             outputs=[search_summary_output]
-         )
-         pdf_clear_cache_button.click(fn=clear_cache, inputs=None, outputs=pdf_summary_output)
-         search_clear_cache_button.click(fn=clear_cache, inputs=None, outputs=search_summary_output)
-
-     return demo
-
- # Launch the Gradio interface
- demo = summarization_interface()
- demo.launch()
  import os
+ import json
+ import gradio as gr
+ from tempfile import NamedTemporaryFile
+
+ from langchain_core.prompts import ChatPromptTemplate
+ from langchain_community.vectorstores import FAISS
+ from langchain_community.document_loaders import PyPDFLoader
+ from langchain_core.output_parsers import StrOutputParser
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain_community.llms import HuggingFaceHub
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+ from langchain_core.runnables import RunnableParallel, RunnablePassthrough
+
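+ # The Hugging Face API token is read from the environment; set HUGGINGFACE_TOKEN (e.g. as a Space secret) before launching.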
+ huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
+
+ def load_and_split_document(file):
+     """Loads and splits the document into pages."""
+     loader = PyPDFLoader(file.name)
+     data = loader.load_and_split()
+     return data
+
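+ # all-mpnet-base-v2 produces 768-dimensional sentence embeddings; the same model must be used at index and query time.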
+ def get_embeddings():
+     return HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
+
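+ # Build a FAISS index over the document chunks and persist it to the local "faiss_database" folder.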
+ def create_database(data, embeddings):
+     db = FAISS.from_documents(data, embeddings)
+     db.save_local("faiss_database")
+
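+ # Prompt template: retrieved chunks fill {context}, the user's query fills {question}.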
+ prompt = """
+ Answer the question based only on the following context:
+ {context}
+ Question: {question}
+ """
+
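+ # Mistral-7B-Instruct is called remotely through the HF Inference API; temperature and max_length are passed through as generation kwargs.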
+ def get_model():
+     return HuggingFaceHub(
+         repo_id="mistralai/Mistral-7B-Instruct-v0.3",
+         model_kwargs={"temperature": 0.5, "max_length": 512},
+         huggingfacehub_api_token=huggingface_token
+     )
+
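+ # LCEL chain: the retriever fills {context} while RunnablePassthrough forwards the raw question,
+ # then the rendered prompt runs through the model and the string output parser.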
+ def response(database, model, question):
+     prompt_val = ChatPromptTemplate.from_template(prompt)
+     retriever = database.as_retriever()
+     parser = StrOutputParser()
+     chain = (
+         {'context': retriever, 'question': RunnablePassthrough()}
+         | prompt_val
+         | model
+         | parser
+     )
+     ans = chain.invoke(question)
+     return ans
+
+ def update_vectors(file):
+     if file is None:
+         return "Please upload a PDF file."
+     data = load_and_split_document(file)
+     embed = get_embeddings()
+     create_database(data, embed)
+     return "Vector store updated successfully."
+
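+ # FAISS.load_local deserializes a pickle file, so allow_dangerous_deserialization is only safe here
+ # because the index is one this app wrote itself in update_vectors.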
+ def ask_question(question):
+     if not question:
+         return "Please enter a question."
+     embed = get_embeddings()
+     database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
+     model = get_model()
+     return response(database, model, question)
+
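+ # Two-step UI: index an uploaded PDF into FAISS first, then ask questions against the saved index.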
+ with gr.Blocks() as demo:
+     gr.Markdown("# Chat with your PDF documents")
+
+     with gr.Row():
+         file_input = gr.File(label="Upload your PDF document", file_types=[".pdf"])
+         update_button = gr.Button("Update Vector Store")
+
+     update_output = gr.Textbox(label="Update Status")
+     update_button.click(update_vectors, inputs=[file_input], outputs=update_output)
+
+     with gr.Row():
+         question_input = gr.Textbox(label="Ask a question about your documents")
+         submit_button = gr.Button("Submit")
+
+     answer_output = gr.Textbox(label="Answer")
+     submit_button.click(ask_question, inputs=[question_input], outputs=answer_output)
+
+ if __name__ == "__main__":
+     demo.launch()