Shreyas094 committed on
Commit
c1cdf7c
·
verified ·
1 Parent(s): 271131d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +190 -36
app.py CHANGED
@@ -1,24 +1,17 @@
1
- import random
 
2
  import requests
3
  from bs4 import BeautifulSoup
4
- from transformers import AutoTokenizer, AutoModelForCausalLM
5
- from huggingface_hub import login
6
- import torch
7
  import os
 
8
 
9
- # Ensure sentencepiece is installed
10
- try:
11
- import sentencepiece
12
- except ImportError:
13
- raise ImportError("Please install the sentencepiece library using `pip install sentencepiece`.")
14
-
15
- # Retrieve the Hugging Face token from secrets (replace 'HUGGINGFACE_TOKEN' with your secret key)
16
- hf_token = os.getenv('HUGGINGFACE_TOKEN')
17
 
18
- # Log in to Hugging Face
19
- login(token=hf_token)
20
 
21
- # List of user agents
22
  _useragent_list = [
23
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
24
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
@@ -45,7 +38,7 @@ def extract_text_from_webpage(html):
45
  def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_verify=None):
46
  """Performs a Google search and returns the results."""
47
  print(f"Searching for term: {term}")
48
- escaped_term = requests.utils.quote(term)
49
  start = 0
50
  all_results = []
51
  max_chars_per_page = 8000 # Limit the number of characters from each webpage to stay under the token limit
@@ -106,30 +99,191 @@ def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_
106
  print(f"Total results fetched: {len(all_results)}")
107
  return all_results
108
 
109
- # Load the Mixtral-8x7B-Instruct model and tokenizer
110
- model_name = 'mistralai/Mistral-7B-Instruct-v0.3'
111
- tokenizer = AutoTokenizer.from_pretrained(model_name)
112
- model = AutoModelForCausalLM.from_pretrained(model_name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
 
114
- # Check if a GPU is available and if not, fall back to CPU
115
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Check for GPU
116
- model.to(device) # Move model to the device
 
117
 
118
- # Example usage
119
- search_term = "How did Tesla perform in Q1 2024"
120
- search_results = google_search(search_term, num_results=3)
 
 
121
 
122
- # Combine text from search results to create a prompt
123
- combined_text = "\n\n".join(result['text'] for result in search_results if result['text'])
 
 
 
 
124
 
125
- # Tokenize the input text
126
- inputs = tokenizer(combined_text, return_tensors="pt").to(device) # Move inputs to the device
 
 
127
 
128
- # Generate a response
129
- outputs = model.generate(**inputs, max_length=150, temperature=0.7, top_p=0.9, top_k=50)
 
130
 
131
- # Decode the generated tokens to a readable string
132
- response = tokenizer.decode(outputs[0], skip_special_tokens=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
- # Print the response
135
- print(response)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz # PyMuPDF
2
+ import gradio as gr
3
  import requests
4
  from bs4 import BeautifulSoup
5
+ import urllib.parse
6
+ import random
 
7
  import os
8
+ from dotenv import load_dotenv
9
 
10
+ load_dotenv() # Load environment variables from .env file
 
 
 
 
 
 
 
11
 
12
+ # Read the Hugging Face API token from the HUGGINGFACE_TOKEN environment variable
13
+ HUGGINGFACE_API_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
14
 
 
15
  _useragent_list = [
16
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
17
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
 
38
  def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_verify=None):
39
  """Performs a Google search and returns the results."""
40
  print(f"Searching for term: {term}")
41
+ escaped_term = urllib.parse.quote_plus(term)
42
  start = 0
43
  all_results = []
44
  max_chars_per_page = 8000 # Limit the number of characters from each webpage to stay under the token limit
 
99
  print(f"Total results fetched: {len(all_results)}")
100
  return all_results
101
 
102
+ # Function to format the prompt for the Hugging Face API
103
+ def format_prompt(query, search_results, instructions):
104
+ formatted_results = ""
105
+ for result in search_results:
106
+ link = result["link"]
107
+ text = result["text"]
108
+ if link:
109
+ formatted_results += f"URL: {link}\nContent: {text}\n{'-'*80}\n"
110
+ else:
111
+ formatted_results += "No link found.\n" + '-'*80 + '\n'
112
+
113
+ prompt = f"{instructions}User Query: {query}\n\nWeb Search Results:\n{formatted_results}\n\nAssistant:"
114
+ return prompt
115
+
116
+ # Function to generate text using Hugging Face API
117
def generate_text(input_text, temperature=0.7, repetition_penalty=1.0, top_p=0.9, timeout=120):
    """Request a completion for ``input_text`` from the Hugging Face Inference API.

    Args:
        input_text: Prompt sent as the ``"inputs"`` field of the request.
        temperature: Sampling temperature forwarded in ``"parameters"``.
        repetition_penalty: Repetition penalty forwarded in ``"parameters"``.
        top_p: Nucleus-sampling value forwarded in ``"parameters"``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``"generated_text"`` value from the response, or ``None`` when the
        request fails, the body is not JSON, or no generated text is present.
    """
    print("Generating text using Hugging Face API...")

    if not HUGGINGFACE_API_TOKEN:
        # Without a token the header would literally read "Bearer None".
        print("Error generating text using Hugging Face API: HUGGINGFACE_TOKEN is not set.")
        return None

    endpoint = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.3"
    headers = {
        "Authorization": f"Bearer {HUGGINGFACE_API_TOKEN}",  # Use the environment variable
        "Content-Type": "application/json",
    }
    data = {
        "inputs": input_text,
        "parameters": {
            "max_new_tokens": 4000,  # Adjust as needed
            "temperature": temperature,
            "repetition_penalty": repetition_penalty,
            "top_p": top_p,
        },
    }

    try:
        # A timeout is required: requests otherwise waits indefinitely.
        response = requests.post(endpoint, headers=headers, json=data, timeout=timeout)
        response.raise_for_status()

        # Check if response is JSON
        try:
            json_data = response.json()
        except ValueError:
            print("Response is not JSON.")
            return None
    except requests.exceptions.RequestException as e:
        print(f"Error generating text using Hugging Face API: {e}")
        return None

    # Extract generated text from response JSON
    if isinstance(json_data, list):
        first_item = json_data[0] if json_data else None
        # Guard against a list whose first element is not a mapping.
        generated_text = first_item.get("generated_text") if isinstance(first_item, dict) else None
    elif isinstance(json_data, dict):
        generated_text = json_data.get("generated_text")
    else:
        print("Unexpected response format.")
        return None

    if generated_text is None:
        print("Generated text not found in response.")
        return None

    print("Text generation complete using Hugging Face API.")
    print(f"Generated text: {generated_text}")  # Debugging line
    return generated_text
167
+
168
+ # Function to read and extract text from a PDF
169
def read_pdf(file_obj):
    """Extract the text of every page of a PDF.

    Args:
        file_obj: Either a filesystem path (``str`` / ``os.PathLike``) or an
            object exposing the path as ``.name`` (e.g. an uploaded-file
            wrapper or an open file).

    Returns:
        The concatenated text of all pages, in page order.
    """
    # Accept a bare path as well as a file-like wrapper with a ``.name``.
    path = file_obj if isinstance(file_obj, (str, os.PathLike)) else file_obj.name
    with fitz.open(path) as document:
        return "".join(page.get_text() for page in document)
176
 
177
+ # Function to format the prompt with instructions for text generation
178
def format_prompt_with_instructions(text, instructions):
    """Build a prompt consisting of instructions followed by ``text``.

    Args:
        text: The content to be processed (for example, text read from a PDF).
        instructions: Free-form instructions placed in front of ``text``.
            ``None`` or an empty string adds nothing.

    Returns:
        The prompt string, always ending in ``"Assistant:"``.
    """
    # Keep the instructions from running straight into the text when the
    # caller did not end them with whitespace of their own.
    preamble = instructions or ""
    if preamble and not preamble[-1].isspace():
        preamble += "\n\n"
    return f"{preamble}{text}\n\nAssistant:"
181
 
182
+ # Function to save text to a PDF
183
def save_text_to_pdf(text, output_path):
    """Write ``text`` to a PDF file, word-wrapped and split across pages.

    Each ``"\\n"``-separated paragraph is wrapped to the page width inside a
    50-point margin and followed by one blank line. A new page is started
    whenever the next line would not fit above the bottom margin.

    Args:
        text: The text to render. ``None`` is treated as empty text.
        output_path: Destination path of the PDF file.
    """
    print(f"Saving text to PDF at {output_path}...")

    margin = 50  # 50 points margin
    font_size = 9
    line_spacing = 1 * font_size
    fontname = "times-roman"  # Use a supported font name

    doc = fitz.open()  # Create a new PDF document
    try:
        page = doc.new_page()
        # Later pages are created with the same default size as the first one.
        text_width = page.rect.width - 2 * margin
        bottom_limit = page.rect.height - margin
        y_position = margin

        for paragraph in str(text or "").split("\n"):
            # Greedy word wrap: extend the line until the next word overflows.
            lines = []
            current_line = ""
            for word in paragraph.split():
                candidate = f"{current_line} {word}" if current_line else word
                width = fitz.get_text_length(candidate, fontname=fontname, fontsize=font_size)
                if width <= text_width:
                    current_line = candidate
                    continue
                # An over-long single word must not emit a blank line first.
                if current_line:
                    lines.append(current_line)
                current_line = word
            # Last line of the paragraph (an empty string for a blank paragraph).
            lines.append(current_line)

            for line in lines:
                # Break pages before drawing so no trailing empty page appears.
                if y_position + line_spacing > bottom_limit:
                    page = doc.new_page()
                    y_position = margin
                page.insert_text(
                    fitz.Point(margin, y_position),
                    line,
                    fontsize=font_size,
                    fontname=fontname,
                )
                y_position += line_spacing

            # Add extra space for new paragraph
            y_position += line_spacing

        doc.save(output_path)  # Save the PDF to the specified path
    finally:
        doc.close()  # Release the document even if rendering or saving fails
    print("PDF saved successfully.")
234
+
235
+
236
+
237
+
238
+ # Integrated function to perform web scraping, formatting, and text generation
239
def scrape_and_display(query, num_results, instructions, web_search=True, temperature=0.7, repetition_penalty=1.0, top_p=0.9):
    """Build a prompt (optionally from web search results) and generate a reply.

    Args:
        query: The user query, or the raw text to process when ``web_search``
            is false.
        num_results: Number of search results to request; only used when
            ``web_search`` is true.
        instructions: Instructions placed at the start of the prompt.
        web_search: When true, call ``google_search`` and include its results
            in the prompt; otherwise use ``query`` as the prompt body.
        temperature: Forwarded to ``generate_text``.
        repetition_penalty: Forwarded to ``generate_text``.
        top_p: Forwarded to ``generate_text``.

    Returns:
        The generated text starting at its ``"Assistant:"`` marker when one is
        found, or ``"Assistant: No response generated."`` when nothing was
        generated.
    """
    print(f"Scraping and displaying results for query: {query} with num_results: {num_results}")
    if web_search:
        # NOTE(review): a UI slider may deliver a float; the search presumably
        # expects an integer count - confirm against google_search.
        search_results = google_search(query, int(num_results))
        formatted_prompt = format_prompt(query, search_results, instructions)
    else:
        formatted_prompt = format_prompt_with_instructions(query, instructions)

    generated_summary = generate_text(
        formatted_prompt,
        temperature=temperature,
        repetition_penalty=repetition_penalty,
        top_p=top_p,
    )
    print("Scraping and display complete.")

    marker = "Assistant:"
    if not generated_summary:
        generated_summary = "Assistant: No response generated."
    elif generated_summary.startswith(formatted_prompt) and formatted_prompt.endswith(marker):
        # The reply echoes the prompt: cut at the prompt's own final marker so
        # an earlier "Assistant:" inside instructions or scraped pages cannot
        # drag that content into the answer.
        generated_summary = generated_summary[len(formatted_prompt) - len(marker):]
    else:
        # Extract and return text starting from "Assistant:"
        assistant_index = generated_summary.find(marker)
        if assistant_index != -1:
            generated_summary = generated_summary[assistant_index:]

    print(f"Generated summary: {generated_summary}")  # Debugging line
    return generated_summary
258
+
259
+ # Main Gradio interface function
260
def gradio_interface(query, use_pdf, pdf, num_results, instructions, temperature, repetition_penalty, top_p):
    """Handle one UI request: generate a summary and save it as a PDF.

    When ``use_pdf`` is set and a file was uploaded, the PDF's text is
    processed without a web search; otherwise ``query`` is answered using web
    search results.

    Args:
        query: The user's question.
        use_pdf: Whether the uploaded PDF should be used instead of a search.
        pdf: The uploaded file, or ``None``.
        num_results: Number of search results to request.
        instructions: Instructions placed at the start of the prompt.
        temperature: Forwarded to ``scrape_and_display``.
        repetition_penalty: Forwarded to ``scrape_and_display``.
        top_p: Forwarded to ``scrape_and_display``.

    Returns:
        A ``(generated_summary, output_pdf_path)`` tuple.
    """
    import tempfile

    if use_pdf and pdf is not None:
        pdf_text = read_pdf(pdf)
        generated_summary = scrape_and_display(
            pdf_text,
            num_results=0,
            instructions=instructions,
            web_search=False,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            top_p=top_p,
        )
    else:
        generated_summary = scrape_and_display(
            query,
            num_results=num_results,
            instructions=instructions,
            web_search=True,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            top_p=top_p,
        )

    # Save the generated summary to a PDF. Each request gets its own directory
    # so concurrent requests cannot overwrite one another's file; the file name
    # itself stays "output_summary.pdf".
    # NOTE: these temporary directories are not cleaned up here.
    output_dir = tempfile.mkdtemp(prefix="summary_")
    output_pdf_path = os.path.join(output_dir, "output_summary.pdf")
    save_text_to_pdf(generated_summary, output_pdf_path)

    return generated_summary, output_pdf_path
272
 
273
+ # Deploy Gradio Interface
274
# Build and launch the Gradio UI. Inputs are listed in the same order as the
# parameters of gradio_interface.
gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Textbox(label="Query"),
        gr.Checkbox(label="Use PDF"),
        gr.File(label="Upload PDF"),
        # Whole-number steps: the value is a count of search results.
        gr.Slider(minimum=1, maximum=20, step=1, value=1, label="Number of Results"),
        gr.Textbox(label="Instructions"),
        # step=1 over a 0.1-1.0 range left the slider stuck at 0.1.
        gr.Slider(minimum=0.1, maximum=1.0, step=0.1, value=0.1, label="Temperature"),
        # Starts at 1.0, the value the description recommends; the range now
        # extends above 1.0 so the setting can actually discourage repetition.
        gr.Slider(minimum=0.1, maximum=2.0, step=0.1, value=1.0, label="Repetition Penalty"),
        gr.Slider(minimum=0.1, maximum=1.0, step=0.1, value=0.1, label="Top p"),
    ],
    outputs=["text", "file"],  # Generated summary text and the saved PDF file
    title="Financial Analyst AI Assistant",
    description="Enter your query about a company's financials to get valuable insights. Optionally, upload a PDF for analysis.Please instruct me for curating your output template, also for web search you can modify my search results but its advisable to restrict the same at 10. You can also adjust my parameters like Temperature, Repetition Penalty and Top_P, its adivsable to set repetition penalty at 1 and other two parameters at 0.1.",
).launch(share=True)