Shreyas094 committed on
Commit
a47e6ea
·
verified ·
1 Parent(s): b57529a

Create app.txt

Browse files
Files changed (1) hide show
  1. app.txt +283 -0
app.txt ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz # PyMuPDF
2
+ import gradio as gr
3
+ import requests
4
+ from bs4 import BeautifulSoup
5
+ import urllib.parse
6
+ import random
7
+ import os
8
+
9
# Desktop browser User-Agent strings: every combination of two platforms and
# three browser tokens (Chrome, Edge, none), ordered browser-major.
_useragent_list = [
    f"Mozilla/5.0 ({platform}) AppleWebKit/537.36 (KHTML, like Gecko) {browser}Safari/537.36"
    for browser in ("Chrome/91.0.4472.124 ", "Edge/91.0.864.59 ", "")
    for platform in (
        "Windows NT 10.0; Win64; x64",
        "Macintosh; Intel Mac OS X 10_15_7",
    )
]
17
+
18
+ # Function to extract visible text from HTML content of a webpage
19
def extract_text_from_webpage(html):
    """Return the visible text of an HTML document.

    ``<script>`` and ``<style>`` elements are removed, every line is stripped
    of surrounding whitespace and empty fragments are discarded.

    Args:
        html: HTML markup to parse with ``BeautifulSoup``.

    Returns:
        The remaining text, with one non-empty phrase per line.
    """
    print("Extracting text from webpage...")
    soup = BeautifulSoup(html, 'html.parser')
    for element in soup(["script", "style"]):
        element.extract()  # Scripts and styles are not visible text
    # Break a line only where two or more consecutive spaces mark a layout gap.
    # Splitting on a single space would put every word on its own line and
    # destroy the sentences the model is supposed to read.
    phrases = (
        phrase.strip()
        for line in soup.get_text().splitlines()
        for phrase in line.strip().split("  ")
    )
    text = '\n'.join(phrase for phrase in phrases if phrase)
    print(f"Extracted text length: {len(text)}")
    return text
30
+
31
+ # Function to perform a Google search and retrieve results
32
def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_verify=None):
    """Search Google for *term* and download the visible text of each hit.

    Result pages are requested until ``num_results`` entries were collected,
    a request fails, or a page contains no ``div.g`` result blocks.

    Args:
        term: Search query. It is passed unescaped because ``requests``
            URL-encodes query parameters itself.
        num_results: Maximum number of entries to return.
        lang: Sent as the ``hl`` query parameter.
        timeout: Timeout in seconds applied to every HTTP request.
        safe: Sent as the ``safe`` query parameter.
        ssl_verify: Forwarded as ``verify`` to every HTTP request made here.

    Returns:
        A list of at most ``num_results`` dicts with the keys ``"link"`` and
        ``"text"``. ``"text"`` is ``None`` when the page could not be
        downloaded; both values are ``None`` when a result block had no link.
    """
    print(f"Searching for term: {term}")
    start = 0
    all_results = []
    max_chars_per_page = 8000  # Limit the number of characters from each webpage to stay under the token limit

    with requests.Session() as session:
        while start < num_results:
            print(f"Fetching search results starting from: {start}")
            # Choose a random user agent for this results page and its hits
            headers = {'User-Agent': random.choice(_useragent_list)}
            print(f"Using User-Agent: {headers['User-Agent']}")
            try:
                resp = session.get(
                    url="https://www.google.com/search",
                    headers=headers,
                    params={
                        "q": term,
                        "num": num_results - start,
                        "hl": lang,
                        "start": start,
                        "safe": safe,
                    },
                    timeout=timeout,
                    verify=ssl_verify,
                )
                resp.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"Error fetching search results: {e}")
                break

            soup = BeautifulSoup(resp.text, "html.parser")
            result_block = soup.find_all("div", attrs={"class": "g"})
            if not result_block:
                print("No more results found.")
                break

            for result in result_block:
                # A page may hold more blocks than were asked for; never
                # download or return more than the caller requested.
                if len(all_results) >= num_results:
                    break
                anchor = result.find("a", href=True)
                if not anchor:
                    print("No link found in result.")
                    all_results.append({"link": None, "text": None})
                    continue
                link = anchor["href"]
                print(f"Found link: {link}")
                try:
                    webpage = session.get(link, headers=headers, timeout=timeout, verify=ssl_verify)
                    webpage.raise_for_status()
                except requests.exceptions.RequestException as e:
                    # Keep the link so the caller can still see which hit failed
                    print(f"Error fetching or processing {link}: {e}")
                    all_results.append({"link": link, "text": None})
                    continue
                visible_text = extract_text_from_webpage(webpage.text)
                if len(visible_text) > max_chars_per_page:
                    visible_text = visible_text[:max_chars_per_page] + "..."
                all_results.append({"link": link, "text": visible_text})
            start += len(result_block)

    print(f"Total results fetched: {len(all_results)}")
    return all_results
95
+
96
+ # Function to format the prompt for the Hugging Face API
97
def format_prompt(query, search_results, instructions):
    """Build the model prompt from a query and its scraped search results.

    Each result with a truthy ``"link"`` contributes a ``URL``/``Content``
    section; any other result contributes a "No link found." section. Every
    section ends with a rule of 80 dashes.

    Args:
        query: The user's question.
        search_results: Iterable of dicts with the keys ``"link"`` and ``"text"``.
        instructions: Text placed verbatim in front of the query.

    Returns:
        The complete prompt, ending with ``"Assistant:"``.
    """
    separator = '-' * 80
    sections = []
    for entry in search_results:
        url, content = entry["link"], entry["text"]
        if url:
            sections.append(f"URL: {url}\nContent: {content}\n{separator}\n")
        else:
            sections.append(f"No link found.\n{separator}\n")
    joined = "".join(sections)
    return f"{instructions}User Query: {query}\n\nWeb Search Results:\n{joined}\n\nAssistant:"
109
+
110
+ # Function to generate text using Hugging Face API
111
def generate_text(input_text, temperature=0.7, repetition_penalty=1.0, top_p=0.9, timeout=120):
    """Generate a completion for *input_text* with the Hugging Face Inference API.

    The API token is read from the ``HF_TOKEN`` environment variable so that
    no secret has to be written into the source file. Without a token the
    request is sent without an ``Authorization`` header.

    Args:
        input_text: Prompt sent as ``inputs``.
        temperature: Sampling temperature sent to the model.
        repetition_penalty: Repetition penalty sent to the model.
        top_p: Nucleus-sampling threshold sent to the model.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        The ``generated_text`` field of the response, or ``None`` when the
        request fails, the response is not JSON, or the field is missing.
    """
    print("Generating text using Hugging Face API...")
    endpoint = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.3"
    headers = {"Content-Type": "application/json"}
    api_token = os.environ.get("HF_TOKEN", "").strip()
    if api_token:
        headers["Authorization"] = f"Bearer {api_token}"
    data = {
        "inputs": input_text,
        "parameters": {
            "max_new_tokens": 4000,  # Adjust as needed
            "temperature": temperature,
            "repetition_penalty": repetition_penalty,
            "top_p": top_p
        }
    }

    try:
        # Without a timeout a stalled connection would block the caller forever
        response = requests.post(endpoint, headers=headers, json=data, timeout=timeout)
        response.raise_for_status()

        # Check if response is JSON
        try:
            json_data = response.json()
        except ValueError:
            print("Response is not JSON.")
            return None

        # Extract generated text from response JSON
        if isinstance(json_data, list):
            # The text is expected in the first element; anything that is
            # not a dict there simply carries no generated text.
            first_item = json_data[0] if json_data else None
            generated_text = first_item.get("generated_text") if isinstance(first_item, dict) else None
        elif isinstance(json_data, dict):
            generated_text = json_data.get("generated_text")
        else:
            print("Unexpected response format.")
            return None

        if generated_text is not None:
            print("Text generation complete using Hugging Face API.")
            print(f"Generated text: {generated_text}")  # Debugging line
            return generated_text
        else:
            print("Generated text not found in response.")
            return None

    except requests.exceptions.RequestException as e:
        print(f"Error generating text using Hugging Face API: {e}")
        return None
161
+
162
+ # Function to read and extract text from a PDF
163
def read_pdf(file_obj):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_obj: Object whose ``name`` attribute is the path of the PDF.

    Returns:
        The text of all pages as one string; empty for a PDF without pages.
    """
    with fitz.open(file_obj.name) as pdf:
        page_texts = [
            pdf.load_page(index).get_text()
            for index in range(pdf.page_count)
        ]
    return "".join(page_texts)
170
+
171
+ # Function to format the prompt with instructions for text generation
172
def format_prompt_with_instructions(text, instructions):
    """Return *instructions* followed by *text* and the ``Assistant:`` marker.

    Args:
        text: Content the model should work on.
        instructions: Text placed verbatim in front of ``text``.

    Returns:
        The prompt string, ending with a blank line and ``"Assistant:"``.
    """
    return "{}{}\n\nAssistant:".format(instructions, text)
175
+
176
+ # Function to save text to a PDF
177
def save_text_to_pdf(text, output_path):
    """Write *text* to a new PDF file, word-wrapped inside 50 pt margins.

    Every line of *text* is treated as a paragraph. Paragraphs are wrapped to
    the printable width, followed by one empty line, and continue on a new
    page whenever the bottom margin is reached.

    Args:
        text: Text to render; paragraphs are separated by ``"\\n"``.
        output_path: Path of the PDF file to write.
    """
    print(f"Saving text to PDF at {output_path}...")

    margin = 50  # 50 points margin
    font_size = 9
    line_spacing = 1 * font_size
    fontname = "times-roman"  # Use a supported font name

    # The context manager closes the document even when rendering fails
    with fitz.open() as doc:
        page = doc.new_page()
        text_width = page.rect.width - 2 * margin
        bottom_limit = page.rect.height - margin
        y_position = margin

        for paragraph in text.split("\n"):
            current_line = ""

            for word in paragraph.split():
                candidate = f"{current_line} {word}" if current_line else word
                fits = fitz.get_text_length(candidate, fontname=fontname, fontsize=font_size) <= text_width
                # A word that is wider than the page on its own is still
                # placed, instead of emitting an empty line in front of it.
                if fits or not current_line:
                    current_line = candidate
                    continue

                page.insert_text(fitz.Point(margin, y_position), current_line, fontsize=font_size, fontname=fontname)
                y_position += line_spacing
                if y_position + line_spacing > bottom_limit:
                    page = doc.new_page()  # Add a new page if text exceeds page height
                    y_position = margin
                current_line = word

            # Add the last line of the paragraph (nothing to draw for a blank one)
            if current_line:
                page.insert_text(fitz.Point(margin, y_position), current_line, fontsize=font_size, fontname=fontname)
            # One step for the line itself, one more as paragraph spacing
            y_position += 2 * line_spacing
            if y_position + line_spacing > bottom_limit:
                page = doc.new_page()  # Add a new page if text exceeds page height
                y_position = margin

        doc.save(output_path)  # Save the PDF to the specified path
    print("PDF saved successfully.")
228
+
229
+
230
+
231
+
232
+ # Integrated function to perform web scraping, formatting, and text generation
233
def scrape_and_display(query, num_results, instructions, web_search=True, temperature=0.7, repetition_penalty=1.0, top_p=0.9):
    """Build a prompt, run text generation and return the assistant's answer.

    Args:
        query: Search query, or the text to analyse when ``web_search`` is false.
        num_results: Number of search results to request; only used for web search.
        instructions: Text placed in front of the prompt.
        web_search: When true the prompt is built from Google search results,
            otherwise from ``query`` alone.
        temperature: Forwarded to ``generate_text``.
        repetition_penalty: Forwarded to ``generate_text``.
        top_p: Forwarded to ``generate_text``.

    Returns:
        The generated text starting at ``"Assistant:"`` when that marker can
        be located, the unmodified generated text when it cannot, or
        ``"Assistant: No response generated."`` when nothing was generated.
    """
    print(f"Scraping and displaying results for query: {query} with num_results: {num_results}")
    if web_search:
        search_results = google_search(query, num_results)
        formatted_prompt = format_prompt(query, search_results, instructions)
    else:
        formatted_prompt = format_prompt_with_instructions(query, instructions)
    generated_summary = generate_text(formatted_prompt, temperature=temperature, repetition_penalty=repetition_penalty, top_p=top_p)
    print("Scraping and display complete.")

    if not generated_summary:
        generated_summary = "Assistant: No response generated."
    elif generated_summary.startswith(formatted_prompt):
        # The prompt was echoed back. Cut exactly the prompt: searching for the
        # first "Assistant:" could match inside the instructions or the scraped
        # text and return prompt material as part of the answer.
        generated_summary = "Assistant:" + generated_summary[len(formatted_prompt):]
    else:
        # Extract and return text starting from "Assistant:"
        assistant_index = generated_summary.find("Assistant:")
        if assistant_index != -1:
            generated_summary = generated_summary[assistant_index:]
    print(f"Generated summary: {generated_summary}")  # Debugging line
    return generated_summary
252
+
253
+ # Main Gradio interface function
254
def gradio_interface(query, use_pdf, pdf, num_results, instructions, temperature, repetition_penalty, top_p):
    """Handle one request of the Gradio form.

    When ``use_pdf`` is set and a file was uploaded, the PDF text is analysed
    without a web search; otherwise ``query`` is answered from web search
    results. The answer is also written to ``output_summary.pdf``.

    Args:
        query: Text of the "Query" field.
        use_pdf: Whether the uploaded PDF should be used instead of a web search.
        pdf: Uploaded file, or ``None``.
        num_results: Requested number of search results.
        instructions: Instructions placed in front of the prompt.
        temperature: Forwarded to text generation.
        repetition_penalty: Forwarded to text generation.
        top_p: Forwarded to text generation.

    Returns:
        A tuple of the generated summary and the path of the PDF containing it.
    """
    if use_pdf and pdf is not None:
        pdf_text = read_pdf(pdf)
        generated_summary = scrape_and_display(pdf_text, num_results=0, instructions=instructions, web_search=False, temperature=temperature, repetition_penalty=repetition_penalty, top_p=top_p)
    else:
        # A slider may deliver a fractional value, but the search needs a
        # whole number of results.
        generated_summary = scrape_and_display(query, num_results=int(num_results), instructions=instructions, web_search=True, temperature=temperature, repetition_penalty=repetition_penalty, top_p=top_p)

    # Save the generated summary to a PDF
    output_pdf_path = "output_summary.pdf"
    save_text_to_pdf(generated_summary, output_pdf_path)

    return generated_summary, output_pdf_path
266
+
267
+ # Deploy Gradio Interface
268
# Build the web form and start the app with a public share link.
gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Textbox(label="Query"),
        gr.Checkbox(label="Use PDF"),
        gr.File(label="Upload PDF"),
        # Whole-number steps: a result count cannot be fractional
        gr.Slider(minimum=1, maximum=20, step=1, label="Number of Results"),
        gr.Textbox(label="Instructions"),
        # A step of 1 on a 0.1-1.0 range would pin the slider to its minimum
        gr.Slider(minimum=0.1, maximum=1.0, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, label="Repetition Penalty"),
        gr.Slider(minimum=0.1, maximum=1.0, label="Top p")
    ],
    outputs=["text", "file"],  # The summary text and the generated PDF
    title="Financial Analyst AI Assistant",
    description="Enter your query about a company's financials to get valuable insights. Optionally, upload a PDF for analysis.Please instruct me for curating your output template, also for web search you can modify my search results but its advisable to restrict the same at 10. You can also adjust my parameters like Temperature, Repetition Penalty and Top_P, its adivsable to set repetition penalty at 1 and other two parameters at 0.1.",
).launch(share=True)