Shreyas094 committed on
Commit
328806f
·
verified ·
1 Parent(s): 769a423

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -16
app.py CHANGED
@@ -84,16 +84,14 @@ def extract_text_from_webpage(html):
84
  return text
85
 
86
  # Function to perform a Google search and retrieve results
87
- def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_verify=None):
88
- """Performs a Google search and returns the results."""
89
  print(f"Searching for term: {term}")
90
  escaped_term = urllib.parse.quote_plus(term)
91
  start = 0
92
  all_results = []
93
- max_chars_per_page = 8000 # Limit the number of characters from each webpage to stay under the token limit
94
 
95
  with requests.Session() as session:
96
- while start < num_results:
97
  print(f"Fetching search results starting from: {start}")
98
  try:
99
  # Choose a random user agent
@@ -129,6 +127,8 @@ def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_
129
  keywords = term.split() # Use the search term as keywords for filtering
130
 
131
  for result in result_block:
 
 
132
  link = result.find("a", href=True)
133
  if link:
134
  link = link["href"]
@@ -138,22 +138,49 @@ def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_
138
  webpage.raise_for_status()
139
  visible_text = extract_text_from_webpage(webpage.text)
140
 
141
- # Apply preprocessing to the visible text
142
- preprocessed_text = preprocess_web_content(visible_text, keywords)
143
 
144
- if len(preprocessed_text) > max_chars_per_page:
145
- preprocessed_text = preprocessed_text[:max_chars_per_page] + "..."
146
- all_results.append({"link": link, "text": preprocessed_text})
147
  except requests.exceptions.RequestException as e:
148
  print(f"Error fetching or processing {link}: {e}")
149
  all_results.append({"link": link, "text": None})
150
  else:
151
  print("No link found in result.")
152
  all_results.append({"link": None, "text": None})
 
153
  start += len(result_block)
 
154
  print(f"Total results fetched: {len(all_results)}")
155
  return all_results
156
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  def preprocess_text(text):
158
  # Remove HTML tags
159
  text = BeautifulSoup(text, "html.parser").get_text()
@@ -244,19 +271,19 @@ def format_prompt(query, search_results, instructions):
244
  formatted_results = ""
245
  for result in search_results:
246
  link = result["link"]
247
- text = result["text"]
248
- if link:
249
- formatted_results += f"URL: {link}\nContent: {text}\n{'-' * 80}\n"
250
  else:
251
- formatted_results += "No link found.\n" + '-' * 80 + '\n'
252
 
253
  prompt = f"""Instructions: {instructions}
254
  User Query: {query}
255
 
256
- Web Search Results:
257
  {formatted_results}
258
 
259
- Important: Provide a precise and factual response based solely on the information given above. Include specific dates, numbers, and sources where available. If exact information is not provided in the search results, clearly state that the information is not available in the given context. Do not make assumptions or provide information that is not directly supported by the search results.
260
 
261
  Assistant:"""
262
  return prompt
@@ -385,7 +412,7 @@ def save_text_to_pdf(text, output_path):
385
  def scrape_and_display(query, num_results, instructions, web_search=True, temperature=0.7, repetition_penalty=1.0, top_p=0.9):
386
  print(f"Scraping and displaying results for query: {query} with num_results: {num_results}")
387
  if web_search:
388
- search_results = google_search(query, num_results)
389
  formatted_prompt = format_prompt(query, search_results, instructions)
390
  generated_summary = generate_text(formatted_prompt, temperature=temperature, repetition_penalty=repetition_penalty, top_p=top_p)
391
  else:
 
84
  return text
85
 
86
  # Function to perform a Google search and retrieve results
87
+ def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_verify=None, instructions=""):
 
88
  print(f"Searching for term: {term}")
89
  escaped_term = urllib.parse.quote_plus(term)
90
  start = 0
91
  all_results = []
 
92
 
93
  with requests.Session() as session:
94
+ while len(all_results) < num_results:
95
  print(f"Fetching search results starting from: {start}")
96
  try:
97
  # Choose a random user agent
 
127
  keywords = term.split() # Use the search term as keywords for filtering
128
 
129
  for result in result_block:
130
+ if len(all_results) >= num_results:
131
+ break
132
  link = result.find("a", href=True)
133
  if link:
134
  link = link["href"]
 
138
  webpage.raise_for_status()
139
  visible_text = extract_text_from_webpage(webpage.text)
140
 
141
+ # Summarize the webpage content
142
+ summary = summarize_webpage(link, visible_text, term, instructions)
143
 
144
+ all_results.append({"link": link, "text": summary})
 
 
145
  except requests.exceptions.RequestException as e:
146
  print(f"Error fetching or processing {link}: {e}")
147
  all_results.append({"link": link, "text": None})
148
  else:
149
  print("No link found in result.")
150
  all_results.append({"link": None, "text": None})
151
+
152
  start += len(result_block)
153
+
154
  print(f"Total results fetched: {len(all_results)}")
155
  return all_results
156
 
157
def summarize_webpage(url, content, query, instructions, max_chars=1000):
    """Summarize a single webpage's content with respect to a search query.

    Preprocesses the raw page text, builds a per-page prompt, and asks the
    text-generation model for a short, query-focused summary.

    Args:
        url: The webpage URL (included in the prompt for attribution).
        content: Visible text extracted from the webpage.
        query: The user's search term; its whitespace-split words are used
            as filter keywords during preprocessing.
        instructions: Extra instructions forwarded verbatim into the prompt.
        max_chars: Hard cap on the returned summary length, including the
            trailing ellipsis when truncation occurs.

    Returns:
        The generated summary string (truncated to at most ``max_chars``
        characters), or the falsy value returned by ``generate_text`` (e.g.
        ``None``) when generation produced nothing.
    """
    # Filter/clean the raw page text, using the query words as keywords.
    preprocessed_text = preprocess_web_content(content, query.split())

    # Per-page prompt: the model sees the instructions, query, URL and the
    # preprocessed page body, and is asked for a concise (<200 word) summary.
    webpage_prompt = f"""
    Instructions: {instructions}
    Query: {query}
    URL: {url}

    Webpage content:
    {preprocessed_text}

    Summarize the above content in relation to the query. Focus on relevant information and include any specific data or facts mentioned. Keep the summary concise, ideally under 200 words.

    Summary:
    """

    # Low temperature + higher repetition penalty favor terse, factual output.
    summary = generate_text(webpage_prompt, temperature=0.3, repetition_penalty=1.2, top_p=0.9)

    # Enforce the length cap. BUGFIX: the original sliced to max_chars and
    # THEN appended "...", so results could exceed the cap by 3 characters;
    # slice to max_chars - 3 so the ellipsis fits inside the limit.
    if summary and len(summary) > max_chars:
        summary = summary[:max_chars - 3] + "..."

    return summary
183
+
184
  def preprocess_text(text):
185
  # Remove HTML tags
186
  text = BeautifulSoup(text, "html.parser").get_text()
 
271
  formatted_results = ""
272
  for result in search_results:
273
  link = result["link"]
274
+ summary = result["text"]
275
+ if link and summary:
276
+ formatted_results += f"URL: {link}\nSummary: {summary}\n{'-' * 80}\n"
277
  else:
278
+ formatted_results += "No relevant information found.\n" + '-' * 80 + '\n'
279
 
280
  prompt = f"""Instructions: {instructions}
281
  User Query: {query}
282
 
283
+ Summarized Web Search Results:
284
  {formatted_results}
285
 
286
+ Based on the above summarized information from multiple sources, provide a comprehensive and factual response to the user's query. Include specific dates, numbers, and sources where available. If information is conflicting or unclear, mention this in your response. Do not make assumptions or provide information that is not supported by the summaries.
287
 
288
  Assistant:"""
289
  return prompt
 
412
  def scrape_and_display(query, num_results, instructions, web_search=True, temperature=0.7, repetition_penalty=1.0, top_p=0.9):
413
  print(f"Scraping and displaying results for query: {query} with num_results: {num_results}")
414
  if web_search:
415
+ search_results = google_search(query, num_results, instructions=instructions)
416
  formatted_prompt = format_prompt(query, search_results, instructions)
417
  generated_summary = generate_text(formatted_prompt, temperature=temperature, repetition_penalty=repetition_penalty, top_p=top_p)
418
  else: