KingNish committed on
Commit
1c3458d
·
verified ·
1 Parent(s): 3736069

Update chatbot.py

Browse files
Files changed (1) hide show
  1. chatbot.py +26 -32
chatbot.py CHANGED
@@ -230,14 +230,14 @@ def extract_text_from_webpage(html_content):
230
  """Extracts visible text from HTML content using BeautifulSoup."""
231
  soup = BeautifulSoup(html_content, "html.parser")
232
  # Remove unwanted tags
233
- for tag in soup(["script", "style", "header", "footer", "nav"]):
234
  tag.extract()
235
  # Get the remaining visible text
236
  visible_text = soup.get_text(strip=True)
237
  return visible_text
238
 
239
  # Perform a Google search and return the results
240
- def search(term, num_results=2, lang="en", advanced=True, timeout=5, safe="active", ssl_verify=None):
241
  """Performs a Google search and returns the results."""
242
  escaped_term = urllib.parse.quote_plus(term)
243
  start = 0
@@ -246,43 +246,37 @@ def search(term, num_results=2, lang="en", advanced=True, timeout=5, safe="activ
246
  max_chars_per_page = 8000 # Adjust this value based on your token limit and average webpage length
247
 
248
  with requests.Session() as session:
249
- while start < num_results:
250
- resp = session.get(
251
- url="https://www.google.com/search",
252
- headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"},
253
  params={
254
  "q": term,
255
- "num": num_results - start,
256
- "start": start,
257
  "udm": 14,
258
  },
259
  timeout=timeout,
260
  verify=ssl_verify,
261
- )
262
- resp.raise_for_status()
263
- soup = BeautifulSoup(resp.text, "html.parser")
264
- result_block = soup.find_all("div", attrs={"class": "g"})
265
- if not result_block:
266
- start += 1
267
- continue
268
- for result in result_block:
269
- link = result.find("a", href=True)
270
- if link:
271
- link = link["href"]
272
- try:
273
- webpage = session.get(link, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"})
274
- webpage.raise_for_status()
275
- visible_text = extract_text_from_webpage(webpage.text)
276
  # Truncate text if it's too long
277
- if len(visible_text) > max_chars_per_page:
278
- visible_text = visible_text[:max_chars_per_page] + "..."
279
- all_results.append({"link": link, "text": visible_text})
280
- except requests.exceptions.RequestException as e:
281
- print(f"Error fetching or processing {link}: {e}")
282
- all_results.append({"link": link, "text": None})
283
- else:
284
- all_results.append({"link": None, "text": None})
285
- start += len(result_block)
286
  return all_results
287
 
288
  # Format the prompt for the language model
 
230
  """Extracts visible text from HTML content using BeautifulSoup."""
231
  soup = BeautifulSoup(html_content, "html.parser")
232
  # Remove unwanted tags
233
+ for tag in soup(["script", "style", "header", "footer", "nav", "form", "svg"]):
234
  tag.extract()
235
  # Get the remaining visible text
236
  visible_text = soup.get_text(strip=True)
237
  return visible_text
238
 
239
  # Perform a Google search and return the results
240
+ def search(term, num_results=3, lang="en", advanced=True, timeout=5, safe="active", ssl_verify=None):
241
  """Performs a Google search and returns the results."""
242
  escaped_term = urllib.parse.quote_plus(term)
243
  start = 0
 
246
  max_chars_per_page = 8000 # Adjust this value based on your token limit and average webpage length
247
 
248
  with requests.Session() as session:
249
+ resp = session.get(
250
+ url="https://www.google.com/search",
251
+ headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"},
 
252
  params={
253
  "q": term,
254
+ "num": num_results,
 
255
  "udm": 14,
256
  },
257
  timeout=timeout,
258
  verify=ssl_verify,
259
+ )
260
+ resp.raise_for_status()
261
+ soup = BeautifulSoup(resp.text, "html.parser")
262
+ result_block = soup.find_all("div", attrs={"class": "g"})
263
+ for result in result_block:
264
+ link = result.find("a", href=True)
265
+ if link:
266
+ link = link["href"]
267
+ try:
268
+ webpage = session.get(link, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"})
269
+ webpage.raise_for_status()
270
+ visible_text = extract_text_from_webpage(webpage.text)
 
 
 
271
  # Truncate text if it's too long
272
+ if len(visible_text) > max_chars_per_page:
273
+ visible_text = visible_text[:max_chars_per_page]
274
+ all_results.append({"link": link, "text": visible_text})
275
+ except requests.exceptions.RequestException as e:
276
+ print(f"Error fetching or processing {link}: {e}")
277
+ all_results.append({"link": link, "text": None})
278
+ else:
279
+ all_results.append({"link": None, "text": None})
 
280
  return all_results
281
 
282
  # Format the prompt for the language model