KingNish committed on
Commit
dd7d8ba
·
verified ·
1 Parent(s): 9771f4a

Update chatbot.py

Browse files
Files changed (1) hide show
  1. chatbot.py +25 -42
chatbot.py CHANGED
@@ -224,47 +224,20 @@ def extract_images_from_msg_list(msg_list):
224
  all_images.append(c_)
225
  return all_images
226
 
227
- from duckduckgo_search import DDGS
228
- from threading import Thread
229
- import random
230
- from bs4 import BeautifulSoup
231
- from functools import lru_cache
232
- import requests
233
-
234
- _useragent_list = [
235
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
236
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
237
- 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
238
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
239
- 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
240
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62',
241
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0'
242
- ]
243
-
244
- def get_useragent():
245
- return random.choice(_useragent_list)
246
-
247
  @lru_cache(maxsize=128)
248
  def extract_text_from_webpage(html_content):
 
249
  soup = BeautifulSoup(html_content, "html.parser")
 
250
  for tag in soup(["script", "style", "header", "footer", "nav"]):
251
  tag.extract()
252
- return soup.get_text(strip=True)
253
-
254
- def fetch_and_extract(link, max_chars_per_page):
255
- """Fetches webpage content and extracts text."""
256
- try:
257
- webpage = requests.get(link, headers={"User-Agent": get_useragent()})
258
- webpage.raise_for_status()
259
- visible_text = extract_text_from_webpage(webpage.text)
260
- if len(visible_text) > max_chars_per_page:
261
- visible_text = visible_text[:max_chars_per_page] + "..."
262
- return {"link": link, "text": visible_text}
263
- except requests.exceptions.RequestException as e:
264
- return {"link": link, "text": None}
265
 
266
  # Perform a Google search and return the results
267
- def search(term, num_results=3, lang="en", timeout=5, safe="active", ssl_verify=None):
268
  """Performs a Google search and returns the results."""
269
  escaped_term = urllib.parse.quote_plus(term)
270
  start = 0
@@ -295,13 +268,22 @@ def search(term, num_results=3, lang="en", timeout=5, safe="active", ssl_verify=
295
  continue
296
  for result in result_block:
297
  link = result.find("a", href=True)
298
- link = link["href"]
299
- thread = Thread(target=lambda: all_results.append(fetch_and_extract(link, max_chars_per_page)))
300
- threads.append(thread)
301
- thread.start()
302
- for thread in threads:
303
- thread.join()
304
- gr.Info("Extracting Important Info..")
 
 
 
 
 
 
 
 
 
305
  return all_results
306
 
307
  # Format the prompt for the language model
@@ -330,7 +312,7 @@ def update_history(answer="", question=""):
330
  return history
331
 
332
  # Define a function for model inference
333
- @spaces.GPU(duration=45, queue=False)
334
  def model_inference(
335
  user_prompt,
336
  chat_history,
@@ -390,6 +372,7 @@ def model_inference(
390
  output += response.token.text
391
  yield output
392
  update_history(output, user_prompt)
 
393
  return
394
  else:
395
  if user_prompt["text"].strip() == "" and not user_prompt["files"]:
 
224
  all_images.append(c_)
225
  return all_images
226
 
227
+ # Extract the visible text from a web page's HTML content
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
  @lru_cache(maxsize=128)
229
  def extract_text_from_webpage(html_content):
230
+ """Extracts visible text from HTML content using BeautifulSoup."""
231
  soup = BeautifulSoup(html_content, "html.parser")
232
+ # Remove unwanted tags
233
  for tag in soup(["script", "style", "header", "footer", "nav"]):
234
  tag.extract()
235
+ # Get the remaining visible text
236
+ visible_text = soup.get_text(strip=True)
237
+ return visible_text
 
 
 
 
 
 
 
 
 
 
238
 
239
  # Perform a Google search and return the results
240
+ def search(term, num_results=2, lang="en", advanced=True, timeout=5, safe="active", ssl_verify=None):
241
  """Performs a Google search and returns the results."""
242
  escaped_term = urllib.parse.quote_plus(term)
243
  start = 0
 
268
  continue
269
  for result in result_block:
270
  link = result.find("a", href=True)
271
+ if link:
272
+ link = link["href"]
273
+ try:
274
+ webpage = session.get(link, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"})
275
+ webpage.raise_for_status()
276
+ visible_text = extract_text_from_webpage(webpage.text)
277
+ # Truncate text if it's too long
278
+ if len(visible_text) > max_chars_per_page:
279
+ visible_text = visible_text[:max_chars_per_page] + "..."
280
+ all_results.append({"link": link, "text": visible_text})
281
+ except requests.exceptions.RequestException as e:
282
+ print(f"Error fetching or processing {link}: {e}")
283
+ all_results.append({"link": link, "text": None})
284
+ else:
285
+ all_results.append({"link": None, "text": None})
286
+ start += len(result_block)
287
  return all_results
288
 
289
  # Format the prompt for the language model
 
312
  return history
313
 
314
  # Define a function for model inference
315
+ @spaces.GPU(duration=30, queue=False)
316
  def model_inference(
317
  user_prompt,
318
  chat_history,
 
372
  output += response.token.text
373
  yield output
374
  update_history(output, user_prompt)
375
+ print(history)
376
  return
377
  else:
378
  if user_prompt["text"].strip() == "" and not user_prompt["files"]: