Shreyas094 committed on
Commit
9988100
1 Parent(s): 7b7085f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -2
app.py CHANGED
@@ -6,6 +6,9 @@ from urllib.parse import urlparse
6
  from requests.adapters import HTTPAdapter
7
  from requests.packages.urllib3.util.retry import Retry
8
  from trafilatura import fetch_url, extract
 
 
 
9
  import json
10
  from huggingface_hub import InferenceClient
11
  import random
@@ -393,8 +396,16 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura",
393
  session.headers.update({'User-Agent': ua})
394
  content = scrape_with_bs4(url, session)
395
  else: # trafilatura
396
- downloaded = fetch_url(url, headers={'User-Agent': ua})
397
- content = extract(downloaded)
 
 
 
 
 
 
 
 
398
 
399
  if content:
400
  break
@@ -404,6 +415,9 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura",
404
  continue
405
  else:
406
  raise
 
 
 
407
 
408
  if not content:
409
  logger.warning(f"Failed to scrape content from {url} after trying multiple User-Agents")
 
6
  from requests.adapters import HTTPAdapter
7
  from requests.packages.urllib3.util.retry import Retry
8
  from trafilatura import fetch_url, extract
9
+ from trafilatura import extract
10
+ from trafilatura.settings import use_config
11
+ from urllib.request import urlopen, Request
12
  import json
13
  from huggingface_hub import InferenceClient
14
  import random
 
396
  session.headers.update({'User-Agent': ua})
397
  content = scrape_with_bs4(url, session)
398
  else: # trafilatura
399
+ # Use urllib to handle custom headers for trafilatura
400
+ req = Request(url, headers={'User-Agent': ua})
401
+ with urlopen(req) as response:
402
+ downloaded = response.read()
403
+
404
+ # Configure trafilatura to use a specific user agent
405
+ config = use_config()
406
+ config.set("DEFAULT", "USER_AGENT", ua)
407
+
408
+ content = extract(downloaded, config=config)
409
 
410
  if content:
411
  break
 
415
  continue
416
  else:
417
  raise
418
+ except Exception as e:
419
+ logger.error(f"Error scraping {url} with User-Agent {ua}: {str(e)}")
420
+ continue
421
 
422
  if not content:
423
  logger.warning(f"Failed to scrape content from {url} after trying multiple User-Agents")