Shreyas094 committed
Commit 9988100
Parent(s): 7b7085f
Update app.py

app.py CHANGED
@@ -6,6 +6,9 @@ from urllib.parse import urlparse
 from requests.adapters import HTTPAdapter
 from requests.packages.urllib3.util.retry import Retry
 from trafilatura import fetch_url, extract
+from trafilatura import extract
+from trafilatura.settings import use_config
+from urllib.request import urlopen, Request
 import json
 from huggingface_hub import InferenceClient
 import random
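The comment added in the second hunk suggests why fetch_url is no longer used for the download: urllib handles the request so a per-request User-Agent header can be set, and trafilatura is only asked to extract from HTML that has already been fetched. A minimal standalone sketch of how the new imports fit together (the timeout and the defensive decode are illustrative additions, not taken from app.py):

from urllib.request import urlopen, Request
from trafilatura import extract
from trafilatura.settings import use_config

def fetch_and_extract(url, ua):
    # Download with urllib so the User-Agent header is set per request
    req = Request(url, headers={'User-Agent': ua})
    with urlopen(req, timeout=10) as response:
        downloaded = response.read().decode('utf-8', errors='replace')

    # Mirror the same user agent in trafilatura's own config
    config = use_config()
    config.set("DEFAULT", "USER_AGENT", ua)

    # extract() parses the already-downloaded HTML; no further network access
    return extract(downloaded, config=config)

Since extract() only parses HTML that is handed to it, the USER_AGENT entry in the config mainly matters if trafilatura is later used to download pages itself; here it keeps both code paths consistent.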
@@ -393,8 +396,16 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura",
                     session.headers.update({'User-Agent': ua})
                     content = scrape_with_bs4(url, session)
                 else: # trafilatura
-
-
+                    # Use urllib to handle custom headers for trafilatura
+                    req = Request(url, headers={'User-Agent': ua})
+                    with urlopen(req) as response:
+                        downloaded = response.read()
+
+                    # Configure trafilatura to use a specific user agent
+                    config = use_config()
+                    config.set("DEFAULT", "USER_AGENT", ua)
+
+                    content = extract(downloaded, config=config)
 
                 if content:
                     break
@@ -404,6 +415,9 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura",
                         continue
                 else:
                     raise
+            except Exception as e:
+                logger.error(f"Error scraping {url} with User-Agent {ua}: {str(e)}")
+                continue
 
         if not content:
             logger.warning(f"Failed to scrape content from {url} after trying multiple User-Agents")
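The hunks only show fragments of search_and_scrape, so the surrounding control flow has to be inferred. A rough, self-contained sketch of the retry-over-User-Agents pattern they imply, with the user-agent list, logging setup, and function name invented for illustration and the bs4 branch left out:

import logging
from urllib.request import urlopen, Request
from trafilatura import extract
from trafilatura.settings import use_config

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Illustrative pool of User-Agents to rotate through
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Mozilla/5.0 (X11; Linux x86_64)",
]

def scrape_with_retries(url):
    """Try each User-Agent in turn until trafilatura extracts some content."""
    content = None
    for ua in USER_AGENTS:
        try:
            # Download with an explicit User-Agent header, as in the diff
            req = Request(url, headers={'User-Agent': ua})
            with urlopen(req, timeout=10) as response:
                downloaded = response.read()

            # Mirror the User-Agent in trafilatura's config, as the diff does
            config = use_config()
            config.set("DEFAULT", "USER_AGENT", ua)

            content = extract(downloaded, config=config)
            if content:
                break
        except Exception as e:
            # Same shape as the added catch-all: log and try the next User-Agent
            logger.error(f"Error scraping {url} with User-Agent {ua}: {str(e)}")
            continue

    if not content:
        logger.warning(f"Failed to scrape content from {url} after trying multiple User-Agents")
    return content

The real function also keeps a narrower except branch (the continue/raise block shown as context in the third hunk) ahead of the new catch-all; the sketch collapses that into a single handler for brevity.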
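For a quick manual check of the fallback behaviour, the sketch above can be exercised directly (the URL is only an example):

if __name__ == "__main__":
    text = scrape_with_retries("https://example.com/")
    print(text[:500] if text else "no content extracted")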