Shreyas094
committed on
Commit
•
ef24902
1
Parent(s):
84b4903
Update app.py
Browse files
app.py
CHANGED
@@ -123,7 +123,7 @@ def scrape_with_bs4(url, session):
|
|
123 |
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
|
124 |
def scrape_with_trafilatura(url):
|
125 |
try:
|
126 |
-
downloaded = fetch_url(url
|
127 |
if downloaded is None:
|
128 |
raise ScrapingError("Failed to download content")
|
129 |
content = extract(downloaded)
|
@@ -433,11 +433,19 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura",
|
|
433 |
else: # trafilatura
|
434 |
content = scrape_with_trafilatura(url)
|
435 |
|
436 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
437 |
scraped_content.append({
|
438 |
"title": title,
|
439 |
"url": url,
|
440 |
-
"content": content
|
441 |
"scraper": scraper
|
442 |
})
|
443 |
except requests.exceptions.RequestException as e:
|
|
|
123 |
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
|
124 |
def scrape_with_trafilatura(url):
|
125 |
try:
|
126 |
+
downloaded = fetch_url(url) # Remove the timeout parameter
|
127 |
if downloaded is None:
|
128 |
raise ScrapingError("Failed to download content")
|
129 |
content = extract(downloaded)
|
|
|
433 |
else: # trafilatura
|
434 |
content = scrape_with_trafilatura(url)
|
435 |
|
436 |
+
# Handle different types of content and limit to max_chars
|
437 |
+
if isinstance(content, dict) and 'content' in content:
|
438 |
+
content['content'] = content['content'][:max_chars]
|
439 |
+
elif isinstance(content, str):
|
440 |
+
content = content[:max_chars]
|
441 |
+
else:
|
442 |
+
logger.warning(f"Unexpected content type for URL: {url}")
|
443 |
+
content = str(content)[:max_chars]
|
444 |
+
|
445 |
scraped_content.append({
|
446 |
"title": title,
|
447 |
"url": url,
|
448 |
+
"content": content,
|
449 |
"scraper": scraper
|
450 |
})
|
451 |
except requests.exceptions.RequestException as e:
|