Shreyas094 committed on
Commit
ef24902
1 Parent(s): 84b4903

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -3
app.py CHANGED
@@ -123,7 +123,7 @@ def scrape_with_bs4(url, session):
123
  @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
124
  def scrape_with_trafilatura(url):
125
  try:
126
- downloaded = fetch_url(url, timeout=10)
127
  if downloaded is None:
128
  raise ScrapingError("Failed to download content")
129
  content = extract(downloaded)
@@ -433,11 +433,19 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura",
433
  else: # trafilatura
434
  content = scrape_with_trafilatura(url)
435
 
436
- # Limit content to max_chars
 
 
 
 
 
 
 
 
437
  scraped_content.append({
438
  "title": title,
439
  "url": url,
440
- "content": content[:max_chars],
441
  "scraper": scraper
442
  })
443
  except requests.exceptions.RequestException as e:
 
123
  @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
124
  def scrape_with_trafilatura(url):
125
  try:
126
+ downloaded = fetch_url(url) # Remove the timeout parameter
127
  if downloaded is None:
128
  raise ScrapingError("Failed to download content")
129
  content = extract(downloaded)
 
433
  else: # trafilatura
434
  content = scrape_with_trafilatura(url)
435
 
436
+ # Handle different types of content and limit to max_chars
437
+ if isinstance(content, dict) and 'content' in content:
438
+ content['content'] = content['content'][:max_chars]
439
+ elif isinstance(content, str):
440
+ content = content[:max_chars]
441
+ else:
442
+ logger.warning(f"Unexpected content type for URL: {url}")
443
+ content = str(content)[:max_chars]
444
+
445
  scraped_content.append({
446
  "title": title,
447
  "url": url,
448
+ "content": content,
449
  "scraper": scraper
450
  })
451
  except requests.exceptions.RequestException as e: