Shreyas094 committed
Commit 1a81bf1
1 Parent(s): 6400d84

Update app.py

Files changed (1):
  1. app.py +69 -70

app.py CHANGED
@@ -69,7 +69,7 @@ def is_valid_url(url):
     except ValueError:
         return False
 
-def scrape_with_bs4(url, session):
+def scrape_with_bs4(url, session, max_chars=None):
     try:
         response = session.get(url, timeout=5)
         response.raise_for_status()
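
For orientation: the body of is_valid_url lies outside this hunk. Below is a plausible urlparse-based sketch consistent with the ValueError handler shown above; it is an assumption, not the file's actual code.

from urllib.parse import urlparse

def is_valid_url(url):
    # Hypothetical reconstruction: require a scheme and a network location.
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except ValueError:
        # urlparse raises ValueError on malformed URLs (e.g. bad ports).
        return False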
@@ -78,20 +78,20 @@ def scrape_with_bs4(url, session):
         main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
 
         if main_content:
-            content = main_content.get_text(strip=True)
+            content = main_content.get_text(strip=True, separator='\n')
         else:
-            content = soup.get_text(strip=True)
+            content = soup.get_text(strip=True, separator='\n')
 
-        return content
+        return content[:max_chars] if max_chars else content
     except Exception as e:
         logger.error(f"Error scraping {url} with BeautifulSoup: {e}")
         return ""
 
-def scrape_with_trafilatura(url):
+def scrape_with_trafilatura(url, max_chars=None):
     try:
         downloaded = fetch_url(url)
-        content = extract(downloaded)
-        return content or ""
+        content = extract(downloaded, include_comments=False, include_tables=True, no_fallback=False)
+        return (content or "")[:max_chars] if max_chars else (content or "")
     except Exception as e:
         logger.error(f"Error scraping {url} with Trafilatura: {e}")
         return ""
@@ -371,72 +371,71 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura",
             break
 
        for result in results:
-            if len(scraped_content) >= num_results:
-                break
-
-            url = result.get('url', '')
-            title = result.get('title', 'No title')
-
-            if not is_valid_url(url):
-                logger.warning(f"Invalid URL: {url}")
-                continue
-
-            try:
-                logger.info(f"Scraping content from: {url}")
-
-                # Implement a retry mechanism with different user agents
-                user_agents = [
-                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-                    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
-                    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
-                ]
-
-                content = ""
-                for ua in user_agents:
-                    try:
-                        if scraper == "bs4":
-                            session.headers.update({'User-Agent': ua})
-                            content = scrape_with_bs4(url, session)
-                        else:  # trafilatura
-                            # Use urllib to handle custom headers for trafilatura
-                            req = Request(url, headers={'User-Agent': ua})
-                            with urlopen(req) as response:
-                                downloaded = response.read()
-
-                            # Configure trafilatura to use a specific user agent
-                            config = use_config()
-                            config.set("DEFAULT", "USER_AGENT", ua)
-
-                            content = extract(downloaded, config=config)
+            if len(scraped_content) >= num_results:
+                break
+
+            url = result.get('url', '')
+            title = result.get('title', 'No title')
+
+            if not is_valid_url(url):
+                logger.warning(f"Invalid URL: {url}")
+                continue
+
+            try:
+                logger.info(f"Scraping content from: {url}")
+
+                # Implement a retry mechanism with different user agents
+                user_agents = [
+                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+                    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
+                    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+                ]
+
+                content = ""
+                for ua in user_agents:
+                    try:
+                        if scraper == "bs4":
+                            session.headers.update({'User-Agent': ua})
+                            content = scrape_with_bs4(url, session, max_chars)
+                        else:  # trafilatura
+                            # Use urllib to handle custom headers for trafilatura
+                            req = Request(url, headers={'User-Agent': ua})
+                            with urlopen(req) as response:
+                                downloaded = response.read()
 
-                        if content:
-                            break
-                    except requests.exceptions.HTTPError as e:
-                        if e.response.status_code == 403:
-                            logger.warning(f"403 Forbidden error with User-Agent: {ua}. Trying next...")
-                            continue
-                        else:
-                            raise
-                    except Exception as e:
-                        logger.error(f"Error scraping {url} with User-Agent {ua}: {str(e)}")
+                            # Configure trafilatura to use a specific user agent
+                            config = use_config()
+                            config.set("DEFAULT", "USER_AGENT", ua)
+
+                            content = scrape_with_trafilatura(url, max_chars)
+
+                        if content:
+                            break
+                    except requests.exceptions.HTTPError as e:
+                        if e.response.status_code == 403:
+                            logger.warning(f"403 Forbidden error with User-Agent: {ua}. Trying next...")
                            continue
-
-                if not content:
-                    logger.warning(f"Failed to scrape content from {url} after trying multiple User-Agents")
+                        else:
+                            raise
+                    except Exception as e:
+                        logger.error(f"Error scraping {url} with User-Agent {ua}: {str(e)}")
                        continue
-
-                # Limit content to max_chars
-                scraped_content.append({
-                    "title": title,
-                    "url": url,
-                    "content": content[:max_chars],
-                    "scraper": scraper
-                })
-                logger.info(f"Successfully scraped content from {url}. Total scraped: {len(scraped_content)}")
-            except requests.exceptions.RequestException as e:
-                logger.error(f"Error scraping {url}: {e}")
-            except Exception as e:
-                logger.error(f"Unexpected error while scraping {url}: {e}")
+
+                if not content:
+                    logger.warning(f"Failed to scrape content from {url} after trying multiple User-Agents")
+                    continue
+
+                scraped_content.append({
+                    "title": title,
+                    "url": url,
+                    "content": content,  # No need to slice here as it's already limited
+                    "scraper": scraper
+                })
+                logger.info(f"Successfully scraped content from {url}. Total scraped: {len(scraped_content)}")
+            except requests.exceptions.RequestException as e:
+                logger.error(f"Error scraping {url}: {e}")
+            except Exception as e:
+                logger.error(f"Unexpected error while scraping {url}: {e}")
 
        page += 1
 
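The loop above retries each URL with three desktop user agents, moving on to the next agent on HTTP 403 and logging other failures. Condensed into a self-contained sketch below; the function name and the bare requests.get call are illustrative, not from app.py.

import logging

import requests

logger = logging.getLogger(__name__)

# The same three desktop user agents as in the diff.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
]


def fetch_with_ua_rotation(url, timeout=5):
    # Try each user agent in turn: a 403 moves on to the next one,
    # other HTTP errors propagate, and network errors are logged.
    for ua in USER_AGENTS:
        try:
            response = requests.get(url, headers={'User-Agent': ua}, timeout=timeout)
            response.raise_for_status()
            return response.text
        except requests.exceptions.HTTPError as e:
            if e.response is not None and e.response.status_code == 403:
                logger.warning(f"403 Forbidden with User-Agent: {ua}. Trying next...")
                continue
            raise
        except requests.exceptions.RequestException as e:
            logger.error(f"Error fetching {url} with User-Agent {ua}: {e}")
            continue
    return ""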
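A note on the trafilatura branch: the new code still builds config via use_config(), but scrape_with_trafilatura(url, max_chars) downloads the page again through fetch_url and never receives that config or the bytes already fetched with the custom User-Agent. Below is a sketch in the spirit of the removed lines, reusing the downloaded bytes and passing the config to extract; the helper name is illustrative, not from app.py.

from urllib.request import Request, urlopen

from trafilatura import extract
from trafilatura.settings import use_config


def extract_with_ua(url, ua, max_chars=None):
    # Illustrative helper: fetch once with the custom User-Agent...
    req = Request(url, headers={'User-Agent': ua})
    with urlopen(req) as response:
        downloaded = response.read()
    config = use_config()
    config.set('DEFAULT', 'USER_AGENT', ua)
    # ...then extract from the bytes in hand, as the removed code did.
    content = extract(downloaded, config=config)
    return (content or '')[:max_chars] if max_chars else (content or '')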