import re

import requests
from bs4 import BeautifulSoup

from .language_detector import is_english


def scrape_article(url):
    """
    Scrape the content from a news article URL

    Args:
        url (str): The URL to scrape

    Returns:
        str: The extracted article content or error message
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Prefer paragraphs inside an <article> tag; fall back to every <p>
        # on the page if no <article> element is present.
        article = soup.find('article')
        if article:
            paragraphs = article.find_all('p')
        else:
            paragraphs = soup.find_all('p')

        article_text = "\n\n".join([p.get_text().strip() for p in paragraphs])

        # Normalize whitespace (this also collapses the paragraph breaks
        # inserted above into single spaces).
        article_text = re.sub(r'\s+', ' ', article_text).strip()

        # Truncate very long articles to keep downstream processing manageable.
        if len(article_text) > 10000:
            article_text = article_text[:10000] + "..."

        # Check a sample of the text for language before returning it.
        if not is_english(article_text[:500]):
            return "Content not in English or insufficient text to analyze."

        return article_text

    except Exception as e:
        return f"Error scraping article: {str(e)}"
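

# Usage sketch (not part of the original module): because of the relative
# import above, call scrape_article from elsewhere in the package rather than
# running this file directly. The package path and URL below are hypothetical
# examples chosen only for illustration.
#
#     from scraper.article_scraper import scrape_article
#
#     text = scrape_article("https://example.com/news/some-story")
#     if text.startswith("Error scraping article") or text.startswith("Content not in English"):
#         print("Could not analyze:", text)
#     else:
#         print(text[:200])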