# AiNewsV2/tools/scraper.py
# Hosting-page header (presumably author / commit message / short hash): Phoenix21 - "Modular code" - 7516245
import requests
from bs4 import BeautifulSoup
import re
from .language_detector import is_english
def scrape_article(url, *, timeout=10, max_chars=10000):
    """
    Scrape the content from a news article URL.

    The page is fetched with a browser-like User-Agent, the text of every
    <p> inside the first <article> element is collected (falling back to
    every <p> on the page when there is no <article> or it contains no
    paragraphs), all whitespace runs are collapsed to single spaces, and the
    result is truncated to ``max_chars`` characters.

    Args:
        url (str): The URL to scrape
        timeout (float): Timeout in seconds passed to ``requests.get``
        max_chars (int): Maximum number of characters kept; longer text is
            cut and suffixed with "..."

    Returns:
        str: The extracted article content, the fixed message
            "Content not in English or insufficient text to analyze." when
            ``is_english`` rejects the text, or
            "Error scraping article: <reason>" when any step raises.
            This function does not propagate exceptions.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        # Parse the raw bytes instead of response.text: when the HTTP header
        # carries no charset, Requests falls back to ISO-8859-1 for text/*
        # responses, which garbles UTF-8 pages. Only trust the header's
        # encoding when it was actually declared; otherwise let BeautifulSoup
        # sniff it (e.g. from a <meta charset> tag).
        content_type = response.headers.get('Content-Type', '')
        declared_encoding = response.encoding if 'charset' in content_type.lower() else None
        soup = BeautifulSoup(response.content, 'html.parser', from_encoding=declared_encoding)

        # Prefer paragraphs inside the main <article>; fall back to every
        # paragraph on the page when that yields nothing.
        article = soup.find('article')
        paragraphs = article.find_all('p') if article is not None else []
        if not paragraphs:
            paragraphs = soup.find_all('p')

        # Join with a space: every whitespace run (including paragraph
        # boundaries) is collapsed to a single space right below anyway.
        article_text = " ".join(p.get_text() for p in paragraphs)
        article_text = re.sub(r'\s+', ' ', article_text).strip()

        # Trim to reasonable length for LLM processing
        if len(article_text) > max_chars:
            article_text = article_text[:max_chars] + "..."

        # Verify the content is in English; only the first 500 chars are
        # checked to save processing time.
        if not is_english(article_text[:500]):
            return "Content not in English or insufficient text to analyze."

        return article_text
    except Exception as e:
        # Deliberate catch-all boundary: callers receive an error string
        # rather than an exception.
        return f"Error scraping article: {e}"