import re

import requests
from bs4 import BeautifulSoup

from .language_detector import is_english


def scrape_article(url):
    """Scrape and return the main textual content of a news article.

    Args:
        url (str): The article URL to fetch.

    Returns:
        str: The extracted article text (paragraphs separated by blank
            lines, truncated to ~10,000 characters), or a human-readable
            error message if fetching/parsing fails or the content is
            not in English.
    """
    try:
        # Browser-like User-Agent to avoid trivial bot blocking.
        headers = {
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/91.0.4472.124 Safari/537.36'
            )
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Prefer the semantic <article> element; fall back to every <p>
        # on the page if none is present.
        article = soup.find('article')
        paragraphs = article.find_all('p') if article else soup.find_all('p')

        # BUGFIX: normalize whitespace WITHIN each paragraph, keeping the
        # blank-line separator between paragraphs. The original ran
        # re.sub(r'\s+', ' ') on the already-joined text, which collapsed
        # the "\n\n" separators it had just inserted and flattened the
        # article into a single run of text.
        cleaned = [re.sub(r'\s+', ' ', p.get_text()).strip() for p in paragraphs]
        article_text = "\n\n".join(part for part in cleaned if part)

        # Trim to a reasonable length for LLM processing.
        if len(article_text) > 10000:
            article_text = article_text[:10000] + "..."

        # Verify the content is in English; only the first 500 characters
        # are checked to save processing time.
        if not is_english(article_text[:500]):
            return "Content not in English or insufficient text to analyze."

        return article_text
    except Exception as e:
        # Best-effort scraper: report failures as a string rather than
        # raising, so callers always receive text.
        return f"Error scraping article: {str(e)}"