import re

import requests
from bs4 import BeautifulSoup

from .language_detector import is_english


def scrape_article(url):
    """Scrape and return the main textual content of a news article.

    Args:
        url (str): The article URL to fetch.

    Returns:
        str: The extracted article text (paragraphs separated by blank
            lines, truncated to ~10,000 characters), or a human-readable
            error message if fetching/parsing fails or the content is
            not in English.
    """
    try:
        # Browser-like User-Agent to avoid trivial bot blocking.
        headers = {
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/91.0.4472.124 Safari/537.36'
            )
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Prefer the semantic <article> element; fall back to every <p>
        # on the page if none is present.
        article = soup.find('article')
        paragraphs = article.find_all('p') if article else soup.find_all('p')

        # BUGFIX: normalize whitespace WITHIN each paragraph, keeping the
        # blank-line separator between paragraphs. The original ran
        # re.sub(r'\s+', ' ') on the already-joined text, which collapsed
        # the "\n\n" separators it had just inserted and flattened the
        # article into a single run of text.
        cleaned = [re.sub(r'\s+', ' ', p.get_text()).strip() for p in paragraphs]
        article_text = "\n\n".join(part for part in cleaned if part)

        # Trim to a reasonable length for LLM processing.
        if len(article_text) > 10000:
            article_text = article_text[:10000] + "..."

        # Verify the content is in English; only the first 500 characters
        # are checked to save processing time.
        if not is_english(article_text[:500]):
            return "Content not in English or insufficient text to analyze."

        return article_text
    except Exception as e:
        # Best-effort scraper: report failures as a string rather than
        # raising, so callers always receive text.
        return f"Error scraping article: {str(e)}"