File size: 1,773 Bytes
7516245
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import requests
from bs4 import BeautifulSoup
import re
from .language_detector import is_english

def scrape_article(url, *, timeout=10, max_length=10000):
    """
    Scrape the content from a news article URL

    The page is fetched with a browser-like User-Agent and parsed with
    BeautifulSoup. Paragraph text is taken from the first <article> element
    when that element contains <p> tags, otherwise from every <p> tag on
    the page. Whitespace runs are collapsed to single spaces, the text is
    capped at ``max_length`` characters (with "..." appended when cut), and
    the first 500 characters are passed to ``is_english``.

    Args:
        url (str): The URL to scrape
        timeout (int | float): Timeout forwarded to ``requests.get``
        max_length (int): Maximum number of characters kept before "..."
            is appended

    Returns:
        str: The extracted article content, the message
            "Content not in English or insufficient text to analyze.", or
            "Error scraping article: <details>" when any step raises
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Prefer paragraphs inside <article>; if that element is missing or
        # holds no <p> tags, fall back to every paragraph on the page.
        article = soup.find('article')
        paragraphs = article.find_all('p') if article else []
        if not paragraphs:
            paragraphs = soup.find_all('p')

        # Join with a single space: the whitespace collapse below would
        # flatten any paragraph separator anyway, so the result is one line.
        article_text = " ".join(p.get_text() for p in paragraphs)
        article_text = re.sub(r'\s+', ' ', article_text).strip()

        # Nothing extracted: report it directly rather than handing an
        # empty string to the language detector.
        if not article_text:
            return "Content not in English or insufficient text to analyze."

        # Trim to reasonable length for LLM processing
        if len(article_text) > max_length:
            article_text = article_text[:max_length] + "..."

        # Verify the content is in English
        if not is_english(article_text[:500]):  # Check first 500 chars to save processing time
            return "Content not in English or insufficient text to analyze."

        return article_text

    except Exception as e:
        # Deliberate catch-all boundary: callers always receive a string,
        # never an exception, whatever fails during fetch or parsing.
        return f"Error scraping article: {str(e)}"