import re

import requests
from bs4 import BeautifulSoup

from .language_detector import is_english


def scrape_article(url):
    """
    Scrape the content from a news article URL

    Args:
        url (str): The URL to scrape

    Returns:
        str: The extracted article content or error message
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Prefer paragraphs inside an <article> tag; fall back to every <p>
        # on the page if no <article> element is present.
        article = soup.find('article')
        if article:
            paragraphs = article.find_all('p')
        else:
            paragraphs = soup.find_all('p')

        article_text = "\n\n".join([p.get_text().strip() for p in paragraphs])

        # Normalize whitespace (this also collapses the paragraph breaks
        # inserted above into single spaces).
        article_text = re.sub(r'\s+', ' ', article_text).strip()

        # Truncate very long articles to keep downstream processing manageable.
        if len(article_text) > 10000:
            article_text = article_text[:10000] + "..."

        # Check a sample of the text for language before returning it.
        if not is_english(article_text[:500]):
            return "Content not in English or insufficient text to analyze."

        return article_text

    except Exception as e:
        return f"Error scraping article: {str(e)}"
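

# Usage sketch (not part of the original module): because of the relative
# import above, call scrape_article from elsewhere in the package rather than
# running this file directly. The package path and URL below are hypothetical
# examples chosen only for illustration.
#
#     from scraper.article_scraper import scrape_article
#
#     text = scrape_article("https://example.com/news/some-story")
#     if text.startswith("Error scraping article") or text.startswith("Content not in English"):
#         print("Could not analyze:", text)
#     else:
#         print(text[:200])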