Spaces:
Runtime error
Runtime error
File size: 865 Bytes
2700879 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 |
from bs4 import BeautifulSoup
import requests
from CrawDag.models import News
from newspaper import Article
def clean_content(text: str) -> str:
    """Flatten article text onto a single line.

    Surrounding whitespace is stripped first; afterwards every remaining
    newline or tab character is replaced by exactly one space. Runs of
    whitespace are not collapsed, so two consecutive newlines become two
    spaces.

    Args:
        text: Raw text to clean.

    Returns:
        The stripped text with newlines and tabs turned into spaces.
    """
    # One translation pass handles both characters at once.
    whitespace_to_space = {ord("\n"): " ", ord("\t"): " "}
    trimmed = text.strip()
    return trimmed.translate(whitespace_to_space)
def scrape_basic_article(news: News, timeout: float = 10.0) -> None:
    """Download ``news.link`` and populate ``news.content`` and ``news.html``.

    The page is parsed with BeautifulSoup's ``html.parser``. The text of
    every ``<p>`` element except the last one is joined with single spaces,
    passed through ``clean_content`` and stored in ``news.content``. The
    first ``<article>`` element found is stored in ``news.html``.

    Args:
        news: Item to fill in place; its ``link`` attribute is the URL to
            fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on an unresponsive host.

    Raises:
        requests.exceptions.RequestException: If the request fails, including
            ``requests.exceptions.Timeout`` when ``timeout`` elapses.
    """
    # NOTE(review): verify=False disables TLS certificate verification, which
    # exposes the crawler to man-in-the-middle tampering. It is kept here so
    # sites with broken certificates keep working — confirm this is intended.
    response = requests.get(news.link, verify=False, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    # The final <p> is skipped on purpose by the slice; presumably it is a
    # footer/trailer rather than article text — TODO confirm.
    paragraphs = soup.find_all('p')
    content = ' '.join(para.get_text() for para in paragraphs[:-1])

    news.content = clean_content(content)
    # ``find`` yields a bs4 Tag, or None when the page has no <article>.
    news.html = soup.find('article')
def scrape_news_v2(news: News):
    """Populate ``news`` using the ``newspaper`` article extractor.

    Downloads and parses ``news.link`` with ``Article``. The extracted text
    is split on newlines, its final line is dropped, and the remaining lines
    are re-joined with newlines and stripped before being stored in
    ``news.content``. The downloaded page markup (``article.html``) is stored
    in ``news.html``.

    Args:
        news: Item to fill in place; its ``link`` attribute is the URL to
            fetch.
    """
    fetched = Article(news.link)
    fetched.download()
    fetched.parse()

    # Everything but the last line of the extracted text is kept.
    kept_lines = fetched.text.split('\n')[:-1]
    news.content = '\n'.join(kept_lines).strip()
    news.html = fetched.html