Spaces:

Eurosmart
/

airflow-pipeline

Runtime error

2700879 4 months ago

774 Bytes

	import logging

	from newspaper import Article, ArticleException

	from CrawDag.models import News
	from CrawDag.scraping.Scraper import Scraper

	class ScrapeArticle(Scraper):
	    """Scraper that fills in each news item's text and raw HTML.

	    Pages are fetched and parsed with ``newspaper.Article``; items that
	    cannot be fetched, or that yield too little text, are left out of
	    the result.
	    """

	    # Extracted text must be strictly longer than this to be kept.
	    MIN_CONTENT_LENGTH = 10

	    def __init__(self, listNews: list[News]) -> None:
	        """Hand the items to scrape over to ``Scraper.__init__``."""
	        super().__init__(listNews)

	    def scrape(self) -> list[News]:
	        """Download and parse every item in ``self.listNews``.

	        Each successfully parsed item is updated in place: ``content``
	        receives the extracted text (minus its last newline-separated
	        segment) and ``html`` the raw page HTML.

	        Returns:
	            The items whose ``content`` is longer than
	            ``MIN_CONTENT_LENGTH`` characters, in their original order.
	        """
	        newsList: list[News] = []
	        for news in self.listNews:
	            article = Article(news.link)
	            try:
	                article.download()
	                article.parse()
	            except ArticleException as err:
	                # One unreachable or unparseable link must not abort the
	                # whole batch and throw away what was already scraped.
	                logging.getLogger(__name__).warning(
	                    "Skipping article %s: %s", news.link, err
	                )
	                continue

	            # The last segment of the text is dropped.
	            # NOTE(review): presumably this trims a trailing byline or
	            # footer -- confirm. As a consequence a single-paragraph
	            # article ends up empty and is filtered out below.
	            paragraphs = article.text.split('\n')
	            news.content = '\n'.join(paragraphs[:-1]).strip()
	            news.html = article.html

	            # The length check alone also rejects empty content.
	            if len(news.content) > self.MIN_CONTENT_LENGTH:
	                newsList.append(news)

	        return newsList