Spaces:
Runtime error
Runtime error
File size: 774 Bytes
2700879 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 |
import logging

from newspaper import Article, ArticleException

from CrawDag.models import News
from CrawDag.scraping.Scraper import Scraper
class ScrapeArticle(Scraper):
    """Scraper that fills in news items using ``newspaper.Article``.

    For every item in ``self.listNews`` the page at ``news.link`` is
    downloaded and parsed; the extracted text is stored on ``news.content``
    and the raw page on ``news.html``.
    """

    # An item is kept only when its content is strictly longer than this.
    MIN_CONTENT_LENGTH = 10

    def __init__(self, listNews: list[News]) -> None:
        """Pass the news items to scrape on to the ``Scraper`` base class."""
        super().__init__(listNews)

    def scrape(self) -> list[News]:
        """Download and parse every news link, returning the usable items.

        Each successfully parsed item is mutated in place (``content`` and
        ``html`` are overwritten), including items that are later rejected
        for having too little content. Items whose article cannot be
        downloaded or parsed are logged and skipped, so a single bad URL
        does not abort the whole batch.

        Returns:
            The subset of ``self.listNews`` whose extracted content is
            longer than ``MIN_CONTENT_LENGTH`` characters, in input order.
        """
        log = logging.getLogger(__name__)
        scraped: list[News] = []

        for news in self.listNews:
            article = Article(news.link)
            try:
                article.download()
                article.parse()
            except ArticleException as err:
                # Treat an unreachable or unparsable page like an empty one:
                # drop this item and carry on with the rest.
                log.warning("Skipping %s: %s", news.link, err)
                continue

            # The last line of the extracted text is always discarded.
            # NOTE(review): presumably a trailing byline/credit line on the
            # target sites -- confirm; single-line articles become empty.
            lines = article.text.split('\n')[:-1]
            news.content = '\n'.join(lines).strip()
            news.html = article.html

            # A length check alone also rejects empty content.
            if len(news.content) > self.MIN_CONTENT_LENGTH:
                scraped.append(news)

        return scraped