Spaces:
Runtime error
Runtime error
from typing import Any

from CrawDag.models import TaskHandle, DataExchange, News
from .RssCrawler import ThanhNienCrawler, VnexpressCrawler
from .Crawler import Crawler
class CrawlingTask(TaskHandle):
    """Task that crawls the configured news feeds and publishes the results.

    ``execute`` runs one crawler per entry in ``self.sources``, keeps at most
    ``max_news`` items, and pushes their JSON form through ``DataExchange``
    under ``CrawlingTask.key``.
    """

    # Set by ``__init__``. NOTE: stored on the class, not the instance, so the
    # most recently constructed CrawlingTask overwrites the value for all of
    # them. Kept that way because other code may read
    # ``CrawlingTask.task_ids`` directly -- verify before changing.
    task_ids = None

    # Key under which the crawled news list is pushed to DataExchange.
    key = 'crawl_news'

    # Upper bound on the number of news items pushed by ``execute``.
    max_news = 40

    # Maps a source name (``source['source']``) to the crawler class for it.
    _CRAWLERS = {
        'vnexpress': VnexpressCrawler,
        'thanhnien': ThanhNienCrawler,
    }

    def __init__(self, task_ids: str) -> None:
        """Record the task id and set up the list of feeds to crawl.

        Args:
            task_ids: Identifier stored on ``CrawlingTask.task_ids``.
        """
        super().__init__()
        CrawlingTask.task_ids = task_ids
        # Each entry names a source and maps topic -> RSS feed URL.
        self.sources = [
            {
                'source': 'vnexpress',
                'type': 'rss',
                'topic': {
                    'economic': 'https://vnexpress.net/rss/kinh-doanh.rss',
                    'health': 'https://vnexpress.net/rss/suc-khoe.rss',
                    'sport': 'https://vnexpress.net/rss/the-thao.rss',
                    'politic': 'https://vnexpress.net/rss/the-gioi.rss',
                },
            },
            {
                'source': 'thanhnien',
                'type': 'rss',
                'topic': {
                    'economic': 'https://thanhnien.vn/rss/kinh-te.rss',
                    'health': 'https://thanhnien.vn/rss/suc-khoe.rss',
                    'sport': 'https://thanhnien.vn/rss/the-thao.rss',
                    'politic': 'https://thanhnien.vn/rss/chinh-tri.rss',
                },
            },
        ]

    def execute(self, **context: Any):
        """Crawl every configured source and push the collected news.

        Args:
            **context: Must contain the key ``'ti'``, which is handed to
                ``DataExchange`` (presumably the Airflow task instance --
                confirm against the caller).

        Raises:
            ValueError: If a source entry names a source that has no crawler
                registered in ``_CRAWLERS``.
        """
        news: list[News] = []
        for source in self.sources:
            name = source['source']
            crawler_cls = self._CRAWLERS.get(name)
            if crawler_cls is None:
                # Previously an unknown source either raised UnboundLocalError
                # (first iteration) or silently re-ran the previous source's
                # crawler; fail loudly instead.
                raise ValueError(f'No crawler registered for source {name!r}')
            crawler: Crawler = crawler_cls(source['topic'])
            news.extend(crawler.crawl())

        # NOTE: the cap keeps the first items in source order, so earlier
        # sources take precedence over later ones when the total exceeds it.
        news = news[:self.max_news]

        dataExchange = DataExchange(context['ti'])
        dataExchange.push(CrawlingTask.key, [item.to_json() for item in news])