Spaces:
Runtime error
Runtime error
File size: 1,789 Bytes
2700879 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 |
from CrawDag.models import TaskHandle, DataExchange, News
from .RssCrawler import ThanhNienCrawler, VnexpressCrawler
from .Crawler import Crawler
class CrawlingTask(TaskHandle):
task_ids = None
key = 'crawl_news'
def __init__(self, task_ids: str) -> None:
super().__init__()
CrawlingTask.task_ids = task_ids
self.sources = [
{
'source': 'vnexpress',
'type': 'rss',
'topic': {
'economic': 'https://vnexpress.net/rss/kinh-doanh.rss',
'health': 'https://vnexpress.net/rss/suc-khoe.rss',
'sport': 'https://vnexpress.net/rss/the-thao.rss',
'politic': 'https://vnexpress.net/rss/the-gioi.rss'
},
},
{
'source': 'thanhnien',
'type': 'rss',
'topic': {
'economic': 'https://thanhnien.vn/rss/kinh-te.rss',
'health': 'https://thanhnien.vn/rss/suc-khoe.rss',
'sport': 'https://thanhnien.vn/rss/the-thao.rss',
'politic': 'https://thanhnien.vn/rss/chinh-tri.rss'
},
}
]
def execute(self, **context: any):
news: list[News] = []
for source in self.sources:
if source['source'] == 'vnexpress':
crawler:Crawler = VnexpressCrawler(source['topic'])
elif source['source'] == 'thanhnien':
crawler:Crawler = ThanhNienCrawler(source['topic'])
news.extend(crawler.crawl())
news = news[:40]
dataExchange = DataExchange(context['ti'])
dataExchange.push(CrawlingTask.key, [new.to_json() for new in news])
|