File size: 1,789 Bytes
2700879
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
from typing import Any

from CrawDag.models import TaskHandle, DataExchange, News

from .Crawler import Crawler
from .RssCrawler import ThanhNienCrawler, VnexpressCrawler
class CrawlingTask(TaskHandle):
    """Task that crawls news articles from configured RSS feeds and pushes
    the results downstream via ``DataExchange``.

    Each entry in ``self.sources`` describes one news site: its name, the
    crawl ``type`` (currently only ``'rss'``), and a mapping of topic name
    to RSS feed URL.
    """

    # NOTE(review): stored on the CLASS (not the instance) by __init__,
    # mirroring the original behavior — presumably read elsewhere via
    # CrawlingTask.task_ids; verify before converting to an instance attr.
    task_ids = None
    # Key under which the crawled articles are pushed through DataExchange.
    key = 'crawl_news'
    # Maximum number of articles forwarded downstream per run.
    MAX_NEWS = 40

    def __init__(self, task_ids: str) -> None:
        """Remember *task_ids* on the class and configure the RSS sources."""
        super().__init__()
        CrawlingTask.task_ids = task_ids
        self.sources = [
            {
                'source': 'vnexpress',
                'type': 'rss',
                'topic': {
                    'economic': 'https://vnexpress.net/rss/kinh-doanh.rss',
                    'health': 'https://vnexpress.net/rss/suc-khoe.rss',
                    'sport': 'https://vnexpress.net/rss/the-thao.rss',
                    'politic': 'https://vnexpress.net/rss/the-gioi.rss'
                },
            },
            {
                'source': 'thanhnien',
                'type': 'rss',
                'topic': {
                    'economic': 'https://thanhnien.vn/rss/kinh-te.rss',
                    'health': 'https://thanhnien.vn/rss/suc-khoe.rss',
                    'sport': 'https://thanhnien.vn/rss/the-thao.rss',
                    'politic': 'https://thanhnien.vn/rss/chinh-tri.rss'
                },
            }
        ]

    def execute(self, **context: Any) -> None:
        """Crawl every configured source and push up to ``MAX_NEWS`` articles.

        Expects ``context['ti']`` (the task instance) so results can be
        exchanged with downstream tasks under ``CrawlingTask.key``.
        """
        # Dispatch table instead of an if/elif chain: the original left
        # `crawler` unbound (NameError) for any unrecognized source name.
        crawler_factories = {
            'vnexpress': VnexpressCrawler,
            'thanhnien': ThanhNienCrawler,
        }
        news: list[News] = []
        for source in self.sources:
            factory = crawler_factories.get(source['source'])
            if factory is None:
                # Unknown source: skip it rather than crash the whole task.
                continue
            crawler: Crawler = factory(source['topic'])
            news.extend(crawler.crawl())
        # Cap the payload size pushed downstream.
        news = news[:CrawlingTask.MAX_NEWS]
        data_exchange = DataExchange(context['ti'])
        data_exchange.push(CrawlingTask.key, [item.to_json() for item in news])