File size: 1,472 Bytes
2700879
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import requests
from datetime import datetime
from airflow import DAG
from airflow.operators.python import PythonOperator
from CrawDag.models import News
from CrawDag.crawling import CrawlingTask
from CrawDag.scraping import ScrapingTask
from CrawDag.saving import SavingTask
from CrawDag.sending import SendingTask
import pytz


# Fixed, timezone-aware anchor for the schedule.
# Airflow creates a run only after start_date plus one schedule interval has
# elapsed, so a start_date computed with datetime.now() on every parse of this
# file keeps moving forward and the scheduler never triggers the DAG.
# pytz zones must be attached with localize(); passing one as tzinfo= to the
# datetime constructor can select the zone's historical local-mean-time offset.
_START_DATE = pytz.timezone('Asia/Ho_Chi_Minh').localize(datetime(2025, 1, 1))

# News pipeline: crawl -> scrape -> save -> send, executed once per month.
with DAG(
    dag_id='CrawDag',
    description='Crawling news from multiple sources',
    schedule_interval='@monthly',
    start_date=_START_DATE,
    # The start date lies in the past; do not backfill every missed month,
    # only schedule the most recent interval.
    catchup=False,
) as dag:
    # NOTE: provide_context is intentionally not passed. With the
    # airflow.operators.python module (Airflow 2) the task context is handed
    # to the callable automatically and the flag only emits a deprecation
    # warning.
    crawl_task = PythonOperator(
        task_id='crawl_task',
        python_callable=CrawlingTask('crawl_task').execute,
    )

    scrape_task = PythonOperator(
        task_id='scrape_task',
        python_callable=ScrapingTask('scrape_task').execute,
    )

    save_task = PythonOperator(
        task_id='save_task',
        python_callable=SavingTask('save_task').execute,
    )

    sent_task = PythonOperator(
        task_id='sent_task',
        python_callable=SendingTask('sent_task').execute,
    )

    # Strictly linear dependency chain: each stage runs only after the
    # previous one has succeeded.
    crawl_task >> scrape_task >> save_task >> sent_task