Spaces:
Runtime error
Runtime error
- .Dockerignore +6 -0
- .gitignore +7 -0
- Dockerfile +26 -0
- README.md +1 -0
- config/airflow.cfg +2767 -0
- dags/CrawDag.py +49 -0
- dags/CrawDag/__init__.py +0 -0
- dags/CrawDag/crawling/Crawler.py +9 -0
- dags/CrawDag/crawling/CrawlingTask.py +46 -0
- dags/CrawDag/crawling/RssCrawler/ThanhNienCrawler.py +35 -0
- dags/CrawDag/crawling/RssCrawler/VnexpressCrawler.py +35 -0
- dags/CrawDag/crawling/RssCrawler/__init__.py +2 -0
- dags/CrawDag/crawling/__init__.py +1 -0
- dags/CrawDag/models/DataExchange.py +11 -0
- dags/CrawDag/models/TaskHandle.py +5 -0
- dags/CrawDag/models/__init__.py +3 -0
- dags/CrawDag/models/news.py +42 -0
- dags/CrawDag/saving/DataLake.py +10 -0
- dags/CrawDag/saving/SavingMethod/MongoDataLake.py +45 -0
- dags/CrawDag/saving/SavingMethod/__init__.py +1 -0
- dags/CrawDag/saving/SavingTask.py +20 -0
- dags/CrawDag/saving/__init__.py +1 -0
- dags/CrawDag/scraping/ScrapeMethod/ScrapeArticle.py +21 -0
- dags/CrawDag/scraping/ScrapeMethod/ScrapeBasic.py +23 -0
- dags/CrawDag/scraping/ScrapeMethod/__init__.py +2 -0
- dags/CrawDag/scraping/Scraper.py +10 -0
- dags/CrawDag/scraping/ScrapingTask.py +20 -0
- dags/CrawDag/scraping/__init__.py +2 -0
- dags/CrawDag/scraping/basic_scraper.py +25 -0
- dags/CrawDag/sending/SendingTask.py +44 -0
- dags/CrawDag/sending/__init__.py +1 -0
- entrypoint.sh +24 -0
- requirements.txt +0 -0
.Dockerignore
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Ignore Python cache files in all directories
|
2 |
+
**/__pycache__/
|
3 |
+
*.pyc
|
4 |
+
|
5 |
+
# Ignore virtual environment directory
|
6 |
+
venv/
|
.gitignore
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
.env.prod
|
2 |
+
# Ignore Python cache files in all directories
|
3 |
+
**/__pycache__/
|
4 |
+
*.pyc
|
5 |
+
|
6 |
+
# Ignore virtual environment directory
|
7 |
+
venv/
|
Dockerfile
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Use the official Apache Airflow image as the base
|
2 |
+
FROM apache/airflow:2.2.2-python3.9
|
3 |
+
|
4 |
+
# Set the working directory
|
5 |
+
WORKDIR /opt/airflow
|
6 |
+
|
7 |
+
# Copy necessary files into the container
|
8 |
+
COPY requirements.txt .
|
9 |
+
COPY entrypoint.sh /entrypoint.sh
|
10 |
+
COPY dags/ dags/
|
11 |
+
COPY config/ config/
|
12 |
+
|
13 |
+
# Install Python dependencies
|
14 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
15 |
+
|
16 |
+
# Set environment variables (if not using Render's built-in environment management)
|
17 |
+
ENV AIRFLOW_HOME=/opt/airflow
|
18 |
+
ENV AIRFLOW__CORE__LOAD_EXAMPLES=False
|
19 |
+
ENV AIRFLOW__CORE__DAGS_FOLDER=/opt/airflow/dags
|
20 |
+
|
21 |
+
|
22 |
+
# Expose Airflow's webserver port
|
23 |
+
EXPOSE 8080
|
24 |
+
|
25 |
+
# Run the entrypoint script when the container starts
|
26 |
+
ENTRYPOINT ["/entrypoint.sh"]
|
README.md
CHANGED
@@ -5,6 +5,7 @@ colorFrom: pink
|
|
5 |
colorTo: yellow
|
6 |
sdk: docker
|
7 |
pinned: false
|
|
|
8 |
---
|
9 |
|
10 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
5 |
colorTo: yellow
|
6 |
sdk: docker
|
7 |
pinned: false
|
8 |
+
app_port: 8080
|
9 |
---
|
10 |
|
11 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
config/airflow.cfg
ADDED
@@ -0,0 +1,2767 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[core]
|
2 |
+
# The folder where your airflow pipelines live, most likely a
|
3 |
+
# subfolder in a code repository. This path must be absolute.
|
4 |
+
#
|
5 |
+
# Variable: AIRFLOW__CORE__DAGS_FOLDER
|
6 |
+
#
|
7 |
+
dags_folder = /opt/airflow/dags
|
8 |
+
|
9 |
+
# Hostname by providing a path to a callable, which will resolve the hostname.
|
10 |
+
# The format is "package.function".
|
11 |
+
#
|
12 |
+
# For example, default value ``airflow.utils.net.getfqdn`` means that result from patched
|
13 |
+
# version of `socket.getfqdn() <https://docs.python.org/3/library/socket.html#socket.getfqdn>`__,
|
14 |
+
# see related `CPython Issue <https://github.com/python/cpython/issues/49254>`__.
|
15 |
+
#
|
16 |
+
# No argument should be required in the function specified.
|
17 |
+
# If using IP address as hostname is preferred, use value ``airflow.utils.net.get_host_ip_address``
|
18 |
+
#
|
19 |
+
# Variable: AIRFLOW__CORE__HOSTNAME_CALLABLE
|
20 |
+
#
|
21 |
+
hostname_callable = airflow.utils.net.getfqdn
|
22 |
+
|
23 |
+
# A callable to check if a python file has airflow dags defined or not and should
|
24 |
+
# return ``True`` if it has dags otherwise ``False``.
|
25 |
+
# If this is not provided, Airflow uses its own heuristic rules.
|
26 |
+
#
|
27 |
+
# The function should have the following signature
|
28 |
+
#
|
29 |
+
# .. code-block:: python
|
30 |
+
#
|
31 |
+
# def func_name(file_path: str, zip_file: zipfile.ZipFile | None = None) -> bool: ...
|
32 |
+
#
|
33 |
+
# Variable: AIRFLOW__CORE__MIGHT_CONTAIN_DAG_CALLABLE
|
34 |
+
#
|
35 |
+
might_contain_dag_callable = airflow.utils.file.might_contain_dag_via_default_heuristic
|
36 |
+
|
37 |
+
# Default timezone in case supplied date times are naive
|
38 |
+
# can be `UTC` (default), `system`, or any `IANA <https://www.iana.org/time-zones>`
|
39 |
+
# timezone string (e.g. Europe/Amsterdam)
|
40 |
+
#
|
41 |
+
# Variable: AIRFLOW__CORE__DEFAULT_TIMEZONE
|
42 |
+
#
|
43 |
+
default_timezone = Asia/Ho_Chi_Minh
|
44 |
+
|
45 |
+
# The executor class that airflow should use. Choices include
|
46 |
+
# ``SequentialExecutor``, ``LocalExecutor``, ``CeleryExecutor``,
|
47 |
+
# ``KubernetesExecutor``, ``CeleryKubernetesExecutor``, ``LocalKubernetesExecutor`` or the
|
48 |
+
# full import path to the class when using a custom executor.
|
49 |
+
#
|
50 |
+
# Variable: AIRFLOW__CORE__EXECUTOR
|
51 |
+
#
|
52 |
+
executor = ${AIRFLOW__CORE__EXECUTOR}
|
53 |
+
|
54 |
+
# The auth manager class that airflow should use. Full import path to the auth manager class.
|
55 |
+
#
|
56 |
+
# Variable: AIRFLOW__CORE__AUTH_MANAGER
|
57 |
+
#
|
58 |
+
auth_manager = airflow.providers.fab.auth_manager.fab_auth_manager.FabAuthManager
|
59 |
+
|
60 |
+
# This defines the maximum number of task instances that can run concurrently per scheduler in
|
61 |
+
# Airflow, regardless of the worker count. Generally this value, multiplied by the number of
|
62 |
+
# schedulers in your cluster, is the maximum number of task instances with the running
|
63 |
+
# state in the metadata database. Setting this value to zero allows unlimited parallelism.
|
64 |
+
#
|
65 |
+
# Variable: AIRFLOW__CORE__PARALLELISM
|
66 |
+
#
|
67 |
+
parallelism = 32
|
68 |
+
|
69 |
+
# The maximum number of task instances allowed to run concurrently in each DAG. To calculate
|
70 |
+
# the number of tasks that is running concurrently for a DAG, add up the number of running
|
71 |
+
# tasks for all DAG runs of the DAG. This is configurable at the DAG level with ``max_active_tasks``,
|
72 |
+
# which is defaulted as ``[core] max_active_tasks_per_dag``.
|
73 |
+
#
|
74 |
+
# An example scenario when this would be useful is when you want to stop a new dag with an early
|
75 |
+
# start date from stealing all the executor slots in a cluster.
|
76 |
+
#
|
77 |
+
# Variable: AIRFLOW__CORE__MAX_ACTIVE_TASKS_PER_DAG
|
78 |
+
#
|
79 |
+
max_active_tasks_per_dag = 16
|
80 |
+
|
81 |
+
# Are DAGs paused by default at creation
|
82 |
+
#
|
83 |
+
# Variable: AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION
|
84 |
+
#
|
85 |
+
dags_are_paused_at_creation = True
|
86 |
+
|
87 |
+
# The maximum number of active DAG runs per DAG. The scheduler will not create more DAG runs
|
88 |
+
# if it reaches the limit. This is configurable at the DAG level with ``max_active_runs``,
|
89 |
+
# which is defaulted as ``[core] max_active_runs_per_dag``.
|
90 |
+
#
|
91 |
+
# Variable: AIRFLOW__CORE__MAX_ACTIVE_RUNS_PER_DAG
|
92 |
+
#
|
93 |
+
max_active_runs_per_dag = 16
|
94 |
+
|
95 |
+
# (experimental) The maximum number of consecutive DAG failures before DAG is automatically paused.
|
96 |
+
# This is also configurable per DAG level with ``max_consecutive_failed_dag_runs``,
|
97 |
+
# which is defaulted as ``[core] max_consecutive_failed_dag_runs_per_dag``.
|
98 |
+
# If not specified, then the value is considered as 0,
|
99 |
+
# meaning that the dags are never paused out by default.
|
100 |
+
#
|
101 |
+
# Variable: AIRFLOW__CORE__MAX_CONSECUTIVE_FAILED_DAG_RUNS_PER_DAG
|
102 |
+
#
|
103 |
+
max_consecutive_failed_dag_runs_per_dag = 0
|
104 |
+
|
105 |
+
# The name of the method used in order to start Python processes via the multiprocessing module.
|
106 |
+
# This corresponds directly with the options available in the Python docs:
|
107 |
+
# `multiprocessing.set_start_method
|
108 |
+
# <https://docs.python.org/3/library/multiprocessing.html#multiprocessing.set_start_method>`__
|
109 |
+
# must be one of the values returned by `multiprocessing.get_all_start_methods()
|
110 |
+
# <https://docs.python.org/3/library/multiprocessing.html#multiprocessing.get_all_start_methods>`__.
|
111 |
+
#
|
112 |
+
# Example: mp_start_method = fork
|
113 |
+
#
|
114 |
+
# Variable: AIRFLOW__CORE__MP_START_METHOD
|
115 |
+
#
|
116 |
+
# mp_start_method =
|
117 |
+
|
118 |
+
# Whether to load the DAG examples that ship with Airflow. It's good to
|
119 |
+
# get started, but you probably want to set this to ``False`` in a production
|
120 |
+
# environment
|
121 |
+
#
|
122 |
+
# Variable: AIRFLOW__CORE__LOAD_EXAMPLES
|
123 |
+
#
|
124 |
+
load_examples = True
|
125 |
+
|
126 |
+
# Path to the folder containing Airflow plugins
|
127 |
+
#
|
128 |
+
# Variable: AIRFLOW__CORE__PLUGINS_FOLDER
|
129 |
+
#
|
130 |
+
plugins_folder = /opt/airflow/plugins
|
131 |
+
|
132 |
+
# Should tasks be executed via forking of the parent process
|
133 |
+
#
|
134 |
+
# * ``False``: Execute via forking of the parent process
|
135 |
+
# * ``True``: Spawning a new python process, slower than fork, but means plugin changes picked
|
136 |
+
# up by tasks straight away
|
137 |
+
#
|
138 |
+
# Variable: AIRFLOW__CORE__EXECUTE_TASKS_NEW_PYTHON_INTERPRETER
|
139 |
+
#
|
140 |
+
execute_tasks_new_python_interpreter = False
|
141 |
+
|
142 |
+
# Secret key to save connection passwords in the db
|
143 |
+
#
|
144 |
+
# Variable: AIRFLOW__CORE__FERNET_KEY
|
145 |
+
#
|
146 |
+
fernet_key =
|
147 |
+
|
148 |
+
# Whether to disable pickling dags
|
149 |
+
#
|
150 |
+
# Variable: AIRFLOW__CORE__DONOT_PICKLE
|
151 |
+
#
|
152 |
+
donot_pickle = True
|
153 |
+
|
154 |
+
# How long before timing out a python file import
|
155 |
+
#
|
156 |
+
# Variable: AIRFLOW__CORE__DAGBAG_IMPORT_TIMEOUT
|
157 |
+
#
|
158 |
+
dagbag_import_timeout = 30.0
|
159 |
+
|
160 |
+
# Should a traceback be shown in the UI for dagbag import errors,
|
161 |
+
# instead of just the exception message
|
162 |
+
#
|
163 |
+
# Variable: AIRFLOW__CORE__DAGBAG_IMPORT_ERROR_TRACEBACKS
|
164 |
+
#
|
165 |
+
dagbag_import_error_tracebacks = True
|
166 |
+
|
167 |
+
# If tracebacks are shown, how many entries from the traceback should be shown
|
168 |
+
#
|
169 |
+
# Variable: AIRFLOW__CORE__DAGBAG_IMPORT_ERROR_TRACEBACK_DEPTH
|
170 |
+
#
|
171 |
+
dagbag_import_error_traceback_depth = 2
|
172 |
+
|
173 |
+
# How long before timing out a DagFileProcessor, which processes a dag file
|
174 |
+
#
|
175 |
+
# Variable: AIRFLOW__CORE__DAG_FILE_PROCESSOR_TIMEOUT
|
176 |
+
#
|
177 |
+
dag_file_processor_timeout = 50
|
178 |
+
|
179 |
+
# The class to use for running task instances in a subprocess.
|
180 |
+
# Choices include StandardTaskRunner, CgroupTaskRunner or the full import path to the class
|
181 |
+
# when using a custom task runner.
|
182 |
+
#
|
183 |
+
# Variable: AIRFLOW__CORE__TASK_RUNNER
|
184 |
+
#
|
185 |
+
task_runner = StandardTaskRunner
|
186 |
+
|
187 |
+
# If set, tasks without a ``run_as_user`` argument will be run with this user
|
188 |
+
# Can be used to de-elevate a sudo user running Airflow when executing tasks
|
189 |
+
#
|
190 |
+
# Variable: AIRFLOW__CORE__DEFAULT_IMPERSONATION
|
191 |
+
#
|
192 |
+
default_impersonation =
|
193 |
+
|
194 |
+
# What security module to use (for example kerberos)
|
195 |
+
#
|
196 |
+
# Variable: AIRFLOW__CORE__SECURITY
|
197 |
+
#
|
198 |
+
security =
|
199 |
+
|
200 |
+
# Turn unit test mode on (overwrites many configuration options with test
|
201 |
+
# values at runtime)
|
202 |
+
#
|
203 |
+
# Variable: AIRFLOW__CORE__UNIT_TEST_MODE
|
204 |
+
#
|
205 |
+
unit_test_mode = False
|
206 |
+
|
207 |
+
# Whether to enable pickling for xcom (note that this is insecure and allows for
|
208 |
+
# RCE exploits).
|
209 |
+
#
|
210 |
+
# Variable: AIRFLOW__CORE__ENABLE_XCOM_PICKLING
|
211 |
+
#
|
212 |
+
enable_xcom_pickling = False
|
213 |
+
|
214 |
+
# What classes can be imported during deserialization. This is a multi line value.
|
215 |
+
# The individual items will be parsed as a pattern to a glob function.
|
216 |
+
# Python built-in classes (like dict) are always allowed.
|
217 |
+
#
|
218 |
+
# Variable: AIRFLOW__CORE__ALLOWED_DESERIALIZATION_CLASSES
|
219 |
+
#
|
220 |
+
allowed_deserialization_classes = airflow.*
|
221 |
+
|
222 |
+
# What classes can be imported during deserialization. This is a multi line value.
|
223 |
+
# The individual items will be parsed as regexp patterns.
|
224 |
+
# This is a secondary option to ``[core] allowed_deserialization_classes``.
|
225 |
+
#
|
226 |
+
# Variable: AIRFLOW__CORE__ALLOWED_DESERIALIZATION_CLASSES_REGEXP
|
227 |
+
#
|
228 |
+
allowed_deserialization_classes_regexp =
|
229 |
+
|
230 |
+
# When a task is killed forcefully, this is the amount of time in seconds that
|
231 |
+
# it has to cleanup after it is sent a SIGTERM, before it is SIGKILLED
|
232 |
+
#
|
233 |
+
# Variable: AIRFLOW__CORE__KILLED_TASK_CLEANUP_TIME
|
234 |
+
#
|
235 |
+
killed_task_cleanup_time = 60
|
236 |
+
|
237 |
+
# Whether to override params with dag_run.conf. If you pass some key-value pairs
|
238 |
+
# through ``airflow dags backfill -c`` or
|
239 |
+
# ``airflow dags trigger -c``, the key-value pairs will override the existing ones in params.
|
240 |
+
#
|
241 |
+
# Variable: AIRFLOW__CORE__DAG_RUN_CONF_OVERRIDES_PARAMS
|
242 |
+
#
|
243 |
+
dag_run_conf_overrides_params = True
|
244 |
+
|
245 |
+
# If enabled, Airflow will only scan files containing both ``DAG`` and ``airflow`` (case-insensitive).
|
246 |
+
#
|
247 |
+
# Variable: AIRFLOW__CORE__DAG_DISCOVERY_SAFE_MODE
|
248 |
+
#
|
249 |
+
dag_discovery_safe_mode = True
|
250 |
+
|
251 |
+
# The pattern syntax used in the
|
252 |
+
# `.airflowignore
|
253 |
+
# <https://airflow.apache.org/docs/apache-airflow/stable/core-concepts/dags.html#airflowignore>`__
|
254 |
+
# files in the DAG directories. Valid values are ``regexp`` or ``glob``.
|
255 |
+
#
|
256 |
+
# Variable: AIRFLOW__CORE__DAG_IGNORE_FILE_SYNTAX
|
257 |
+
#
|
258 |
+
dag_ignore_file_syntax = regexp
|
259 |
+
|
260 |
+
# The number of retries each task is going to have by default. Can be overridden at dag or task level.
|
261 |
+
#
|
262 |
+
# Variable: AIRFLOW__CORE__DEFAULT_TASK_RETRIES
|
263 |
+
#
|
264 |
+
default_task_retries = 0
|
265 |
+
|
266 |
+
# The number of seconds each task is going to wait by default between retries. Can be overridden at
|
267 |
+
# dag or task level.
|
268 |
+
#
|
269 |
+
# Variable: AIRFLOW__CORE__DEFAULT_TASK_RETRY_DELAY
|
270 |
+
#
|
271 |
+
default_task_retry_delay = 300
|
272 |
+
|
273 |
+
# The maximum delay (in seconds) each task is going to wait by default between retries.
|
274 |
+
# This is a global setting and cannot be overridden at task or DAG level.
|
275 |
+
#
|
276 |
+
# Variable: AIRFLOW__CORE__MAX_TASK_RETRY_DELAY
|
277 |
+
#
|
278 |
+
max_task_retry_delay = 86400
|
279 |
+
|
280 |
+
# The weighting method used for the effective total priority weight of the task
|
281 |
+
#
|
282 |
+
# Variable: AIRFLOW__CORE__DEFAULT_TASK_WEIGHT_RULE
|
283 |
+
#
|
284 |
+
default_task_weight_rule = downstream
|
285 |
+
|
286 |
+
# Maximum possible time (in seconds) that task will have for execution of auxiliary processes
|
287 |
+
# (like listeners, mini scheduler...) after task is marked as success..
|
288 |
+
#
|
289 |
+
# Variable: AIRFLOW__CORE__TASK_SUCCESS_OVERTIME
|
290 |
+
#
|
291 |
+
task_success_overtime = 20
|
292 |
+
|
293 |
+
# The default task execution_timeout value for the operators. Expected an integer value to
|
294 |
+
# be passed into timedelta as seconds. If not specified, then the value is considered as None,
|
295 |
+
# meaning that the operators are never timed out by default.
|
296 |
+
#
|
297 |
+
# Variable: AIRFLOW__CORE__DEFAULT_TASK_EXECUTION_TIMEOUT
|
298 |
+
#
|
299 |
+
default_task_execution_timeout =
|
300 |
+
|
301 |
+
# Updating serialized DAG can not be faster than a minimum interval to reduce database write rate.
|
302 |
+
#
|
303 |
+
# Variable: AIRFLOW__CORE__MIN_SERIALIZED_DAG_UPDATE_INTERVAL
|
304 |
+
#
|
305 |
+
min_serialized_dag_update_interval = 30
|
306 |
+
|
307 |
+
# If ``True``, serialized DAGs are compressed before writing to DB.
|
308 |
+
#
|
309 |
+
# .. note::
|
310 |
+
#
|
311 |
+
# This will disable the DAG dependencies view
|
312 |
+
#
|
313 |
+
# Variable: AIRFLOW__CORE__COMPRESS_SERIALIZED_DAGS
|
314 |
+
#
|
315 |
+
compress_serialized_dags = False
|
316 |
+
|
317 |
+
# Fetching serialized DAG can not be faster than a minimum interval to reduce database
|
318 |
+
# read rate. This config controls when your DAGs are updated in the Webserver
|
319 |
+
#
|
320 |
+
# Variable: AIRFLOW__CORE__MIN_SERIALIZED_DAG_FETCH_INTERVAL
|
321 |
+
#
|
322 |
+
min_serialized_dag_fetch_interval = 10
|
323 |
+
|
324 |
+
# Maximum number of Rendered Task Instance Fields (Template Fields) per task to store
|
325 |
+
# in the Database.
|
326 |
+
# All the template_fields for each of Task Instance are stored in the Database.
|
327 |
+
# Keeping this number small may cause an error when you try to view ``Rendered`` tab in
|
328 |
+
# TaskInstance view for older tasks.
|
329 |
+
#
|
330 |
+
# Variable: AIRFLOW__CORE__MAX_NUM_RENDERED_TI_FIELDS_PER_TASK
|
331 |
+
#
|
332 |
+
max_num_rendered_ti_fields_per_task = 30
|
333 |
+
|
334 |
+
# On each dagrun check against defined SLAs
|
335 |
+
#
|
336 |
+
# Variable: AIRFLOW__CORE__CHECK_SLAS
|
337 |
+
#
|
338 |
+
check_slas = True
|
339 |
+
|
340 |
+
# Path to custom XCom class that will be used to store and resolve operators results
|
341 |
+
#
|
342 |
+
# Example: xcom_backend = path.to.CustomXCom
|
343 |
+
#
|
344 |
+
# Variable: AIRFLOW__CORE__XCOM_BACKEND
|
345 |
+
#
|
346 |
+
xcom_backend = airflow.models.xcom.BaseXCom
|
347 |
+
|
348 |
+
# By default Airflow plugins are lazily-loaded (only loaded when required). Set it to ``False``,
|
349 |
+
# if you want to load plugins whenever 'airflow' is invoked via cli or loaded from module.
|
350 |
+
#
|
351 |
+
# Variable: AIRFLOW__CORE__LAZY_LOAD_PLUGINS
|
352 |
+
#
|
353 |
+
lazy_load_plugins = True
|
354 |
+
|
355 |
+
# By default Airflow providers are lazily-discovered (discovery and imports happen only when required).
|
356 |
+
# Set it to ``False``, if you want to discover providers whenever 'airflow' is invoked via cli or
|
357 |
+
# loaded from module.
|
358 |
+
#
|
359 |
+
# Variable: AIRFLOW__CORE__LAZY_DISCOVER_PROVIDERS
|
360 |
+
#
|
361 |
+
lazy_discover_providers = True
|
362 |
+
|
363 |
+
# Hide sensitive **Variables** or **Connection extra json keys** from UI
|
364 |
+
# and task logs when set to ``True``
|
365 |
+
#
|
366 |
+
# .. note::
|
367 |
+
#
|
368 |
+
# Connection passwords are always hidden in logs
|
369 |
+
#
|
370 |
+
# Variable: AIRFLOW__CORE__HIDE_SENSITIVE_VAR_CONN_FIELDS
|
371 |
+
#
|
372 |
+
hide_sensitive_var_conn_fields = True
|
373 |
+
|
374 |
+
# A comma-separated list of extra sensitive keywords to look for in variables names or connection's
|
375 |
+
# extra JSON.
|
376 |
+
#
|
377 |
+
# Variable: AIRFLOW__CORE__SENSITIVE_VAR_CONN_NAMES
|
378 |
+
#
|
379 |
+
sensitive_var_conn_names =
|
380 |
+
|
381 |
+
# Task Slot counts for ``default_pool``. This setting would not have any effect in an existing
|
382 |
+
# deployment where the ``default_pool`` is already created. For existing deployments, users can
|
383 |
+
# change the number of slots using Webserver, API or the CLI
|
384 |
+
#
|
385 |
+
# Variable: AIRFLOW__CORE__DEFAULT_POOL_TASK_SLOT_COUNT
|
386 |
+
#
|
387 |
+
default_pool_task_slot_count = 128
|
388 |
+
|
389 |
+
# The maximum list/dict length an XCom can push to trigger task mapping. If the pushed list/dict has a
|
390 |
+
# length exceeding this value, the task pushing the XCom will be failed automatically to prevent the
|
391 |
+
# mapped tasks from clogging the scheduler.
|
392 |
+
#
|
393 |
+
# Variable: AIRFLOW__CORE__MAX_MAP_LENGTH
|
394 |
+
#
|
395 |
+
max_map_length = 1024
|
396 |
+
|
397 |
+
# The default umask to use for process when run in daemon mode (scheduler, worker, etc.)
|
398 |
+
#
|
399 |
+
# This controls the file-creation mode mask which determines the initial value of file permission bits
|
400 |
+
# for newly created files.
|
401 |
+
#
|
402 |
+
# This value is treated as an octal-integer.
|
403 |
+
#
|
404 |
+
# Variable: AIRFLOW__CORE__DAEMON_UMASK
|
405 |
+
#
|
406 |
+
daemon_umask = 0o077
|
407 |
+
|
408 |
+
# Class to use as dataset manager.
|
409 |
+
#
|
410 |
+
# Example: dataset_manager_class = airflow.datasets.manager.DatasetManager
|
411 |
+
#
|
412 |
+
# Variable: AIRFLOW__CORE__DATASET_MANAGER_CLASS
|
413 |
+
#
|
414 |
+
# dataset_manager_class =
|
415 |
+
|
416 |
+
# Kwargs to supply to dataset manager.
|
417 |
+
#
|
418 |
+
# Example: dataset_manager_kwargs = {"some_param": "some_value"}
|
419 |
+
#
|
420 |
+
# Variable: AIRFLOW__CORE__DATASET_MANAGER_KWARGS
|
421 |
+
#
|
422 |
+
# dataset_manager_kwargs =
|
423 |
+
|
424 |
+
# Dataset URI validation should raise an exception if it is not compliant with AIP-60.
|
425 |
+
# By default this configuration is false, meaning that Airflow 2.x only warns the user.
|
426 |
+
# In Airflow 3, this configuration will be enabled by default.
|
427 |
+
#
|
428 |
+
# Variable: AIRFLOW__CORE__STRICT_DATASET_URI_VALIDATION
|
429 |
+
#
|
430 |
+
strict_dataset_uri_validation = False
|
431 |
+
|
432 |
+
# (experimental) Whether components should use Airflow Internal API for DB connectivity.
|
433 |
+
#
|
434 |
+
# Variable: AIRFLOW__CORE__DATABASE_ACCESS_ISOLATION
|
435 |
+
#
|
436 |
+
database_access_isolation = False
|
437 |
+
|
438 |
+
# (experimental) Airflow Internal API url.
|
439 |
+
# Only used if ``[core] database_access_isolation`` is ``True``.
|
440 |
+
#
|
441 |
+
# Example: internal_api_url = http://localhost:8080
|
442 |
+
#
|
443 |
+
# Variable: AIRFLOW__CORE__INTERNAL_API_URL
|
444 |
+
#
|
445 |
+
# internal_api_url =
|
446 |
+
|
447 |
+
# Secret key used to authenticate internal API clients to core. It should be as random as possible.
|
448 |
+
# However, when running more than 1 instances of webserver / internal API services, make sure all
|
449 |
+
# of them use the same ``secret_key`` otherwise calls will fail on authentication.
|
450 |
+
# The authentication token generated using the secret key has a short expiry time though - make
|
451 |
+
# sure that time on ALL the machines that you run airflow components on is synchronized
|
452 |
+
# (for example using ntpd) otherwise you might get "forbidden" errors when the logs are accessed.
|
453 |
+
#
|
454 |
+
# Variable: AIRFLOW__CORE__INTERNAL_API_SECRET_KEY
|
455 |
+
#
|
456 |
+
internal_api_secret_key = pDBrar+Nyodn8Sf5ZeJfVw==
|
457 |
+
|
458 |
+
# The ability to allow testing connections across Airflow UI, API and CLI.
|
459 |
+
# Supported options: ``Disabled``, ``Enabled``, ``Hidden``. Default: Disabled
|
460 |
+
# Disabled - Disables the test connection functionality and disables the Test Connection button in UI.
|
461 |
+
# Enabled - Enables the test connection functionality and shows the Test Connection button in UI.
|
462 |
+
# Hidden - Disables the test connection functionality and hides the Test Connection button in UI.
|
463 |
+
# Before setting this to Enabled, make sure that you review the users who are able to add/edit
|
464 |
+
# connections and ensure they are trusted. Connection testing can be done maliciously leading to
|
465 |
+
# undesired and insecure outcomes.
|
466 |
+
# See `Airflow Security Model: Capabilities of authenticated UI users
|
467 |
+
# <https://airflow.apache.org/docs/apache-airflow/stable/security/security_model.html#capabilities-of-authenticated-ui-users>`__
|
468 |
+
# for more details.
|
469 |
+
#
|
470 |
+
# Variable: AIRFLOW__CORE__TEST_CONNECTION
|
471 |
+
#
|
472 |
+
test_connection = Disabled
|
473 |
+
|
474 |
+
# The maximum length of the rendered template field. If the value to be stored in the
|
475 |
+
# rendered template field exceeds this size, it's redacted.
|
476 |
+
#
|
477 |
+
# Variable: AIRFLOW__CORE__MAX_TEMPLATED_FIELD_LENGTH
|
478 |
+
#
|
479 |
+
max_templated_field_length = 4096
|
480 |
+
|
481 |
+
[database]
|
482 |
+
# Path to the ``alembic.ini`` file. You can either provide the file path relative
|
483 |
+
# to the Airflow home directory or the absolute path if it is located elsewhere.
|
484 |
+
#
|
485 |
+
# Variable: AIRFLOW__DATABASE__ALEMBIC_INI_FILE_PATH
|
486 |
+
#
|
487 |
+
alembic_ini_file_path = alembic.ini
|
488 |
+
|
489 |
+
# The SQLAlchemy connection string to the metadata database.
|
490 |
+
# SQLAlchemy supports many different database engines.
|
491 |
+
# See: `Set up a Database Backend: Database URI
|
492 |
+
# <https://airflow.apache.org/docs/apache-airflow/stable/howto/set-up-database.html#database-uri>`__
|
493 |
+
# for more details.
|
494 |
+
#
|
495 |
+
# Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_CONN
|
496 |
+
#
|
497 |
+
sql_alchemy_conn = ${AIRFLOW__DATABASE__SQL_ALCHEMY_CONN}
|
498 |
+
|
499 |
+
# Extra engine specific keyword args passed to SQLAlchemy's create_engine, as a JSON-encoded value
|
500 |
+
#
|
501 |
+
# Example: sql_alchemy_engine_args = {"arg1": true}
|
502 |
+
#
|
503 |
+
# Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_ENGINE_ARGS
|
504 |
+
#
|
505 |
+
# sql_alchemy_engine_args =
|
506 |
+
|
507 |
+
# The encoding for the databases
|
508 |
+
#
|
509 |
+
# Variable: AIRFLOW__DATABASE__SQL_ENGINE_ENCODING
|
510 |
+
#
|
511 |
+
sql_engine_encoding = utf-8
|
512 |
+
|
513 |
+
# Collation for ``dag_id``, ``task_id``, ``key``, ``external_executor_id`` columns
|
514 |
+
# in case they have different encoding.
|
515 |
+
# By default this collation is the same as the database collation, however for ``mysql`` and ``mariadb``
|
516 |
+
# the default is ``utf8mb3_bin`` so that the index sizes of our index keys will not exceed
|
517 |
+
# the maximum size of allowed index when collation is set to ``utf8mb4`` variant, see
|
518 |
+
# `GitHub Issue Comment <https://github.com/apache/airflow/pull/17603#issuecomment-901121618>`__
|
519 |
+
# for more details.
|
520 |
+
#
|
521 |
+
# Variable: AIRFLOW__DATABASE__SQL_ENGINE_COLLATION_FOR_IDS
|
522 |
+
#
|
523 |
+
# sql_engine_collation_for_ids =
|
524 |
+
|
525 |
+
# If SQLAlchemy should pool database connections.
|
526 |
+
#
|
527 |
+
# Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_POOL_ENABLED
|
528 |
+
#
|
529 |
+
sql_alchemy_pool_enabled = True
|
530 |
+
|
531 |
+
# The SQLAlchemy pool size is the maximum number of database connections
|
532 |
+
# in the pool. 0 indicates no limit.
|
533 |
+
#
|
534 |
+
# Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_POOL_SIZE
|
535 |
+
#
|
536 |
+
sql_alchemy_pool_size = 5
|
537 |
+
|
538 |
+
# The maximum overflow size of the pool.
|
539 |
+
# When the number of checked-out connections reaches the size set in pool_size,
|
540 |
+
# additional connections will be returned up to this limit.
|
541 |
+
# When those additional connections are returned to the pool, they are disconnected and discarded.
|
542 |
+
# It follows then that the total number of simultaneous connections the pool will allow
|
543 |
+
# is **pool_size** + **max_overflow**,
|
544 |
+
# and the total number of "sleeping" connections the pool will allow is pool_size.
|
545 |
+
# max_overflow can be set to ``-1`` to indicate no overflow limit;
|
546 |
+
# no limit will be placed on the total number of concurrent connections. Defaults to ``10``.
|
547 |
+
#
|
548 |
+
# Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_MAX_OVERFLOW
|
549 |
+
#
|
550 |
+
sql_alchemy_max_overflow = 10
|
551 |
+
|
552 |
+
# The SQLAlchemy pool recycle is the number of seconds a connection
|
553 |
+
# can be idle in the pool before it is invalidated. This config does
|
554 |
+
# not apply to sqlite. If the number of DB connections is ever exceeded,
|
555 |
+
# a lower config value will allow the system to recover faster.
|
556 |
+
#
|
557 |
+
# Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_POOL_RECYCLE
|
558 |
+
#
|
559 |
+
sql_alchemy_pool_recycle = 1800
|
560 |
+
|
561 |
+
# Check connection at the start of each connection pool checkout.
|
562 |
+
# Typically, this is a simple statement like "SELECT 1".
|
563 |
+
# See `SQLAlchemy Pooling: Disconnect Handling - Pessimistic
|
564 |
+
# <https://docs.sqlalchemy.org/en/14/core/pooling.html#disconnect-handling-pessimistic>`__
|
565 |
+
# for more details.
|
566 |
+
#
|
567 |
+
# Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_POOL_PRE_PING
|
568 |
+
#
|
569 |
+
sql_alchemy_pool_pre_ping = True
|
570 |
+
|
571 |
+
# The schema to use for the metadata database.
|
572 |
+
# SQLAlchemy supports databases with the concept of multiple schemas.
|
573 |
+
#
|
574 |
+
# Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_SCHEMA
|
575 |
+
#
|
576 |
+
sql_alchemy_schema =
|
577 |
+
|
578 |
+
# Import path for connect args in SQLAlchemy. Defaults to an empty dict.
|
579 |
+
# This is useful when you want to configure db engine args that SQLAlchemy won't parse
|
580 |
+
# in connection string. This can be set by passing a dictionary containing the create engine parameters.
|
581 |
+
# For more details about passing create engine parameters (keepalives variables, timeout etc)
|
582 |
+
# in Postgres DB Backend see `Setting up a PostgreSQL Database
|
583 |
+
# <https://airflow.apache.org/docs/apache-airflow/stable/howto/set-up-database.html#setting-up-a-postgresql-database>`__
|
584 |
+
# e.g ``connect_args={"timeout":30}`` can be defined in ``airflow_local_settings.py`` and
|
585 |
+
# can be imported as shown below
|
586 |
+
#
|
587 |
+
# Example: sql_alchemy_connect_args = airflow_local_settings.connect_args
|
588 |
+
#
|
589 |
+
# Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_CONNECT_ARGS
|
590 |
+
#
|
591 |
+
# sql_alchemy_connect_args =
|
592 |
+
|
593 |
+
# Important Warning: Use of sql_alchemy_session_maker Highly Discouraged
|
594 |
+
# Import path for function which returns 'sqlalchemy.orm.sessionmaker'.
|
595 |
+
# Improper configuration of sql_alchemy_session_maker can lead to serious issues,
|
596 |
+
# including data corruption, unrecoverable application crashes. Please review the SQLAlchemy
|
597 |
+
# documentation for detailed guidance on proper configuration and best practices.
|
598 |
+
#
|
599 |
+
# Example: sql_alchemy_session_maker = airflow_local_settings._sessionmaker
|
600 |
+
#
|
601 |
+
# Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_SESSION_MAKER
|
602 |
+
#
|
603 |
+
# sql_alchemy_session_maker =
|
604 |
+
|
605 |
+
# Whether to load the default connections that ship with Airflow when ``airflow db init`` is called.
|
606 |
+
# It's good to get started, but you probably want to set this to ``False`` in a production environment.
|
607 |
+
#
|
608 |
+
# Variable: AIRFLOW__DATABASE__LOAD_DEFAULT_CONNECTIONS
|
609 |
+
#
|
610 |
+
load_default_connections = True
|
611 |
+
|
612 |
+
# Number of times the code should be retried in case of DB Operational Errors.
|
613 |
+
# Not all transactions will be retried as it can cause undesired state.
|
614 |
+
# Currently it is only used in ``DagFileProcessor.process_file`` to retry ``dagbag.sync_to_db``.
|
615 |
+
#
|
616 |
+
# Variable: AIRFLOW__DATABASE__MAX_DB_RETRIES
|
617 |
+
#
|
618 |
+
max_db_retries = 3
|
619 |
+
|
620 |
+
# Whether to run alembic migrations during Airflow start up. Sometimes this operation can be expensive,
|
621 |
+
# and the users can assert the correct version through other means (e.g. through a Helm chart).
|
622 |
+
# Accepts ``True`` or ``False``.
|
623 |
+
#
|
624 |
+
# Variable: AIRFLOW__DATABASE__CHECK_MIGRATIONS
|
625 |
+
#
|
626 |
+
check_migrations = True
|
627 |
+
|
628 |
+
[logging]
|
629 |
+
# The folder where airflow should store its log files.
|
630 |
+
# This path must be absolute.
|
631 |
+
# There are a few existing configurations that assume this is set to the default.
|
632 |
+
# If you choose to override this you may need to update the
|
633 |
+
# ``[logging] dag_processor_manager_log_location`` and
|
634 |
+
# ``[logging] child_process_log_directory settings`` as well.
|
635 |
+
#
|
636 |
+
# Variable: AIRFLOW__LOGGING__BASE_LOG_FOLDER
|
637 |
+
#
|
638 |
+
base_log_folder = /opt/airflow/logs
|
639 |
+
|
640 |
+
# Airflow can store logs remotely in AWS S3, Google Cloud Storage or Elastic Search.
|
641 |
+
# Set this to ``True`` if you want to enable remote logging.
|
642 |
+
#
|
643 |
+
# Variable: AIRFLOW__LOGGING__REMOTE_LOGGING
|
644 |
+
#
|
645 |
+
remote_logging = False
|
646 |
+
|
647 |
+
# Users must supply an Airflow connection id that provides access to the storage
|
648 |
+
# location. Depending on your remote logging service, this may only be used for
|
649 |
+
# reading logs, not writing them.
|
650 |
+
#
|
651 |
+
# Variable: AIRFLOW__LOGGING__REMOTE_LOG_CONN_ID
|
652 |
+
#
|
653 |
+
remote_log_conn_id =
|
654 |
+
|
655 |
+
# Whether the local log files for GCS, S3, WASB and OSS remote logging should be deleted after
|
656 |
+
# they are uploaded to the remote location.
|
657 |
+
#
|
658 |
+
# Variable: AIRFLOW__LOGGING__DELETE_LOCAL_LOGS
|
659 |
+
#
|
660 |
+
delete_local_logs = False
|
661 |
+
|
662 |
+
# Path to Google Credential JSON file. If omitted, authorization based on `the Application Default
|
663 |
+
# Credentials
|
664 |
+
# <https://cloud.google.com/docs/authentication/application-default-credentials>`__ will
|
665 |
+
# be used.
|
666 |
+
#
|
667 |
+
# Variable: AIRFLOW__LOGGING__GOOGLE_KEY_PATH
|
668 |
+
#
|
669 |
+
google_key_path =
|
670 |
+
|
671 |
+
# Storage bucket URL for remote logging
|
672 |
+
# S3 buckets should start with **s3://**
|
673 |
+
# Cloudwatch log groups should start with **cloudwatch://**
|
674 |
+
# GCS buckets should start with **gs://**
|
675 |
+
# WASB buckets should start with **wasb** just to help Airflow select correct handler
|
676 |
+
# Stackdriver logs should start with **stackdriver://**
|
677 |
+
#
|
678 |
+
# Variable: AIRFLOW__LOGGING__REMOTE_BASE_LOG_FOLDER
|
679 |
+
#
|
680 |
+
remote_base_log_folder =
|
681 |
+
|
682 |
+
# The remote_task_handler_kwargs param is loaded into a dictionary and passed to the ``__init__``
|
683 |
+
# of remote task handler and it overrides the values provided by Airflow config. For example if you set
|
684 |
+
# ``delete_local_logs=False`` and you provide ``{"delete_local_copy": true}``, then the local
|
685 |
+
# log files will be deleted after they are uploaded to remote location.
|
686 |
+
#
|
687 |
+
# Example: remote_task_handler_kwargs = {"delete_local_copy": true}
|
688 |
+
#
|
689 |
+
# Variable: AIRFLOW__LOGGING__REMOTE_TASK_HANDLER_KWARGS
|
690 |
+
#
|
691 |
+
remote_task_handler_kwargs =
|
692 |
+
|
693 |
+
# Use server-side encryption for logs stored in S3
|
694 |
+
#
|
695 |
+
# Variable: AIRFLOW__LOGGING__ENCRYPT_S3_LOGS
|
696 |
+
#
|
697 |
+
encrypt_s3_logs = False
|
698 |
+
|
699 |
+
# Logging level.
|
700 |
+
#
|
701 |
+
# Supported values: ``CRITICAL``, ``ERROR``, ``WARNING``, ``INFO``, ``DEBUG``.
|
702 |
+
#
|
703 |
+
# Variable: AIRFLOW__LOGGING__LOGGING_LEVEL
|
704 |
+
#
|
705 |
+
logging_level = INFO
|
706 |
+
|
707 |
+
# Logging level for celery. If not set, it uses the value of logging_level
|
708 |
+
#
|
709 |
+
# Supported values: ``CRITICAL``, ``ERROR``, ``WARNING``, ``INFO``, ``DEBUG``.
|
710 |
+
#
|
711 |
+
# Variable: AIRFLOW__LOGGING__CELERY_LOGGING_LEVEL
|
712 |
+
#
|
713 |
+
celery_logging_level =
|
714 |
+
|
715 |
+
# Logging level for Flask-appbuilder UI.
|
716 |
+
#
|
717 |
+
# Supported values: ``CRITICAL``, ``ERROR``, ``WARNING``, ``INFO``, ``DEBUG``.
|
718 |
+
#
|
719 |
+
# Variable: AIRFLOW__LOGGING__FAB_LOGGING_LEVEL
|
720 |
+
#
|
721 |
+
fab_logging_level = WARNING
|
722 |
+
|
723 |
+
# Logging class
|
724 |
+
# Specify the class that will specify the logging configuration
|
725 |
+
# This class has to be on the python classpath
|
726 |
+
#
|
727 |
+
# Example: logging_config_class = my.path.default_local_settings.LOGGING_CONFIG
|
728 |
+
#
|
729 |
+
# Variable: AIRFLOW__LOGGING__LOGGING_CONFIG_CLASS
|
730 |
+
#
|
731 |
+
logging_config_class =
|
732 |
+
|
733 |
+
# Flag to enable/disable Colored logs in Console
|
734 |
+
# Colour the logs when the controlling terminal is a TTY.
|
735 |
+
#
|
736 |
+
# Variable: AIRFLOW__LOGGING__COLORED_CONSOLE_LOG
|
737 |
+
#
|
738 |
+
colored_console_log = True
|
739 |
+
|
740 |
+
# Log format for when Colored logs is enabled
|
741 |
+
#
|
742 |
+
# Variable: AIRFLOW__LOGGING__COLORED_LOG_FORMAT
|
743 |
+
#
|
744 |
+
colored_log_format = [%%(blue)s%%(asctime)s%%(reset)s] {%%(blue)s%%(filename)s:%%(reset)s%%(lineno)d} %%(log_color)s%%(levelname)s%%(reset)s - %%(log_color)s%%(message)s%%(reset)s
|
745 |
+
|
746 |
+
# Specifies the class utilized by Airflow to implement colored logging
|
747 |
+
#
|
748 |
+
# Variable: AIRFLOW__LOGGING__COLORED_FORMATTER_CLASS
|
749 |
+
#
|
750 |
+
colored_formatter_class = airflow.utils.log.colored_log.CustomTTYColoredFormatter
|
751 |
+
|
752 |
+
# Format of Log line
|
753 |
+
#
|
754 |
+
# Variable: AIRFLOW__LOGGING__LOG_FORMAT
|
755 |
+
#
|
756 |
+
log_format = [%%(asctime)s] {%%(filename)s:%%(lineno)d} %%(levelname)s - %%(message)s
|
757 |
+
|
758 |
+
# Defines the format of log messages for simple logging configuration
|
759 |
+
#
|
760 |
+
# Variable: AIRFLOW__LOGGING__SIMPLE_LOG_FORMAT
|
761 |
+
#
|
762 |
+
simple_log_format = %%(asctime)s %%(levelname)s - %%(message)s
|
763 |
+
|
764 |
+
# Where to send dag parser logs. If "file", logs are sent to log files defined by child_process_log_directory.
|
765 |
+
#
|
766 |
+
# Variable: AIRFLOW__LOGGING__DAG_PROCESSOR_LOG_TARGET
|
767 |
+
#
|
768 |
+
dag_processor_log_target = file
|
769 |
+
|
770 |
+
# Format of Dag Processor Log line
|
771 |
+
#
|
772 |
+
# Variable: AIRFLOW__LOGGING__DAG_PROCESSOR_LOG_FORMAT
|
773 |
+
#
|
774 |
+
dag_processor_log_format = [%%(asctime)s] [SOURCE:DAG_PROCESSOR] {%%(filename)s:%%(lineno)d} %%(levelname)s - %%(message)s
|
775 |
+
|
776 |
+
# Determines the formatter class used by Airflow for structuring its log messages
|
777 |
+
# The default formatter class is timezone-aware, which means that timestamps attached to log entries
|
778 |
+
# will be adjusted to reflect the local timezone of the Airflow instance
|
779 |
+
#
|
780 |
+
# Variable: AIRFLOW__LOGGING__LOG_FORMATTER_CLASS
|
781 |
+
#
|
782 |
+
log_formatter_class = airflow.utils.log.timezone_aware.TimezoneAware
|
783 |
+
|
784 |
+
# An import path to a function to add adaptations of each secret added with
|
785 |
+
# ``airflow.utils.log.secrets_masker.mask_secret`` to be masked in log messages. The given function
|
786 |
+
# is expected to require a single parameter: the secret to be adapted. It may return a
|
787 |
+
# single adaptation of the secret or an iterable of adaptations to each be masked as secrets.
|
788 |
+
# The original secret will be masked as well as any adaptations returned.
|
789 |
+
#
|
790 |
+
# Example: secret_mask_adapter = urllib.parse.quote
|
791 |
+
#
|
792 |
+
# Variable: AIRFLOW__LOGGING__SECRET_MASK_ADAPTER
|
793 |
+
#
|
794 |
+
secret_mask_adapter =
|
795 |
+
|
796 |
+
# Specify prefix pattern like mentioned below with stream handler ``TaskHandlerWithCustomFormatter``
|
797 |
+
#
|
798 |
+
# Example: task_log_prefix_template = {{ti.dag_id}}-{{ti.task_id}}-{{execution_date}}-{{ti.try_number}}
|
799 |
+
#
|
800 |
+
# Variable: AIRFLOW__LOGGING__TASK_LOG_PREFIX_TEMPLATE
|
801 |
+
#
|
802 |
+
task_log_prefix_template =
|
803 |
+
|
804 |
+
# Formatting for how airflow generates file names/paths for each task run.
|
805 |
+
#
|
806 |
+
# Variable: AIRFLOW__LOGGING__LOG_FILENAME_TEMPLATE
|
807 |
+
#
|
808 |
+
log_filename_template = dag_id={{ ti.dag_id }}/run_id={{ ti.run_id }}/task_id={{ ti.task_id }}/{%% if ti.map_index >= 0 %%}map_index={{ ti.map_index }}/{%% endif %%}attempt={{ try_number }}.log
|
809 |
+
|
810 |
+
# Formatting for how airflow generates file names for log
|
811 |
+
#
|
812 |
+
# Variable: AIRFLOW__LOGGING__LOG_PROCESSOR_FILENAME_TEMPLATE
|
813 |
+
#
|
814 |
+
log_processor_filename_template = {{ filename }}.log
|
815 |
+
|
816 |
+
# Full path of dag_processor_manager logfile.
|
817 |
+
#
|
818 |
+
# Variable: AIRFLOW__LOGGING__DAG_PROCESSOR_MANAGER_LOG_LOCATION
|
819 |
+
#
|
820 |
+
dag_processor_manager_log_location = /opt/airflow/logs/dag_processor_manager/dag_processor_manager.log
|
821 |
+
|
822 |
+
# Whether DAG processor manager will write logs to stdout
|
823 |
+
#
|
824 |
+
# Variable: AIRFLOW__LOGGING__DAG_PROCESSOR_MANAGER_LOG_STDOUT
|
825 |
+
#
|
826 |
+
dag_processor_manager_log_stdout = False
|
827 |
+
|
828 |
+
# Name of handler to read task instance logs.
|
829 |
+
# Defaults to use ``task`` handler.
|
830 |
+
#
|
831 |
+
# Variable: AIRFLOW__LOGGING__TASK_LOG_READER
|
832 |
+
#
|
833 |
+
task_log_reader = task
|
834 |
+
|
835 |
+
# A comma\-separated list of third-party logger names that will be configured to print messages to
|
836 |
+
# consoles\.
|
837 |
+
#
|
838 |
+
# Example: extra_logger_names = connexion,sqlalchemy
|
839 |
+
#
|
840 |
+
# Variable: AIRFLOW__LOGGING__EXTRA_LOGGER_NAMES
|
841 |
+
#
|
842 |
+
extra_logger_names =
|
843 |
+
|
844 |
+
# When you start an Airflow worker, Airflow starts a tiny web server
|
845 |
+
# subprocess to serve the workers local log files to the airflow main
|
846 |
+
# web server, who then builds pages and sends them to users. This defines
|
847 |
+
# the port on which the logs are served. It needs to be unused, and open
|
848 |
+
# visible from the main web server to connect into the workers.
|
849 |
+
#
|
850 |
+
# Variable: AIRFLOW__LOGGING__WORKER_LOG_SERVER_PORT
|
851 |
+
#
|
852 |
+
worker_log_server_port = 8793
|
853 |
+
|
854 |
+
# Port to serve logs from for triggerer.
|
855 |
+
# See ``[logging] worker_log_server_port`` description for more info.
|
856 |
+
#
|
857 |
+
# Variable: AIRFLOW__LOGGING__TRIGGER_LOG_SERVER_PORT
|
858 |
+
#
|
859 |
+
trigger_log_server_port = 8794
|
860 |
+
|
861 |
+
# We must parse timestamps to interleave logs between trigger and task. To do so,
|
862 |
+
# we need to parse timestamps in log files. In case your log format is non-standard,
|
863 |
+
# you may provide import path to callable which takes a string log line and returns
|
864 |
+
# the timestamp (datetime.datetime compatible).
|
865 |
+
#
|
866 |
+
# Example: interleave_timestamp_parser = path.to.my_func
|
867 |
+
#
|
868 |
+
# Variable: AIRFLOW__LOGGING__INTERLEAVE_TIMESTAMP_PARSER
|
869 |
+
#
|
870 |
+
# interleave_timestamp_parser =
|
871 |
+
|
# Permissions in the form or of octal string as understood by chmod. The permissions are important
# when you use impersonation, when logs are written by a different user than airflow. The most secure
# way of configuring it in this case is to add both users to the same group and make it the default
# group of both users. Group-writeable logs are default in airflow, but you might decide that you are
# OK with having the logs other-writeable, in which case you should set it to ``0o777``. You might
# decide to add more security if you do not use impersonation and change it to ``0o755`` to make it
# only owner-writeable. You can also make it just readable only for owner by changing it to ``0o700``
# if all the access (read/write) for your logs happens from the same user.
#
# Example: file_task_handler_new_folder_permissions = 0o775
#
# Variable: AIRFLOW__LOGGING__FILE_TASK_HANDLER_NEW_FOLDER_PERMISSIONS
#
file_task_handler_new_folder_permissions = 0o775

# Permissions in the form or of octal string as understood by chmod. The permissions are important
# when you use impersonation, when logs are written by a different user than airflow. The most secure
# way of configuring it in this case is to add both users to the same group and make it the default
# group of both users. Group-writeable logs are default in airflow, but you might decide that you are
# OK with having the logs other-writeable, in which case you should set it to ``0o666``. You might
# decide to add more security if you do not use impersonation and change it to ``0o644`` to make it
# only owner-writeable. You can also make it just readable only for owner by changing it to ``0o600``
# if all the access (read/write) for your logs happens from the same user.
#
# Example: file_task_handler_new_file_permissions = 0o664
#
# Variable: AIRFLOW__LOGGING__FILE_TASK_HANDLER_NEW_FILE_PERMISSIONS
#
file_task_handler_new_file_permissions = 0o664

# By default Celery sends all logs into stderr.
# If enabled any previous logging handlers will get *removed*.
# With this option AirFlow will create new handlers
# and send low level logs like INFO and WARNING to stdout,
# while sending higher severity logs to stderr.
#
# Variable: AIRFLOW__LOGGING__CELERY_STDOUT_STDERR_SEPARATION
#
celery_stdout_stderr_separation = False

# If enabled, Airflow may ship messages to task logs from outside the task run context, e.g. from
# the scheduler, executor, or callback execution context. This can help in circumstances such as
# when there's something blocking the execution of the task and ordinarily there may be no task
# logs at all.
# This is set to ``True`` by default. If you encounter issues with this feature
# (e.g. scheduler performance issues) it can be disabled.
#
# Variable: AIRFLOW__LOGGING__ENABLE_TASK_CONTEXT_LOGGER
#
enable_task_context_logger = True

# A comma separated list of keywords related to errors whose presence should display the line in red
# color in UI
#
# Variable: AIRFLOW__LOGGING__COLOR_LOG_ERROR_KEYWORDS
#
color_log_error_keywords = error,exception

# A comma separated list of keywords related to warning whose presence should display the line in yellow
# color in UI
#
# Variable: AIRFLOW__LOGGING__COLOR_LOG_WARNING_KEYWORDS
#
color_log_warning_keywords = warn

[metrics]
# `StatsD <https://github.com/statsd/statsd>`__ integration settings.

# If true, ``[metrics] metrics_allow_list`` and ``[metrics] metrics_block_list`` will use
# regex pattern matching anywhere within the metric name instead of only prefix matching
# at the start of the name.
#
# Variable: AIRFLOW__METRICS__METRICS_USE_PATTERN_MATCH
#
metrics_use_pattern_match = False

# Configure an allow list (comma separated string) to send only certain metrics.
# If ``[metrics] metrics_use_pattern_match`` is ``false``, match only the exact metric name prefix.
# If ``[metrics] metrics_use_pattern_match`` is ``true``, provide regex patterns to match.
#
# Example: metrics_allow_list = "scheduler,executor,dagrun,pool,triggerer,celery" or "^scheduler,^executor,heartbeat|timeout"
#
# Variable: AIRFLOW__METRICS__METRICS_ALLOW_LIST
#
metrics_allow_list =

# Configure a block list (comma separated string) to block certain metrics from being emitted.
# If ``[metrics] metrics_allow_list`` and ``[metrics] metrics_block_list`` are both configured,
# ``[metrics] metrics_block_list`` is ignored.
#
# If ``[metrics] metrics_use_pattern_match`` is ``false``, match only the exact metric name prefix.
#
# If ``[metrics] metrics_use_pattern_match`` is ``true``, provide regex patterns to match.
#
# Example: metrics_block_list = "scheduler,executor,dagrun,pool,triggerer,celery" or "^scheduler,^executor,heartbeat|timeout"
#
# Variable: AIRFLOW__METRICS__METRICS_BLOCK_LIST
#
metrics_block_list =

# Enables sending metrics to StatsD.
#
# Variable: AIRFLOW__METRICS__STATSD_ON
#
statsd_on = False

# Specifies the host address where the StatsD daemon (or server) is running
#
# Variable: AIRFLOW__METRICS__STATSD_HOST
#
statsd_host = localhost

# Specifies the port on which the StatsD daemon (or server) is listening to
#
# Variable: AIRFLOW__METRICS__STATSD_PORT
#
statsd_port = 8125

# Defines the namespace for all metrics sent from Airflow to StatsD
#
# Variable: AIRFLOW__METRICS__STATSD_PREFIX
#
statsd_prefix = airflow

# A function that validate the StatsD stat name, apply changes to the stat name if necessary and return
# the transformed stat name.
#
# The function should have the following signature
#
# .. code-block:: python
#
#     def func_name(stat_name: str) -> str: ...
#
# Variable: AIRFLOW__METRICS__STAT_NAME_HANDLER
#
stat_name_handler =

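# For illustration only (not part of the shipped default config): a handler set
# via ``stat_name_handler = path.to.normalize_stat_name`` (a made-up module path
# and function name) would follow the signature documented above.
#
# .. code-block:: python
#
#     def normalize_stat_name(stat_name: str) -> str:
#         # Lower-case the stat name and replace spaces, which StatsD dislikes.
#         return stat_name.lower().replace(" ", "_")
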
# To enable datadog integration to send airflow metrics.
#
# Variable: AIRFLOW__METRICS__STATSD_DATADOG_ENABLED
#
statsd_datadog_enabled = False

# List of datadog tags attached to all metrics(e.g: ``key1:value1,key2:value2``)
#
# Variable: AIRFLOW__METRICS__STATSD_DATADOG_TAGS
#
statsd_datadog_tags =

# Set to ``False`` to disable metadata tags for some of the emitted metrics
#
# Variable: AIRFLOW__METRICS__STATSD_DATADOG_METRICS_TAGS
#
statsd_datadog_metrics_tags = True

# If you want to utilise your own custom StatsD client set the relevant
# module path below.
# Note: The module path must exist on your
# `PYTHONPATH <https://docs.python.org/3/using/cmdline.html#envvar-PYTHONPATH>`
# for Airflow to pick it up
#
# Variable: AIRFLOW__METRICS__STATSD_CUSTOM_CLIENT_PATH
#
# statsd_custom_client_path =

# If you want to avoid sending all the available metrics tags to StatsD,
# you can configure a block list of prefixes (comma separated) to filter out metric tags
# that start with the elements of the list (e.g: ``job_id,run_id``)
#
# Example: statsd_disabled_tags = job_id,run_id,dag_id,task_id
#
# Variable: AIRFLOW__METRICS__STATSD_DISABLED_TAGS
#
statsd_disabled_tags = job_id,run_id

# To enable sending Airflow metrics with StatsD-Influxdb tagging convention.
#
# Variable: AIRFLOW__METRICS__STATSD_INFLUXDB_ENABLED
#
statsd_influxdb_enabled = False

# Enables sending metrics to OpenTelemetry.
#
# Variable: AIRFLOW__METRICS__OTEL_ON
#
otel_on = False

# Specifies the hostname or IP address of the OpenTelemetry Collector to which Airflow sends
# metrics and traces.
#
# Variable: AIRFLOW__METRICS__OTEL_HOST
#
otel_host = localhost

# Specifies the port of the OpenTelemetry Collector that is listening to.
#
# Variable: AIRFLOW__METRICS__OTEL_PORT
#
otel_port = 8889

# The prefix for the Airflow metrics.
#
# Variable: AIRFLOW__METRICS__OTEL_PREFIX
#
otel_prefix = airflow

# Defines the interval, in milliseconds, at which Airflow sends batches of metrics and traces
# to the configured OpenTelemetry Collector.
#
# Variable: AIRFLOW__METRICS__OTEL_INTERVAL_MILLISECONDS
#
otel_interval_milliseconds = 60000

# If ``True``, all metrics are also emitted to the console. Defaults to ``False``.
#
# Variable: AIRFLOW__METRICS__OTEL_DEBUGGING_ON
#
otel_debugging_on = False

# If ``True``, SSL will be enabled. Defaults to ``False``.
# To establish an HTTPS connection to the OpenTelemetry collector,
# you need to configure the SSL certificate and key within the OpenTelemetry collector's
# ``config.yml`` file.
#
# Variable: AIRFLOW__METRICS__OTEL_SSL_ACTIVE
#
otel_ssl_active = False

[traces]
# Distributed traces integration settings.

# Enables sending traces to OpenTelemetry.
#
# Variable: AIRFLOW__TRACES__OTEL_ON
#
otel_on = False

# Specifies the hostname or IP address of the OpenTelemetry Collector to which Airflow sends
# traces.
#
# Variable: AIRFLOW__TRACES__OTEL_HOST
#
otel_host = localhost

# Specifies the port of the OpenTelemetry Collector that is listening to.
#
# Variable: AIRFLOW__TRACES__OTEL_PORT
#
otel_port = 8889

# The default service name of traces.
#
# Variable: AIRFLOW__TRACES__OTEL_SERVICE
#
otel_service = Airflow

# If True, all traces are also emitted to the console. Defaults to False.
#
# Variable: AIRFLOW__TRACES__OTEL_DEBUGGING_ON
#
otel_debugging_on = False

# If True, SSL will be enabled. Defaults to False.
# To establish an HTTPS connection to the OpenTelemetry collector,
# you need to configure the SSL certificate and key within the OpenTelemetry collector's
# config.yml file.
#
# Variable: AIRFLOW__TRACES__OTEL_SSL_ACTIVE
#
otel_ssl_active = False

# If True, after the task is complete, the full task log messages will be added as the
# span events, chunked by 64k size. defaults to False.
#
# Variable: AIRFLOW__TRACES__OTEL_TASK_LOG_EVENT
#
otel_task_log_event = False

[secrets]
# Full class name of secrets backend to enable (will precede env vars and metastore in search path)
#
# Example: backend = airflow.providers.amazon.aws.secrets.systems_manager.SystemsManagerParameterStoreBackend
#
# Variable: AIRFLOW__SECRETS__BACKEND
#
backend =

# The backend_kwargs param is loaded into a dictionary and passed to ``__init__``
# of secrets backend class. See documentation for the secrets backend you are using.
# JSON is expected.
#
# Example for AWS Systems Manager ParameterStore:
# ``{"connections_prefix": "/airflow/connections", "profile_name": "default"}``
#
# Variable: AIRFLOW__SECRETS__BACKEND_KWARGS
#
backend_kwargs =

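# Putting the two settings above together (illustrative only; the class path and
# kwargs are copied from the examples above), enabling the AWS Systems Manager
# ParameterStore backend would look like this:
#
# Example:
# backend = airflow.providers.amazon.aws.secrets.systems_manager.SystemsManagerParameterStoreBackend
# backend_kwargs = {"connections_prefix": "/airflow/connections", "profile_name": "default"}
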
# .. note:: |experimental|
#
# Enables local caching of Variables, when parsing DAGs only.
# Using this option can make dag parsing faster if Variables are used in top level code, at the expense
# of longer propagation time for changes.
# Please note that this cache concerns only the DAG parsing step. There is no caching in place when DAG
# tasks are run.
#
# Variable: AIRFLOW__SECRETS__USE_CACHE
#
use_cache = False

# .. note:: |experimental|
#
# When the cache is enabled, this is the duration for which we consider an entry in the cache to be
# valid. Entries are refreshed if they are older than this many seconds.
# It means that when the cache is enabled, this is the maximum amount of time you need to wait to see a
# Variable change take effect.
#
# Variable: AIRFLOW__SECRETS__CACHE_TTL_SECONDS
#
cache_ttl_seconds = 900

[cli]
# In what way should the cli access the API. The LocalClient will use the
# database directly, while the json_client will use the api running on the
# webserver
#
# Variable: AIRFLOW__CLI__API_CLIENT
#
api_client = airflow.api.client.local_client

# If you set web_server_url_prefix, do NOT forget to append it here, ex:
# ``endpoint_url = http://localhost:8080/myroot``
# So api will look like: ``http://localhost:8080/myroot/api/experimental/...``
#
# Variable: AIRFLOW__CLI__ENDPOINT_URL
#
endpoint_url = http://localhost:8080

[debug]
# Used only with ``DebugExecutor``. If set to ``True`` DAG will fail with first
# failed task. Helpful for debugging purposes.
#
# Variable: AIRFLOW__DEBUG__FAIL_FAST
#
fail_fast = False

[api]
# Enables the deprecated experimental API. Please note that these API endpoints do not have
# access control. An authenticated user has full access.
#
# .. warning::
#
# This `Experimental REST API
# <https://airflow.apache.org/docs/apache-airflow/stable/deprecated-rest-api-ref.html>`__ is
# deprecated since version 2.0. Please consider using
# `the Stable REST API
# <https://airflow.apache.org/docs/apache-airflow/stable/stable-rest-api-ref.html>`__.
# For more information on migration, see
# `RELEASE_NOTES.rst <https://github.com/apache/airflow/blob/main/RELEASE_NOTES.rst>`_
#
# Variable: AIRFLOW__API__ENABLE_EXPERIMENTAL_API
#
enable_experimental_api = False

# Comma separated list of auth backends to authenticate users of the API. See
# `Security: API
# <https://airflow.apache.org/docs/apache-airflow/stable/security/api.html>`__ for possible values.
# ("airflow.api.auth.backend.default" allows all requests for historic reasons)
#
# Variable: AIRFLOW__API__AUTH_BACKENDS
#
auth_backends = airflow.api.auth.backend.session

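# For illustration only (not part of the shipped default config): with the
# ``session`` backend alone the API is only reachable from a logged-in browser
# session. If ``airflow.api.auth.backend.basic_auth`` were added to
# ``auth_backends``, the stable REST API could be scripted roughly like the
# sketch below; the credentials are placeholders.
#
# .. code-block:: python
#
#     import requests
#
#     resp = requests.get(
#         "http://localhost:8080/api/v1/dags",
#         auth=("admin", "admin"),  # placeholder credentials
#     )
#     resp.raise_for_status()
#     print([d["dag_id"] for d in resp.json()["dags"]])
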
# Used to set the maximum page limit for API requests. If limit passed as param
# is greater than maximum page limit, it will be ignored and maximum page limit value
# will be set as the limit
#
# Variable: AIRFLOW__API__MAXIMUM_PAGE_LIMIT
#
maximum_page_limit = 100

# Used to set the default page limit when limit param is zero or not provided in API
# requests. Otherwise if positive integer is passed in the API requests as limit, the
# smallest number of user given limit or maximum page limit is taken as limit.
#
# Variable: AIRFLOW__API__FALLBACK_PAGE_LIMIT
#
fallback_page_limit = 100

# The intended audience for JWT token credentials used for authorization. This value must match on the client and server sides. If empty, audience will not be tested.
#
# Example: google_oauth2_audience = project-id-random-value.apps.googleusercontent.com
#
# Variable: AIRFLOW__API__GOOGLE_OAUTH2_AUDIENCE
#
google_oauth2_audience =

# Path to Google Cloud Service Account key file (JSON). If omitted, authorization based on
# `the Application Default Credentials
# <https://cloud.google.com/docs/authentication/production#finding_credentials_automatically>`__ will
# be used.
#
# Example: google_key_path = /files/service-account-json
#
# Variable: AIRFLOW__API__GOOGLE_KEY_PATH
#
google_key_path =

# Used in response to a preflight request to indicate which HTTP
# headers can be used when making the actual request. This header is
# the server side response to the browser's
# Access-Control-Request-Headers header.
#
# Variable: AIRFLOW__API__ACCESS_CONTROL_ALLOW_HEADERS
#
access_control_allow_headers =

# Specifies the method or methods allowed when accessing the resource.
#
# Variable: AIRFLOW__API__ACCESS_CONTROL_ALLOW_METHODS
#
access_control_allow_methods =

# Indicates whether the response can be shared with requesting code from the given origins.
# Separate URLs with space.
#
# Variable: AIRFLOW__API__ACCESS_CONTROL_ALLOW_ORIGINS
#
access_control_allow_origins =

# Indicates whether the **xcomEntries** endpoint supports the **deserialize**
# flag. If set to ``False``, setting this flag in a request would result in a
# 400 Bad Request error.
#
# Variable: AIRFLOW__API__ENABLE_XCOM_DESERIALIZE_SUPPORT
#
enable_xcom_deserialize_support = False

[lineage]
# what lineage backend to use
#
# Variable: AIRFLOW__LINEAGE__BACKEND
#
backend =

[operators]
# The default owner assigned to each new operator, unless
# provided explicitly or passed via ``default_args``
#
# Variable: AIRFLOW__OPERATORS__DEFAULT_OWNER
#
default_owner = airflow

# The default value of attribute "deferrable" in operators and sensors.
#
# Variable: AIRFLOW__OPERATORS__DEFAULT_DEFERRABLE
#
default_deferrable = false

# Indicates the default number of CPU units allocated to each operator when no specific CPU request
# is specified in the operator's configuration
#
# Variable: AIRFLOW__OPERATORS__DEFAULT_CPUS
#
default_cpus = 1

# Indicates the default number of RAM allocated to each operator when no specific RAM request
# is specified in the operator's configuration
#
# Variable: AIRFLOW__OPERATORS__DEFAULT_RAM
#
default_ram = 512

# Indicates the default number of disk storage allocated to each operator when no specific disk request
# is specified in the operator's configuration
#
# Variable: AIRFLOW__OPERATORS__DEFAULT_DISK
#
default_disk = 512

# Indicates the default number of GPUs allocated to each operator when no specific GPUs request
# is specified in the operator's configuration
#
# Variable: AIRFLOW__OPERATORS__DEFAULT_GPUS
#
default_gpus = 0

# Default queue that tasks get assigned to and that worker listen on.
#
# Variable: AIRFLOW__OPERATORS__DEFAULT_QUEUE
#
default_queue = default

# Is allowed to pass additional/unused arguments (args, kwargs) to the BaseOperator operator.
# If set to ``False``, an exception will be thrown,
# otherwise only the console message will be displayed.
#
# Variable: AIRFLOW__OPERATORS__ALLOW_ILLEGAL_ARGUMENTS
#
allow_illegal_arguments = False

[webserver]
# The message displayed when a user attempts to execute actions beyond their authorised privileges.
#
# Variable: AIRFLOW__WEBSERVER__ACCESS_DENIED_MESSAGE
#
access_denied_message = Access is Denied

# Path of webserver config file used for configuring the webserver parameters
#
# Variable: AIRFLOW__WEBSERVER__CONFIG_FILE
#
config_file = /opt/airflow/webserver_config.py

# The base url of your website: Airflow cannot guess what domain or CNAME you are using.
# This is used to create links in the Log Url column in the Browse - Task Instances menu,
# as well as in any automated emails sent by Airflow that contain links to your webserver.
#
# Variable: AIRFLOW__WEBSERVER__BASE_URL
#
base_url = http://localhost:8080

# Default timezone to display all dates in the UI, can be UTC, system, or
# any IANA timezone string (e.g. **Europe/Amsterdam**). If left empty the
# default value of core/default_timezone will be used
#
# Example: default_ui_timezone = America/New_York
#
# Variable: AIRFLOW__WEBSERVER__DEFAULT_UI_TIMEZONE
#
default_ui_timezone = UTC

# The ip specified when starting the web server
#
# Variable: AIRFLOW__WEBSERVER__WEB_SERVER_HOST
#
web_server_host = 0.0.0.0

# The port on which to run the web server
#
# Variable: AIRFLOW__WEBSERVER__WEB_SERVER_PORT
#
web_server_port = 8080

# Paths to the SSL certificate and key for the web server. When both are
# provided SSL will be enabled. This does not change the web server port.
#
# Variable: AIRFLOW__WEBSERVER__WEB_SERVER_SSL_CERT
#
web_server_ssl_cert =

# Paths to the SSL certificate and key for the web server. When both are
# provided SSL will be enabled. This does not change the web server port.
#
# Variable: AIRFLOW__WEBSERVER__WEB_SERVER_SSL_KEY
#
web_server_ssl_key =

# The type of backend used to store web session data, can be ``database`` or ``securecookie``. For the
# ``database`` backend, sessions are store in the database and they can be
# managed there (for example when you reset password of the user, all sessions for that user are
# deleted). For the ``securecookie`` backend, sessions are stored in encrypted cookies on the client
# side. The ``securecookie`` mechanism is 'lighter' than database backend, but sessions are not deleted
# when you reset password of the user, which means that other than waiting for expiry time, the only
# way to invalidate all sessions for a user is to change secret_key and restart webserver (which
# also invalidates and logs out all other user's sessions).
#
# When you are using ``database`` backend, make sure to keep your database session table small
# by periodically running ``airflow db clean --table session`` command, especially if you have
# automated API calls that will create a new session for each call rather than reuse the sessions
# stored in browser cookies.
#
# Example: session_backend = securecookie
#
# Variable: AIRFLOW__WEBSERVER__SESSION_BACKEND
#
session_backend = database

# Number of seconds the webserver waits before killing gunicorn master that doesn't respond
#
# Variable: AIRFLOW__WEBSERVER__WEB_SERVER_MASTER_TIMEOUT
#
web_server_master_timeout = 120

# Number of seconds the gunicorn webserver waits before timing out on a worker
#
# Variable: AIRFLOW__WEBSERVER__WEB_SERVER_WORKER_TIMEOUT
#
web_server_worker_timeout = 120

# Number of workers to refresh at a time. When set to 0, worker refresh is
# disabled. When nonzero, airflow periodically refreshes webserver workers by
# bringing up new ones and killing old ones.
#
# Variable: AIRFLOW__WEBSERVER__WORKER_REFRESH_BATCH_SIZE
#
worker_refresh_batch_size = 1

# Number of seconds to wait before refreshing a batch of workers.
#
# Variable: AIRFLOW__WEBSERVER__WORKER_REFRESH_INTERVAL
#
worker_refresh_interval = 6000

# If set to ``True``, Airflow will track files in plugins_folder directory. When it detects changes,
# then reload the gunicorn. If set to ``True``, gunicorn starts without preloading, which is slower,
# uses more memory, and may cause race conditions. Avoid setting this to ``True`` in production.
#
# Variable: AIRFLOW__WEBSERVER__RELOAD_ON_PLUGIN_CHANGE
#
reload_on_plugin_change = False

# Secret key used to run your flask app. It should be as random as possible. However, when running
# more than 1 instances of webserver, make sure all of them use the same ``secret_key`` otherwise
# one of them will error with "CSRF session token is missing".
# The webserver key is also used to authorize requests to Celery workers when logs are retrieved.
# The token generated using the secret key has a short expiry time though - make sure that time on
# ALL the machines that you run airflow components on is synchronized (for example using ntpd)
# otherwise you might get "forbidden" errors when the logs are accessed.
#
# Variable: AIRFLOW__WEBSERVER__SECRET_KEY
#
secret_key = pDBrar+Nyodn8Sf5ZeJfVw==

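# A common way to generate a fresh value for ``secret_key`` (illustrative only)
# is a short Python one-liner; any sufficiently random string works.
#
# .. code-block:: python
#
#     import secrets
#
#     # 16 random bytes, URL-safe base64 encoded
#     print(secrets.token_urlsafe(16))
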
# Number of workers to run the Gunicorn web server
#
# Variable: AIRFLOW__WEBSERVER__WORKERS
#
workers = 4

# The worker class gunicorn should use. Choices include
# ``sync`` (default), ``eventlet``, ``gevent``.
#
# .. warning::
#
# When using ``gevent`` you might also want to set the ``_AIRFLOW_PATCH_GEVENT``
# environment variable to ``"1"`` to make sure gevent patching is done as early as possible.
#
# Be careful to set ``_AIRFLOW_PATCH_GEVENT`` only on the web server as gevent patching may
# affect the scheduler behavior via the ``multiprocessing`` sockets module and cause crash.
#
# See related Issues / PRs for more details:
#
# * https://github.com/benoitc/gunicorn/issues/2796
# * https://github.com/apache/airflow/issues/8212
# * https://github.com/apache/airflow/pull/28283
#
# Variable: AIRFLOW__WEBSERVER__WORKER_CLASS
#
worker_class = sync

# Log files for the gunicorn webserver. '-' means log to stderr.
#
# Variable: AIRFLOW__WEBSERVER__ACCESS_LOGFILE
#
access_logfile = -

# Log files for the gunicorn webserver. '-' means log to stderr.
#
# Variable: AIRFLOW__WEBSERVER__ERROR_LOGFILE
#
error_logfile = -

# Access log format for gunicorn webserver.
# default format is ``%%(h)s %%(l)s %%(u)s %%(t)s "%%(r)s" %%(s)s %%(b)s "%%(f)s" "%%(a)s"``
# See `Gunicorn Settings: 'access_log_format' Reference
# <https://docs.gunicorn.org/en/stable/settings.html#access-log-format>`__ for more details
#
# Variable: AIRFLOW__WEBSERVER__ACCESS_LOGFORMAT
#
access_logformat =

# Expose the configuration file in the web server. Set to ``non-sensitive-only`` to show all values
# except those that have security implications. ``True`` shows all values. ``False`` hides the
# configuration completely.
#
# Variable: AIRFLOW__WEBSERVER__EXPOSE_CONFIG
#
expose_config = False

# Expose hostname in the web server
#
# Variable: AIRFLOW__WEBSERVER__EXPOSE_HOSTNAME
#
expose_hostname = False

# Expose stacktrace in the web server
#
# Variable: AIRFLOW__WEBSERVER__EXPOSE_STACKTRACE
#
expose_stacktrace = False

# Default DAG view. Valid values are: ``grid``, ``graph``, ``duration``, ``gantt``, ``landing_times``
#
# Variable: AIRFLOW__WEBSERVER__DAG_DEFAULT_VIEW
#
dag_default_view = grid

# Default DAG orientation. Valid values are:
# ``LR`` (Left->Right), ``TB`` (Top->Bottom), ``RL`` (Right->Left), ``BT`` (Bottom->Top)
#
# Variable: AIRFLOW__WEBSERVER__DAG_ORIENTATION
#
dag_orientation = LR

# Sorting order in grid view. Valid values are: ``topological``, ``hierarchical_alphabetical``
#
# Variable: AIRFLOW__WEBSERVER__GRID_VIEW_SORTING_ORDER
#
grid_view_sorting_order = topological

# The amount of time (in secs) webserver will wait for initial handshake
# while fetching logs from other worker machine
#
# Variable: AIRFLOW__WEBSERVER__LOG_FETCH_TIMEOUT_SEC
#
log_fetch_timeout_sec = 5

# Time interval (in secs) to wait before next log fetching.
#
# Variable: AIRFLOW__WEBSERVER__LOG_FETCH_DELAY_SEC
#
log_fetch_delay_sec = 2

# Distance away from page bottom to enable auto tailing.
#
# Variable: AIRFLOW__WEBSERVER__LOG_AUTO_TAILING_OFFSET
#
log_auto_tailing_offset = 30

# Animation speed for auto tailing log display.
#
# Variable: AIRFLOW__WEBSERVER__LOG_ANIMATION_SPEED
#
log_animation_speed = 1000

# By default, the webserver shows paused DAGs. Flip this to hide paused
# DAGs by default
#
# Variable: AIRFLOW__WEBSERVER__HIDE_PAUSED_DAGS_BY_DEFAULT
#
hide_paused_dags_by_default = False

# Consistent page size across all listing views in the UI
#
# Variable: AIRFLOW__WEBSERVER__PAGE_SIZE
#
page_size = 100

# Define the color of navigation bar
#
# Variable: AIRFLOW__WEBSERVER__NAVBAR_COLOR
#
navbar_color = #fff

# Define the color of text in the navigation bar
#
# Variable: AIRFLOW__WEBSERVER__NAVBAR_TEXT_COLOR
#
navbar_text_color = #51504f

# Define the color of navigation bar links when hovered
#
# Variable: AIRFLOW__WEBSERVER__NAVBAR_HOVER_COLOR
#
navbar_hover_color = #eee

# Define the color of text in the navigation bar when hovered
#
# Variable: AIRFLOW__WEBSERVER__NAVBAR_TEXT_HOVER_COLOR
#
navbar_text_hover_color = #51504f

# Define the color of the logo text
#
# Variable: AIRFLOW__WEBSERVER__NAVBAR_LOGO_TEXT_COLOR
#
navbar_logo_text_color = #51504f

# Default dagrun to show in UI
#
# Variable: AIRFLOW__WEBSERVER__DEFAULT_DAG_RUN_DISPLAY_NUMBER
#
default_dag_run_display_number = 25

# Enable werkzeug ``ProxyFix`` middleware for reverse proxy
#
# Variable: AIRFLOW__WEBSERVER__ENABLE_PROXY_FIX
#
enable_proxy_fix = False

# Number of values to trust for ``X-Forwarded-For``.
# See `Werkzeug: X-Forwarded-For Proxy Fix
# <https://werkzeug.palletsprojects.com/en/2.3.x/middleware/proxy_fix/>`__ for more details.
#
# Variable: AIRFLOW__WEBSERVER__PROXY_FIX_X_FOR
#
proxy_fix_x_for = 1

# Number of values to trust for ``X-Forwarded-Proto``.
# See `Werkzeug: X-Forwarded-For Proxy Fix
# <https://werkzeug.palletsprojects.com/en/2.3.x/middleware/proxy_fix/>`__ for more details.
#
# Variable: AIRFLOW__WEBSERVER__PROXY_FIX_X_PROTO
#
proxy_fix_x_proto = 1

# Number of values to trust for ``X-Forwarded-Host``.
# See `Werkzeug: X-Forwarded-For Proxy Fix
# <https://werkzeug.palletsprojects.com/en/2.3.x/middleware/proxy_fix/>`__ for more details.
#
# Variable: AIRFLOW__WEBSERVER__PROXY_FIX_X_HOST
#
proxy_fix_x_host = 1

# Number of values to trust for ``X-Forwarded-Port``.
# See `Werkzeug: X-Forwarded-For Proxy Fix
# <https://werkzeug.palletsprojects.com/en/2.3.x/middleware/proxy_fix/>`__ for more details.
#
# Variable: AIRFLOW__WEBSERVER__PROXY_FIX_X_PORT
#
proxy_fix_x_port = 1

# Number of values to trust for ``X-Forwarded-Prefix``.
# See `Werkzeug: X-Forwarded-For Proxy Fix
# <https://werkzeug.palletsprojects.com/en/2.3.x/middleware/proxy_fix/>`__ for more details.
#
# Variable: AIRFLOW__WEBSERVER__PROXY_FIX_X_PREFIX
#
proxy_fix_x_prefix = 1

# Set secure flag on session cookie
#
# Variable: AIRFLOW__WEBSERVER__COOKIE_SECURE
#
cookie_secure = False

# Set samesite policy on session cookie
#
# Variable: AIRFLOW__WEBSERVER__COOKIE_SAMESITE
#
cookie_samesite = Lax

# Default setting for wrap toggle on DAG code and TI log views.
#
# Variable: AIRFLOW__WEBSERVER__DEFAULT_WRAP
#
default_wrap = False

# Allow the UI to be rendered in a frame
#
# Variable: AIRFLOW__WEBSERVER__X_FRAME_ENABLED
#
x_frame_enabled = True

# Send anonymous user activity to your analytics tool
# choose from ``google_analytics``, ``segment``, ``metarouter``, or ``matomo``
#
# Variable: AIRFLOW__WEBSERVER__ANALYTICS_TOOL
#
# analytics_tool =

# Unique ID of your account in the analytics tool
#
# Variable: AIRFLOW__WEBSERVER__ANALYTICS_ID
#
# analytics_id =

# Your instances url, only applicable to Matomo.
#
# Example: analytics_url = https://your.matomo.instance.com/
#
# Variable: AIRFLOW__WEBSERVER__ANALYTICS_URL
#
# analytics_url =

# 'Recent Tasks' stats will show for old DagRuns if set
#
# Variable: AIRFLOW__WEBSERVER__SHOW_RECENT_STATS_FOR_COMPLETED_RUNS
#
show_recent_stats_for_completed_runs = True

# The UI cookie lifetime in minutes. User will be logged out from UI after
# ``[webserver] session_lifetime_minutes`` of non-activity
#
# Variable: AIRFLOW__WEBSERVER__SESSION_LIFETIME_MINUTES
#
session_lifetime_minutes = 43200

# Sets a custom page title for the DAGs overview page and site title for all pages
#
# Variable: AIRFLOW__WEBSERVER__INSTANCE_NAME
#
# instance_name =

# Whether the custom page title for the DAGs overview page contains any Markup language
#
# Variable: AIRFLOW__WEBSERVER__INSTANCE_NAME_HAS_MARKUP
#
instance_name_has_markup = False

# How frequently, in seconds, the DAG data will auto-refresh in graph or grid view
# when auto-refresh is turned on
#
# Variable: AIRFLOW__WEBSERVER__AUTO_REFRESH_INTERVAL
#
auto_refresh_interval = 3

# Boolean for displaying warning for publicly viewable deployment
#
# Variable: AIRFLOW__WEBSERVER__WARN_DEPLOYMENT_EXPOSURE
#
warn_deployment_exposure = True

# Comma separated string of view events to exclude from dag audit view.
# All other events will be added minus the ones passed here.
# The audit logs in the db will not be affected by this parameter.
#
# Example: audit_view_excluded_events = cli_task_run,running,success
#
# Variable: AIRFLOW__WEBSERVER__AUDIT_VIEW_EXCLUDED_EVENTS
#
# audit_view_excluded_events =

# Comma separated string of view events to include in dag audit view.
# If passed, only these events will populate the dag audit view.
# The audit logs in the db will not be affected by this parameter.
#
# Example: audit_view_included_events = dagrun_cleared,failed
#
# Variable: AIRFLOW__WEBSERVER__AUDIT_VIEW_INCLUDED_EVENTS
#
# audit_view_included_events =

# Boolean for running SwaggerUI in the webserver.
#
# Variable: AIRFLOW__WEBSERVER__ENABLE_SWAGGER_UI
#
enable_swagger_ui = True

# Boolean for running Internal API in the webserver.
#
# Variable: AIRFLOW__WEBSERVER__RUN_INTERNAL_API
#
run_internal_api = False

# The caching algorithm used by the webserver. Must be a valid hashlib function name.
#
# Example: caching_hash_method = sha256
#
# Variable: AIRFLOW__WEBSERVER__CACHING_HASH_METHOD
#
caching_hash_method = md5

# Behavior of the trigger DAG run button for DAGs without params. ``False`` to skip and trigger
# without displaying a form to add a **dag_run.conf**, ``True`` to always display the form.
# The form is displayed always if parameters are defined.
#
# Variable: AIRFLOW__WEBSERVER__SHOW_TRIGGER_FORM_IF_NO_PARAMS
#
show_trigger_form_if_no_params = False

# Number of recent DAG run configurations in the selector on the trigger web form.
#
# Example: num_recent_configurations_for_trigger = 10
#
# Variable: AIRFLOW__WEBSERVER__NUM_RECENT_CONFIGURATIONS_FOR_TRIGGER
#
num_recent_configurations_for_trigger = 5

# A DAG author is able to provide any raw HTML into ``doc_md`` or params description in
# ``description_md`` for text formatting. This is including potentially unsafe javascript.
# Displaying the DAG or trigger form in web UI provides the DAG author the potential to
# inject malicious code into clients browsers. To ensure the web UI is safe by default,
# raw HTML is disabled by default. If you trust your DAG authors, you can enable HTML
# support in markdown by setting this option to ``True``.
#
# This parameter also enables the deprecated fields ``description_html`` and
# ``custom_html_form`` in DAG params until the feature is removed in a future version.
#
# Example: allow_raw_html_descriptions = False
#
# Variable: AIRFLOW__WEBSERVER__ALLOW_RAW_HTML_DESCRIPTIONS
#
allow_raw_html_descriptions = False

# The maximum size of the request payload (in MB) that can be sent.
#
# Variable: AIRFLOW__WEBSERVER__ALLOWED_PAYLOAD_SIZE
#
allowed_payload_size = 1.0

# Require confirmation when changing a DAG in the web UI. This is to prevent accidental changes
# to a DAG that may be running on sensitive environments like production.
# When set to ``True``, confirmation dialog will be shown when a user tries to Pause/Unpause,
# Trigger a DAG
#
# Variable: AIRFLOW__WEBSERVER__REQUIRE_CONFIRMATION_DAG_CHANGE
#
require_confirmation_dag_change = False

[email]
# Configuration email backend and whether to
# send email alerts on retry or failure

# Email backend to use
#
# Variable: AIRFLOW__EMAIL__EMAIL_BACKEND
#
email_backend = airflow.utils.email.send_email_smtp

# Email connection to use
#
# Variable: AIRFLOW__EMAIL__EMAIL_CONN_ID
#
email_conn_id = smtp_default

# Whether email alerts should be sent when a task is retried
#
# Variable: AIRFLOW__EMAIL__DEFAULT_EMAIL_ON_RETRY
#
default_email_on_retry = True

# Whether email alerts should be sent when a task failed
#
# Variable: AIRFLOW__EMAIL__DEFAULT_EMAIL_ON_FAILURE
#
default_email_on_failure = True

# File that will be used as the template for Email subject (which will be rendered using Jinja2).
# If not set, Airflow uses a base template.
#
# Example: subject_template = /path/to/my_subject_template_file
#
# Variable: AIRFLOW__EMAIL__SUBJECT_TEMPLATE
#
# subject_template =

# File that will be used as the template for Email content (which will be rendered using Jinja2).
# If not set, Airflow uses a base template.
#
# Example: html_content_template = /path/to/my_html_content_template_file
#
# Variable: AIRFLOW__EMAIL__HTML_CONTENT_TEMPLATE
#
# html_content_template =

# Email address that will be used as sender address.
# It can either be raw email or the complete address in a format ``Sender Name <[email protected]>``
#
# Example: from_email = Airflow <[email protected]>
#
# Variable: AIRFLOW__EMAIL__FROM_EMAIL
#
# from_email =

# ssl context to use when using SMTP and IMAP SSL connections. By default, the context is "default"
# which sets it to ``ssl.create_default_context()`` which provides the right balance between
# compatibility and security, it however requires that certificates in your operating system are
# updated and that SMTP/IMAP servers of yours have valid certificates that have corresponding public
# keys installed on your machines. You can switch it to "none" if you want to disable checking
# of the certificates, but it is not recommended as it allows MITM (man-in-the-middle) attacks
# if your infrastructure is not sufficiently secured. It should only be set temporarily while you
# are fixing your certificate configuration. This can be typically done by upgrading to newer
# version of the operating system you run Airflow components on,by upgrading/refreshing proper
# certificates in the OS or by updating certificates for your mail servers.
#
# Example: ssl_context = default
#
# Variable: AIRFLOW__EMAIL__SSL_CONTEXT
#
ssl_context = default

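# As a quick smoke test of the email setup above (illustrative only), the
# configured backend can be driven directly from a Python shell on the Airflow
# host; the recipient address below is a placeholder.
#
# .. code-block:: python
#
#     from airflow.utils.email import send_email
#
#     send_email(
#         to="you@example.com",  # placeholder recipient
#         subject="Airflow SMTP test",
#         html_content="<p>Hello from Airflow</p>",
#     )
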
[smtp]
# If you want airflow to send emails on retries, failure, and you want to use
# the airflow.utils.email.send_email_smtp function, you have to configure an
# smtp server here

# Specifies the host server address used by Airflow when sending out email notifications via SMTP.
#
# Variable: AIRFLOW__SMTP__SMTP_HOST
#
smtp_host = localhost

# Determines whether to use the STARTTLS command when connecting to the SMTP server.
#
# Variable: AIRFLOW__SMTP__SMTP_STARTTLS
#
smtp_starttls = True

# Determines whether to use an SSL connection when talking to the SMTP server.
#
# Variable: AIRFLOW__SMTP__SMTP_SSL
#
smtp_ssl = False

# Username to authenticate when connecting to smtp server.
#
# Example: smtp_user = airflow
#
# Variable: AIRFLOW__SMTP__SMTP_USER
#
# smtp_user =

# Password to authenticate when connecting to smtp server.
#
# Example: smtp_password = airflow
#
# Variable: AIRFLOW__SMTP__SMTP_PASSWORD
#
# smtp_password =

# Defines the port number on which Airflow connects to the SMTP server to send email notifications.
#
# Variable: AIRFLOW__SMTP__SMTP_PORT
#
smtp_port = 25

# Specifies the default **from** email address used when Airflow sends email notifications.
#
# Variable: AIRFLOW__SMTP__SMTP_MAIL_FROM
#
smtp_mail_from = [email protected]

# Determines the maximum time (in seconds) the Apache Airflow system will wait for a
# connection to the SMTP server to be established.
#
# Variable: AIRFLOW__SMTP__SMTP_TIMEOUT
#
smtp_timeout = 30

# Defines the maximum number of times Airflow will attempt to connect to the SMTP server.
#
# Variable: AIRFLOW__SMTP__SMTP_RETRY_LIMIT
#
smtp_retry_limit = 5

[sentry]
# `Sentry <https://docs.sentry.io>`__ integration. Here you can supply
# additional configuration options based on the Python platform.
# See `Python / Configuration / Basic Options
# <https://docs.sentry.io/platforms/python/configuration/options/>`__ for more details.
# Unsupported options: ``integrations``, ``in_app_include``, ``in_app_exclude``,
# ``ignore_errors``, ``before_breadcrumb``, ``transport``.

# Enable error reporting to Sentry
#
# Variable: AIRFLOW__SENTRY__SENTRY_ON
#
sentry_on = false

#
# Variable: AIRFLOW__SENTRY__SENTRY_DSN
#
sentry_dsn =

# Dotted path to a before_send function that the sentry SDK should be configured to use.
#
# Variable: AIRFLOW__SENTRY__BEFORE_SEND
#
# before_send =

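# For illustration only (module path and names are made up): a callable
# referenced here, e.g. ``before_send = path.to.scrub_event``, follows the usual
# Sentry SDK contract of receiving the event plus a hint and returning the event,
# or ``None`` to drop it.
#
# .. code-block:: python
#
#     def scrub_event(event, hint):
#         # Drop noisy task-logger events, forward everything else unchanged.
#         if event.get("logger") == "airflow.task":
#             return None
#         return event
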
[scheduler]
# Task instances listen for external kill signal (when you clear tasks
# from the CLI or the UI), this defines the frequency at which they should
# listen (in seconds).
#
# Variable: AIRFLOW__SCHEDULER__JOB_HEARTBEAT_SEC
#
job_heartbeat_sec = 5

# The scheduler constantly tries to trigger new tasks (look at the
# scheduler section in the docs for more information). This defines
# how often the scheduler should run (in seconds).
#
# Variable: AIRFLOW__SCHEDULER__SCHEDULER_HEARTBEAT_SEC
#
scheduler_heartbeat_sec = 5

# The frequency (in seconds) at which the LocalTaskJob should send heartbeat signals to the
# scheduler to notify it's still alive. If this value is set to 0, the heartbeat interval will default
# to the value of ``[scheduler] scheduler_zombie_task_threshold``.
#
# Variable: AIRFLOW__SCHEDULER__LOCAL_TASK_JOB_HEARTBEAT_SEC
#
local_task_job_heartbeat_sec = 0

# The number of times to try to schedule each DAG file
# -1 indicates unlimited number
#
# Variable: AIRFLOW__SCHEDULER__NUM_RUNS
#
num_runs = -1

# Controls how long the scheduler will sleep between loops, but if there was nothing to do
# in the loop. i.e. if it scheduled something then it will start the next loop
# iteration straight away.
#
# Variable: AIRFLOW__SCHEDULER__SCHEDULER_IDLE_SLEEP_TIME
#
scheduler_idle_sleep_time = 1

# Number of seconds after which a DAG file is parsed. The DAG file is parsed every
# ``[scheduler] min_file_process_interval`` number of seconds. Updates to DAGs are reflected after
# this interval. Keeping this number low will increase CPU usage.
#
# Variable: AIRFLOW__SCHEDULER__MIN_FILE_PROCESS_INTERVAL
#
min_file_process_interval = 30

# How often (in seconds) to check for stale DAGs (DAGs which are no longer present in
# the expected files) which should be deactivated, as well as datasets that are no longer
# referenced and should be marked as orphaned.
#
# Variable: AIRFLOW__SCHEDULER__PARSING_CLEANUP_INTERVAL
#
parsing_cleanup_interval = 60

# How long (in seconds) to wait after we have re-parsed a DAG file before deactivating stale
# DAGs (DAGs which are no longer present in the expected files). The reason why we need
# this threshold is to account for the time between when the file is parsed and when the
# DAG is loaded. The absolute maximum that this could take is ``[core] dag_file_processor_timeout``,
# but when you have a long timeout configured, it results in a significant delay in the
# deactivation of stale dags.
#
# Variable: AIRFLOW__SCHEDULER__STALE_DAG_THRESHOLD
#
stale_dag_threshold = 50

# How often (in seconds) to scan the DAGs directory for new files. Default to 5 minutes.
#
# Variable: AIRFLOW__SCHEDULER__DAG_DIR_LIST_INTERVAL
#
dag_dir_list_interval = 300

# How often should stats be printed to the logs. Setting to 0 will disable printing stats
#
# Variable: AIRFLOW__SCHEDULER__PRINT_STATS_INTERVAL
#
print_stats_interval = 30

# How often (in seconds) should pool usage stats be sent to StatsD (if statsd_on is enabled)
#
# Variable: AIRFLOW__SCHEDULER__POOL_METRICS_INTERVAL
#
pool_metrics_interval = 5.0

# If the last scheduler heartbeat happened more than ``[scheduler] scheduler_health_check_threshold``
# ago (in seconds), scheduler is considered unhealthy.
# This is used by the health check in the **/health** endpoint and in ``airflow jobs check`` CLI
# for SchedulerJob.
#
# Variable: AIRFLOW__SCHEDULER__SCHEDULER_HEALTH_CHECK_THRESHOLD
#
scheduler_health_check_threshold = 30

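# For illustration only: the **/health** endpoint mentioned above can be polled
# to see whether the last scheduler heartbeat is within this threshold; the base
# URL assumes the webserver settings earlier in this file.
#
# .. code-block:: python
#
#     import requests
#
#     health = requests.get("http://localhost:8080/health").json()
#     print(health["scheduler"]["status"])  # "healthy" or "unhealthy"
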
# When you start a scheduler, airflow starts a tiny web server
# subprocess to serve a health check if this is set to ``True``
#
# Variable: AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK
#
enable_health_check = False

# When you start a scheduler, airflow starts a tiny web server
# subprocess to serve a health check on this host
#
# Variable: AIRFLOW__SCHEDULER__SCHEDULER_HEALTH_CHECK_SERVER_HOST
#
scheduler_health_check_server_host = 0.0.0.0

# When you start a scheduler, airflow starts a tiny web server
# subprocess to serve a health check on this port
#
# Variable: AIRFLOW__SCHEDULER__SCHEDULER_HEALTH_CHECK_SERVER_PORT
#
scheduler_health_check_server_port = 8974

# How often (in seconds) should the scheduler check for orphaned tasks and SchedulerJobs
#
# Variable: AIRFLOW__SCHEDULER__ORPHANED_TASKS_CHECK_INTERVAL
#
orphaned_tasks_check_interval = 300.0

# Determines the directory where logs for the child processes of the scheduler will be stored
#
# Variable: AIRFLOW__SCHEDULER__CHILD_PROCESS_LOG_DIRECTORY
#
child_process_log_directory = /opt/airflow/logs/scheduler

# Local task jobs periodically heartbeat to the DB. If the job has
# not heartbeat in this many seconds, the scheduler will mark the
# associated task instance as failed and will re-schedule the task.
#
# Variable: AIRFLOW__SCHEDULER__SCHEDULER_ZOMBIE_TASK_THRESHOLD
#
scheduler_zombie_task_threshold = 300

# How often (in seconds) should the scheduler check for zombie tasks.
#
# Variable: AIRFLOW__SCHEDULER__ZOMBIE_DETECTION_INTERVAL
#
zombie_detection_interval = 10.0

# Turn off scheduler catchup by setting this to ``False``.
# Default behavior is unchanged and
# Command Line Backfills still work, but the scheduler
# will not do scheduler catchup if this is ``False``,
# however it can be set on a per DAG basis in the
# DAG definition (catchup)
#
# Variable: AIRFLOW__SCHEDULER__CATCHUP_BY_DEFAULT
#
catchup_by_default = True

# Setting this to ``True`` will make first task instance of a task
# ignore depends_on_past setting. A task instance will be considered
# as the first task instance of a task when there is no task instance
# in the DB with an execution_date earlier than it., i.e. no manual marking
# success will be needed for a newly added task to be scheduled.
#
# Variable: AIRFLOW__SCHEDULER__IGNORE_FIRST_DEPENDS_ON_PAST_BY_DEFAULT
#
ignore_first_depends_on_past_by_default = True

# This changes the batch size of queries in the scheduling main loop.
# This should not be greater than ``[core] parallelism``.
# If this is too high, SQL query performance may be impacted by
# complexity of query predicate, and/or excessive locking.
# Additionally, you may hit the maximum allowable query length for your db.
# Set this to 0 to use the value of ``[core] parallelism``
#
# Variable: AIRFLOW__SCHEDULER__MAX_TIS_PER_QUERY
#
max_tis_per_query = 16

# Should the scheduler issue ``SELECT ... FOR UPDATE`` in relevant queries.
# If this is set to ``False`` then you should not run more than a single
# scheduler at once
#
# Variable: AIRFLOW__SCHEDULER__USE_ROW_LEVEL_LOCKING
#
use_row_level_locking = True

# Max number of DAGs to create DagRuns for per scheduler loop.
#
# Variable: AIRFLOW__SCHEDULER__MAX_DAGRUNS_TO_CREATE_PER_LOOP
#
max_dagruns_to_create_per_loop = 10

# How many DagRuns should a scheduler examine (and lock) when scheduling
# and queuing tasks.
#
# Variable: AIRFLOW__SCHEDULER__MAX_DAGRUNS_PER_LOOP_TO_SCHEDULE
#
max_dagruns_per_loop_to_schedule = 20

# Should the Task supervisor process perform a "mini scheduler" to attempt to schedule more tasks of the
# same DAG. Leaving this on will mean tasks in the same DAG execute quicker, but might starve out other
# dags in some circumstances
#
# Variable: AIRFLOW__SCHEDULER__SCHEDULE_AFTER_TASK_EXECUTION
#
schedule_after_task_execution = True

# The scheduler reads dag files to extract the airflow modules that are going to be used,
# and imports them ahead of time to avoid having to re-do it for each parsing process.
# This flag can be set to ``False`` to disable this behavior in case an airflow module needs
# to be freshly imported each time (at the cost of increased DAG parsing time).
#
# Variable: AIRFLOW__SCHEDULER__PARSING_PRE_IMPORT_MODULES
#
parsing_pre_import_modules = True

# The scheduler can run multiple processes in parallel to parse dags.
# This defines how many processes will run.
#
# Variable: AIRFLOW__SCHEDULER__PARSING_PROCESSES
#
parsing_processes = 2

# One of ``modified_time``, ``random_seeded_by_host`` and ``alphabetical``.
# The scheduler will list and sort the dag files to decide the parsing order.
#
# * ``modified_time``: Sort by modified time of the files. This is useful on large scale to parse the
#   recently modified DAGs first.
# * ``random_seeded_by_host``: Sort randomly across multiple Schedulers but with same order on the
#   same host. This is useful when running with Scheduler in HA mode where each scheduler can
#   parse different DAG files.
# * ``alphabetical``: Sort by filename
#
# Variable: AIRFLOW__SCHEDULER__FILE_PARSING_SORT_MODE
#
file_parsing_sort_mode = modified_time

# Whether the dag processor is running as a standalone process or it is a subprocess of a scheduler
# job.
#
# Variable: AIRFLOW__SCHEDULER__STANDALONE_DAG_PROCESSOR
#
standalone_dag_processor = False

# Only applicable if ``[scheduler] standalone_dag_processor`` is true and callbacks are stored
# in database. Contains maximum number of callbacks that are fetched during a single loop.
#
# Variable: AIRFLOW__SCHEDULER__MAX_CALLBACKS_PER_LOOP
#
max_callbacks_per_loop = 20

# Only applicable if ``[scheduler] standalone_dag_processor`` is true.
# Time in seconds after which dags, which were not updated by Dag Processor are deactivated.
#
# Variable: AIRFLOW__SCHEDULER__DAG_STALE_NOT_SEEN_DURATION
#
dag_stale_not_seen_duration = 600

# Turn off scheduler use of cron intervals by setting this to ``False``.
# DAGs submitted manually in the web UI or with trigger_dag will still run.
#
# Variable: AIRFLOW__SCHEDULER__USE_JOB_SCHEDULE
#
use_job_schedule = True

# Allow externally triggered DagRuns for Execution Dates in the future
# Only has effect if schedule_interval is set to None in DAG
#
# Variable: AIRFLOW__SCHEDULER__ALLOW_TRIGGER_IN_FUTURE
#
allow_trigger_in_future = False

# How often to check for expired trigger requests that have not run yet.
#
# Variable: AIRFLOW__SCHEDULER__TRIGGER_TIMEOUT_CHECK_INTERVAL
#
trigger_timeout_check_interval = 15

# Amount of time a task can be in the queued state before being retried or set to failed.
|
2308 |
+
#
|
2309 |
+
# Variable: AIRFLOW__SCHEDULER__TASK_QUEUED_TIMEOUT
|
2310 |
+
#
|
2311 |
+
task_queued_timeout = 600.0
|
2312 |
+
|
2313 |
+
# How often to check for tasks that have been in the queued state for
|
2314 |
+
# longer than ``[scheduler] task_queued_timeout``.
|
2315 |
+
#
|
2316 |
+
# Variable: AIRFLOW__SCHEDULER__TASK_QUEUED_TIMEOUT_CHECK_INTERVAL
|
2317 |
+
#
|
2318 |
+
task_queued_timeout_check_interval = 120.0
|
2319 |
+
|
2320 |
+
# The run_id pattern used to verify the validity of user input to the run_id parameter when
|
2321 |
+
# triggering a DAG. This pattern cannot change the pattern used by scheduler to generate run_id
|
2322 |
+
# for scheduled DAG runs or DAG runs triggered without changing the run_id parameter.
|
2323 |
+
#
|
2324 |
+
# Variable: AIRFLOW__SCHEDULER__ALLOWED_RUN_ID_PATTERN
|
2325 |
+
#
|
2326 |
+
allowed_run_id_pattern = ^[A-Za-z0-9_.~:+-]+$
|
2327 |
+
|
2328 |
+
# Whether to create DAG runs that span an interval or one single point in time for cron schedules, when
|
2329 |
+
# a cron string is provided to ``schedule`` argument of a DAG.
|
2330 |
+
#
|
2331 |
+
# * ``True``: **CronDataIntervalTimetable** is used, which is suitable
|
2332 |
+
# for DAGs with well-defined data interval. You get contiguous intervals from the end of the previous
|
2333 |
+
# interval up to the scheduled datetime.
|
2334 |
+
# * ``False``: **CronTriggerTimetable** is used, which is closer to the behavior of cron itself.
|
2335 |
+
#
|
2336 |
+
# Notably, for **CronTriggerTimetable**, the logical date is the same as the time the DAG Run will
|
2337 |
+
# try to schedule, while for **CronDataIntervalTimetable**, the logical date is the beginning of
|
2338 |
+
# the data interval, but the DAG Run will try to schedule at the end of the data interval.
|
2339 |
+
#
|
2340 |
+
# Variable: AIRFLOW__SCHEDULER__CREATE_CRON_DATA_INTERVALS
|
2341 |
+
#
|
2342 |
+
create_cron_data_intervals = True
|
2343 |
+
|
2344 |
+
[triggerer]
|
2345 |
+
# How many triggers a single Triggerer will run at once, by default.
|
2346 |
+
#
|
2347 |
+
# Variable: AIRFLOW__TRIGGERER__DEFAULT_CAPACITY
|
2348 |
+
#
|
2349 |
+
default_capacity = 1000
|
2350 |
+
|
2351 |
+
# How often to heartbeat the Triggerer job to ensure it hasn't been killed.
|
2352 |
+
#
|
2353 |
+
# Variable: AIRFLOW__TRIGGERER__JOB_HEARTBEAT_SEC
|
2354 |
+
#
|
2355 |
+
job_heartbeat_sec = 5
|
2356 |
+
|
2357 |
+
# If the last triggerer heartbeat happened more than ``[triggerer] triggerer_health_check_threshold``
|
2358 |
+
# ago (in seconds), triggerer is considered unhealthy.
|
2359 |
+
# This is used by the health check in the **/health** endpoint and in ``airflow jobs check`` CLI
|
2360 |
+
# for TriggererJob.
|
2361 |
+
#
|
2362 |
+
# Variable: AIRFLOW__TRIGGERER__TRIGGERER_HEALTH_CHECK_THRESHOLD
|
2363 |
+
#
|
2364 |
+
triggerer_health_check_threshold = 30
|
2365 |
+
|
2366 |
+
[kerberos]
|
2367 |
+
# Location of your ccache file once kinit has been performed.
|
2368 |
+
#
|
2369 |
+
# Variable: AIRFLOW__KERBEROS__CCACHE
|
2370 |
+
#
|
2371 |
+
ccache = /tmp/airflow_krb5_ccache
|
2372 |
+
|
2373 |
+
# gets augmented with fqdn
|
2374 |
+
#
|
2375 |
+
# Variable: AIRFLOW__KERBEROS__PRINCIPAL
|
2376 |
+
#
|
2377 |
+
principal = airflow
|
2378 |
+
|
2379 |
+
# Determines the frequency at which initialization or re-initialization processes occur.
|
2380 |
+
#
|
2381 |
+
# Variable: AIRFLOW__KERBEROS__REINIT_FREQUENCY
|
2382 |
+
#
|
2383 |
+
reinit_frequency = 3600
|
2384 |
+
|
2385 |
+
# Path to the kinit executable
|
2386 |
+
#
|
2387 |
+
# Variable: AIRFLOW__KERBEROS__KINIT_PATH
|
2388 |
+
#
|
2389 |
+
kinit_path = kinit
|
2390 |
+
|
2391 |
+
# Designates the path to the Kerberos keytab file for the Airflow user
|
2392 |
+
#
|
2393 |
+
# Variable: AIRFLOW__KERBEROS__KEYTAB
|
2394 |
+
#
|
2395 |
+
keytab = airflow.keytab
|
2396 |
+
|
2397 |
+
# Allow to disable ticket forwardability.
|
2398 |
+
#
|
2399 |
+
# Variable: AIRFLOW__KERBEROS__FORWARDABLE
|
2400 |
+
#
|
2401 |
+
forwardable = True
|
2402 |
+
|
2403 |
+
# Allow to remove source IP from token, useful when using token behind NATted Docker host.
|
2404 |
+
#
|
2405 |
+
# Variable: AIRFLOW__KERBEROS__INCLUDE_IP
|
2406 |
+
#
|
2407 |
+
include_ip = True
|
2408 |
+
|
2409 |
+
[sensors]
|
2410 |
+
# Sensor default timeout, 7 days by default (7 * 24 * 60 * 60).
|
2411 |
+
#
|
2412 |
+
# Variable: AIRFLOW__SENSORS__DEFAULT_TIMEOUT
|
2413 |
+
#
|
2414 |
+
default_timeout = 604800
|
2415 |
+
|
2416 |
+
[usage_data_collection]
|
2417 |
+
# Airflow integrates `Scarf <https://about.scarf.sh/>`__ to collect basic platform and usage data
|
2418 |
+
# during operation. This data assists Airflow maintainers in better understanding how Airflow is used.
|
2419 |
+
# Insights gained from this telemetry are critical for prioritizing patches, minor releases, and
|
2420 |
+
# security fixes. Additionally, this information supports key decisions related to the development road map.
|
2421 |
+
# Check the FAQ doc for more information on what data is collected.
|
2422 |
+
#
|
2423 |
+
# Deployments can opt-out of analytics by setting the ``enabled`` option
|
2424 |
+
# to ``False``, or the ``SCARF_ANALYTICS=false`` environment variable.
|
2425 |
+
# Individual users can easily opt-out of analytics in various ways documented in the
|
2426 |
+
# `Scarf Do Not Track docs <https://docs.scarf.sh/gateway/#do-not-track>`__.
|
2427 |
+
|
2428 |
+
# Enable or disable usage data collection and sending.
|
2429 |
+
#
|
2430 |
+
# Variable: AIRFLOW__USAGE_DATA_COLLECTION__ENABLED
|
2431 |
+
#
|
2432 |
+
enabled = True
|
2433 |
+
|
2434 |
+
[celery_kubernetes_executor]
|
2435 |
+
# This section only applies if you are using the ``CeleryKubernetesExecutor`` in
|
2436 |
+
# ``[core]`` section above
|
2437 |
+
|
2438 |
+
# Define when to send a task to ``KubernetesExecutor`` when using ``CeleryKubernetesExecutor``.
|
2439 |
+
# When the queue of a task is the value of ``kubernetes_queue`` (default ``kubernetes``),
|
2440 |
+
# the task is executed via ``KubernetesExecutor``,
|
2441 |
+
# otherwise via ``CeleryExecutor``
|
2442 |
+
#
|
2443 |
+
# Variable: AIRFLOW__CELERY_KUBERNETES_EXECUTOR__KUBERNETES_QUEUE
|
2444 |
+
#
|
2445 |
+
kubernetes_queue = kubernetes
|
2446 |
+
|
2447 |
+
[celery]
|
2448 |
+
# This section only applies if you are using the CeleryExecutor in
|
2449 |
+
# ``[core]`` section above
|
2450 |
+
|
2451 |
+
# The app name that will be used by celery
|
2452 |
+
#
|
2453 |
+
# Variable: AIRFLOW__CELERY__CELERY_APP_NAME
|
2454 |
+
#
|
2455 |
+
celery_app_name = airflow.providers.celery.executors.celery_executor
|
2456 |
+
|
2457 |
+
# The concurrency that will be used when starting workers with the
|
2458 |
+
# ``airflow celery worker`` command. This defines the number of task instances that
|
2459 |
+
# a worker will take, so size up your workers based on the resources on
|
2460 |
+
# your worker box and the nature of your tasks
|
2461 |
+
#
|
2462 |
+
# Variable: AIRFLOW__CELERY__WORKER_CONCURRENCY
|
2463 |
+
#
|
2464 |
+
worker_concurrency = 16
|
2465 |
+
|
2466 |
+
# The maximum and minimum number of pool processes that will be used to dynamically resize
|
2467 |
+
# the pool based on load.Enable autoscaling by providing max_concurrency,min_concurrency
|
2468 |
+
# with the ``airflow celery worker`` command (always keep minimum processes,
|
2469 |
+
# but grow to maximum if necessary).
|
2470 |
+
# Pick these numbers based on resources on worker box and the nature of the task.
|
2471 |
+
# If autoscale option is available, worker_concurrency will be ignored.
|
2472 |
+
# https://docs.celeryq.dev/en/latest/reference/celery.bin.worker.html#cmdoption-celery-worker-autoscale
|
2473 |
+
#
|
2474 |
+
# Example: worker_autoscale = 16,12
|
2475 |
+
#
|
2476 |
+
# Variable: AIRFLOW__CELERY__WORKER_AUTOSCALE
|
2477 |
+
#
|
2478 |
+
# worker_autoscale =
|
2479 |
+
|
2480 |
+
# Used to increase the number of tasks that a worker prefetches which can improve performance.
|
2481 |
+
# The number of processes multiplied by worker_prefetch_multiplier is the number of tasks
|
2482 |
+
# that are prefetched by a worker. A value greater than 1 can result in tasks being unnecessarily
|
2483 |
+
# blocked if there are multiple workers and one worker prefetches tasks that sit behind long
|
2484 |
+
# running tasks while another worker has unutilized processes that are unable to process the already
|
2485 |
+
# claimed blocked tasks.
|
2486 |
+
# https://docs.celeryq.dev/en/stable/userguide/optimizing.html#prefetch-limits
|
2487 |
+
#
|
2488 |
+
# Variable: AIRFLOW__CELERY__WORKER_PREFETCH_MULTIPLIER
|
2489 |
+
#
|
2490 |
+
worker_prefetch_multiplier = 1
|
2491 |
+
|
2492 |
+
# Specify if remote control of the workers is enabled.
|
2493 |
+
# In some cases when the broker does not support remote control, Celery creates lots of
|
2494 |
+
# ``.*reply-celery-pidbox`` queues. You can prevent this by setting this to false.
|
2495 |
+
# However, with this disabled Flower won't work.
|
2496 |
+
# https://docs.celeryq.dev/en/stable/getting-started/backends-and-brokers/index.html#broker-overview
|
2497 |
+
#
|
2498 |
+
# Variable: AIRFLOW__CELERY__WORKER_ENABLE_REMOTE_CONTROL
|
2499 |
+
#
|
2500 |
+
worker_enable_remote_control = true
|
2501 |
+
|
2502 |
+
# The Celery broker URL. Celery supports RabbitMQ, Redis and experimentally
|
2503 |
+
# a sqlalchemy database. Refer to the Celery documentation for more information.
|
2504 |
+
#
|
2505 |
+
# Variable: AIRFLOW__CELERY__BROKER_URL
|
2506 |
+
#
|
2507 |
+
broker_url = ${AIRFLOW__CELERY__BROKER_URL}
|
2508 |
+
|
2509 |
+
# The Celery result_backend. When a job finishes, it needs to update the
|
2510 |
+
# metadata of the job. Therefore it will post a message on a message bus,
|
2511 |
+
# or insert it into a database (depending of the backend)
|
2512 |
+
# This status is used by the scheduler to update the state of the task
|
2513 |
+
# The use of a database is highly recommended
|
2514 |
+
# When not specified, sql_alchemy_conn with a db+ scheme prefix will be used
|
2515 |
+
# https://docs.celeryq.dev/en/latest/userguide/configuration.html#task-result-backend-settings
|
2516 |
+
#
|
2517 |
+
# Example: result_backend = db+postgresql://postgres:airflow@postgres/airflow
|
2518 |
+
#
|
2519 |
+
# Variable: AIRFLOW__CELERY__RESULT_BACKEND
|
2520 |
+
#
|
2521 |
+
result_backend = ${AIRFLOW__DATABASE__SQL_ALCHEMY_CONN}
|
2522 |
+
|
2523 |
+
# Optional configuration dictionary to pass to the Celery result backend SQLAlchemy engine.
|
2524 |
+
#
|
2525 |
+
# Example: result_backend_sqlalchemy_engine_options = {"pool_recycle": 1800}
|
2526 |
+
#
|
2527 |
+
# Variable: AIRFLOW__CELERY__RESULT_BACKEND_SQLALCHEMY_ENGINE_OPTIONS
|
2528 |
+
#
|
2529 |
+
result_backend_sqlalchemy_engine_options =
|
2530 |
+
|
2531 |
+
# Celery Flower is a sweet UI for Celery. Airflow has a shortcut to start
|
2532 |
+
# it ``airflow celery flower``. This defines the IP that Celery Flower runs on
|
2533 |
+
#
|
2534 |
+
# Variable: AIRFLOW__CELERY__FLOWER_HOST
|
2535 |
+
#
|
2536 |
+
flower_host = 0.0.0.0
|
2537 |
+
|
2538 |
+
# The root URL for Flower
|
2539 |
+
#
|
2540 |
+
# Example: flower_url_prefix = /flower
|
2541 |
+
#
|
2542 |
+
# Variable: AIRFLOW__CELERY__FLOWER_URL_PREFIX
|
2543 |
+
#
|
2544 |
+
flower_url_prefix =
|
2545 |
+
|
2546 |
+
# This defines the port that Celery Flower runs on
|
2547 |
+
#
|
2548 |
+
# Variable: AIRFLOW__CELERY__FLOWER_PORT
|
2549 |
+
#
|
2550 |
+
flower_port = 5555
|
2551 |
+
|
2552 |
+
# Securing Flower with Basic Authentication
|
2553 |
+
# Accepts user:password pairs separated by a comma
|
2554 |
+
#
|
2555 |
+
# Example: flower_basic_auth = user1:password1,user2:password2
|
2556 |
+
#
|
2557 |
+
# Variable: AIRFLOW__CELERY__FLOWER_BASIC_AUTH
|
2558 |
+
#
|
2559 |
+
flower_basic_auth =
|
2560 |
+
|
2561 |
+
# How many processes CeleryExecutor uses to sync task state.
|
2562 |
+
# 0 means to use max(1, number of cores - 1) processes.
|
2563 |
+
#
|
2564 |
+
# Variable: AIRFLOW__CELERY__SYNC_PARALLELISM
|
2565 |
+
#
|
2566 |
+
sync_parallelism = 0
|
2567 |
+
|
2568 |
+
# Import path for celery configuration options
|
2569 |
+
#
|
2570 |
+
# Variable: AIRFLOW__CELERY__CELERY_CONFIG_OPTIONS
|
2571 |
+
#
|
2572 |
+
celery_config_options = airflow.providers.celery.executors.default_celery.DEFAULT_CELERY_CONFIG
|
2573 |
+
|
2574 |
+
#
|
2575 |
+
# Variable: AIRFLOW__CELERY__SSL_ACTIVE
|
2576 |
+
#
|
2577 |
+
ssl_active = False
|
2578 |
+
|
2579 |
+
# Path to the client key.
|
2580 |
+
#
|
2581 |
+
# Variable: AIRFLOW__CELERY__SSL_KEY
|
2582 |
+
#
|
2583 |
+
ssl_key =
|
2584 |
+
|
2585 |
+
# Path to the client certificate.
|
2586 |
+
#
|
2587 |
+
# Variable: AIRFLOW__CELERY__SSL_CERT
|
2588 |
+
#
|
2589 |
+
ssl_cert =
|
2590 |
+
|
2591 |
+
# Path to the CA certificate.
|
2592 |
+
#
|
2593 |
+
# Variable: AIRFLOW__CELERY__SSL_CACERT
|
2594 |
+
#
|
2595 |
+
ssl_cacert =
|
2596 |
+
|
2597 |
+
# Celery Pool implementation.
|
2598 |
+
# Choices include: ``prefork`` (default), ``eventlet``, ``gevent`` or ``solo``.
|
2599 |
+
# See:
|
2600 |
+
# https://docs.celeryq.dev/en/latest/userguide/workers.html#concurrency
|
2601 |
+
# https://docs.celeryq.dev/en/latest/userguide/concurrency/eventlet.html
|
2602 |
+
#
|
2603 |
+
# Variable: AIRFLOW__CELERY__POOL
|
2604 |
+
#
|
2605 |
+
pool = prefork
|
2606 |
+
|
2607 |
+
# The number of seconds to wait before timing out ``send_task_to_executor`` or
|
2608 |
+
# ``fetch_celery_task_state`` operations.
|
2609 |
+
#
|
2610 |
+
# Variable: AIRFLOW__CELERY__OPERATION_TIMEOUT
|
2611 |
+
#
|
2612 |
+
operation_timeout = 1.0
|
2613 |
+
|
2614 |
+
task_acks_late = True
|
2615 |
+
# Celery task will report its status as 'started' when the task is executed by a worker.
|
2616 |
+
# This is used in Airflow to keep track of the running tasks and if a Scheduler is restarted
|
2617 |
+
# or run in HA mode, it can adopt the orphan tasks launched by previous SchedulerJob.
|
2618 |
+
#
|
2619 |
+
# Variable: AIRFLOW__CELERY__TASK_TRACK_STARTED
|
2620 |
+
#
|
2621 |
+
task_track_started = True
|
2622 |
+
|
2623 |
+
# The Maximum number of retries for publishing task messages to the broker when failing
|
2624 |
+
# due to ``AirflowTaskTimeout`` error before giving up and marking Task as failed.
|
2625 |
+
#
|
2626 |
+
# Variable: AIRFLOW__CELERY__TASK_PUBLISH_MAX_RETRIES
|
2627 |
+
#
|
2628 |
+
task_publish_max_retries = 3
|
2629 |
+
|
2630 |
+
# Worker initialisation check to validate Metadata Database connection
|
2631 |
+
#
|
2632 |
+
# Variable: AIRFLOW__CELERY__WORKER_PRECHECK
|
2633 |
+
#
|
2634 |
+
worker_precheck = False
|
2635 |
+
|
2636 |
+
[celery_broker_transport_options]
|
2637 |
+
# This section is for specifying options which can be passed to the
|
2638 |
+
# underlying celery broker transport. See:
|
2639 |
+
# https://docs.celeryq.dev/en/latest/userguide/configuration.html#std:setting-broker_transport_options
|
2640 |
+
|
2641 |
+
# The visibility timeout defines the number of seconds to wait for the worker
|
2642 |
+
# to acknowledge the task before the message is redelivered to another worker.
|
2643 |
+
# Make sure to increase the visibility timeout to match the time of the longest
|
2644 |
+
# ETA you're planning to use.
|
2645 |
+
# visibility_timeout is only supported for Redis and SQS celery brokers.
|
2646 |
+
# See:
|
2647 |
+
# https://docs.celeryq.dev/en/stable/getting-started/backends-and-brokers/redis.html#visibility-timeout
|
2648 |
+
#
|
2649 |
+
# Example: visibility_timeout = 21600
|
2650 |
+
#
|
2651 |
+
# Variable: AIRFLOW__CELERY_BROKER_TRANSPORT_OPTIONS__VISIBILITY_TIMEOUT
|
2652 |
+
#
|
2653 |
+
# visibility_timeout =
|
2654 |
+
|
2655 |
+
# The sentinel_kwargs parameter allows passing additional options to the Sentinel client.
|
2656 |
+
# In a typical scenario where Redis Sentinel is used as the broker and Redis servers are
|
2657 |
+
# password-protected, the password needs to be passed through this parameter. Although its
|
2658 |
+
# type is string, it is required to pass a string that conforms to the dictionary format.
|
2659 |
+
# See:
|
2660 |
+
# https://docs.celeryq.dev/en/stable/getting-started/backends-and-brokers/redis.html#configuration
|
2661 |
+
#
|
2662 |
+
# Example: sentinel_kwargs = {"password": "password_for_redis_server"}
|
2663 |
+
#
|
2664 |
+
# Variable: AIRFLOW__CELERY_BROKER_TRANSPORT_OPTIONS__SENTINEL_KWARGS
|
2665 |
+
#
|
2666 |
+
# sentinel_kwargs =
|
2667 |
+
|
2668 |
+
[common.io]
|
2669 |
+
# Common IO configuration section
|
2670 |
+
|
2671 |
+
# Path to a location on object storage where XComs can be stored in url format.
|
2672 |
+
#
|
2673 |
+
# Example: xcom_objectstorage_path = s3://conn_id@bucket/path
|
2674 |
+
#
|
2675 |
+
# Variable: AIRFLOW__COMMON.IO__XCOM_OBJECTSTORAGE_PATH
|
2676 |
+
#
|
2677 |
+
xcom_objectstorage_path =
|
2678 |
+
|
2679 |
+
# Threshold in bytes for storing XComs in object storage. -1 means always store in the
|
2680 |
+
# database. 0 means always store in object storage. Any positive number means
|
2681 |
+
# it will be stored in object storage if the size of the value is greater than the threshold.
|
2682 |
+
#
|
2683 |
+
# Example: xcom_objectstorage_threshold = 1000000
|
2684 |
+
#
|
2685 |
+
# Variable: AIRFLOW__COMMON.IO__XCOM_OBJECTSTORAGE_THRESHOLD
|
2686 |
+
#
|
2687 |
+
xcom_objectstorage_threshold = -1
|
2688 |
+
|
2689 |
+
# Compression algorithm to use when storing XComs in object storage. Supported algorithms
|
2690 |
+
# are a.o.: snappy, zip, gzip, bz2, and lzma. If not specified, no compression will be used.
|
2691 |
+
# Note that the compression algorithm must be available in the Python installation (e.g.
|
2692 |
+
# python-snappy for snappy). Zip, gz, bz2 are available by default.
|
2693 |
+
#
|
2694 |
+
# Example: xcom_objectstorage_compression = gz
|
2695 |
+
#
|
2696 |
+
# Variable: AIRFLOW__COMMON.IO__XCOM_OBJECTSTORAGE_COMPRESSION
|
2697 |
+
#
|
2698 |
+
xcom_objectstorage_compression =
|
2699 |
+
|
2700 |
+
[fab]
|
2701 |
+
# This section contains configs specific to FAB provider.
|
2702 |
+
|
2703 |
+
# Boolean for enabling rate limiting on authentication endpoints.
|
2704 |
+
#
|
2705 |
+
# Variable: AIRFLOW__FAB__AUTH_RATE_LIMITED
|
2706 |
+
#
|
2707 |
+
auth_rate_limited = True
|
2708 |
+
|
2709 |
+
# Rate limit for authentication endpoints.
|
2710 |
+
#
|
2711 |
+
# Variable: AIRFLOW__FAB__AUTH_RATE_LIMIT
|
2712 |
+
#
|
2713 |
+
auth_rate_limit = 5 per 40 second
|
2714 |
+
|
2715 |
+
# Update FAB permissions and sync security manager roles
|
2716 |
+
# on webserver startup
|
2717 |
+
#
|
2718 |
+
# Variable: AIRFLOW__FAB__UPDATE_FAB_PERMS
|
2719 |
+
#
|
2720 |
+
update_fab_perms = True
|
2721 |
+
|
2722 |
+
[imap]
|
2723 |
+
# Options for IMAP provider.
|
2724 |
+
|
2725 |
+
# ssl_context =
|
2726 |
+
|
2727 |
+
[smtp_provider]
|
2728 |
+
# Options for SMTP provider.
|
2729 |
+
|
2730 |
+
# ssl context to use when using SMTP and IMAP SSL connections. By default, the context is "default"
|
2731 |
+
# which sets it to ``ssl.create_default_context()`` which provides the right balance between
|
2732 |
+
# compatibility and security, it however requires that certificates in your operating system are
|
2733 |
+
# updated and that SMTP/IMAP servers of yours have valid certificates that have corresponding public
|
2734 |
+
# keys installed on your machines. You can switch it to "none" if you want to disable checking
|
2735 |
+
# of the certificates, but it is not recommended as it allows MITM (man-in-the-middle) attacks
|
2736 |
+
# if your infrastructure is not sufficiently secured. It should only be set temporarily while you
|
2737 |
+
# are fixing your certificate configuration. This can be typically done by upgrading to newer
|
2738 |
+
# version of the operating system you run Airflow components on,by upgrading/refreshing proper
|
2739 |
+
# certificates in the OS or by updating certificates for your mail servers.
|
2740 |
+
#
|
2741 |
+
# If you do not set this option explicitly, it will use Airflow "email.ssl_context" configuration,
|
2742 |
+
# but if this configuration is not present, it will use "default" value.
|
2743 |
+
#
|
2744 |
+
# Example: ssl_context = default
|
2745 |
+
#
|
2746 |
+
# Variable: AIRFLOW__SMTP_PROVIDER__SSL_CONTEXT
|
2747 |
+
#
|
2748 |
+
# ssl_context =
|
2749 |
+
|
2750 |
+
# Allows overriding of the standard templated email subject line when the SmtpNotifier is used.
|
2751 |
+
# Must provide a path to the template.
|
2752 |
+
#
|
2753 |
+
# Example: templated_email_subject_path = path/to/override/email_subject.html
|
2754 |
+
#
|
2755 |
+
# Variable: AIRFLOW__SMTP_PROVIDER__TEMPLATED_EMAIL_SUBJECT_PATH
|
2756 |
+
#
|
2757 |
+
# templated_email_subject_path =
|
2758 |
+
|
2759 |
+
# Allows overriding of the standard templated email path when the SmtpNotifier is used. Must provide
|
2760 |
+
# a path to the template.
|
2761 |
+
#
|
2762 |
+
# Example: templated_html_content_path = path/to/override/email.html
|
2763 |
+
#
|
2764 |
+
# Variable: AIRFLOW__SMTP_PROVIDER__TEMPLATED_HTML_CONTENT_PATH
|
2765 |
+
#
|
2766 |
+
# templated_html_content_path =
|
2767 |
+
|
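
Every option above can also be overridden at runtime through an environment variable named AIRFLOW__<SECTION>__<KEY>, as the "Variable:" comments indicate. A minimal sketch of that mechanism, assuming the variables are exported before Airflow's configuration module is imported (the two option names shown are taken from the config above):

import os
# Environment variables take precedence over airflow.cfg (AIRFLOW__<SECTION>__<KEY>).
os.environ["AIRFLOW__SCHEDULER__CATCHUP_BY_DEFAULT"] = "False"
os.environ["AIRFLOW__CELERY__WORKER_CONCURRENCY"] = "8"

from airflow.configuration import conf  # reads airflow.cfg plus env overrides

print(conf.getboolean("scheduler", "catchup_by_default"))  # False
print(conf.getint("celery", "worker_concurrency"))         # 8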
dags/CrawDag.py
ADDED
@@ -0,0 +1,49 @@
from datetime import datetime
from airflow import DAG
from airflow.operators.python import PythonOperator
from CrawDag.crawling import CrawlingTask
from CrawDag.scraping import ScrapingTask
from CrawDag.saving import SavingTask
from CrawDag.sending import SendingTask
import pytz


with DAG(
    dag_id='CrawDag',
    description='Crawling news from multiple sources',
    # start_date = datetime(2025, 1, 1, tzinfo=pytz.timezone('Asia/Ho_Chi_Minh')),
    schedule_interval='@monthly',
    start_date=datetime.now(tz=pytz.timezone('Asia/Ho_Chi_Minh')),
    # schedule_interval = '*/30 * * * *'
) as dag:
    crawl_task = PythonOperator(
        task_id='crawl_task',
        python_callable=CrawlingTask('crawl_task').execute,
        provide_context=True,
    )

    scrape_task = PythonOperator(
        task_id='scrape_task',
        python_callable=ScrapingTask('scrape_task').execute,
        provide_context=True,
    )

    save_task = PythonOperator(
        task_id='save_task',
        python_callable=SavingTask('save_task').execute,
        provide_context=True,
    )

    sent_task = PythonOperator(
        task_id='sent_task',
        python_callable=SendingTask('sent_task').execute,
        provide_context=True,
    )

    # Crawl -> scrape -> persist -> notify the downstream API, in that order.
    crawl_task >> scrape_task >> save_task >> sent_task
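
A minimal local smoke test for the task callables above, outside any Airflow scheduler. The FakeTaskInstance class below is hypothetical, not part of the repository: it only implements the xcom_push/xcom_pull calls the tasks actually use, so the crawl and scrape steps can be exercised from a plain Python shell.

from CrawDag.crawling import CrawlingTask
from CrawDag.scraping import ScrapingTask

class FakeTaskInstance:
    """Stand-in for Airflow's `ti` object, keyed only by XCom key."""
    def __init__(self):
        self.store = {}
    def xcom_push(self, key, value):
        self.store[key] = value
    def xcom_pull(self, task_ids, key):
        return self.store.get(key)

if __name__ == "__main__":
    ti = FakeTaskInstance()
    CrawlingTask('crawl_task').execute(ti=ti)   # fills the 'crawl_news' slot
    ScrapingTask('scrape_task').execute(ti=ti)  # reads it, writes 'scrape_news'
    print(len(ti.store.get('scrape_news', [])), 'articles scraped')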
dags/CrawDag/__init__.py
ADDED
File without changes
dags/CrawDag/crawling/Crawler.py
ADDED
@@ -0,0 +1,9 @@
from abc import ABC, abstractmethod
from CrawDag.models import News


class Crawler(ABC):
    """Base class for source crawlers: each source maps topic names to feed URLs."""

    def __init__(self, topics: dict[str, str]) -> None:
        self.topics = topics

    @abstractmethod
    def crawl(self) -> list[News]:
        pass
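
Adding a new source only requires another Crawler subclass that returns News objects. A sketch following the same requests + BeautifulSoup pattern as the existing crawlers; the class name is hypothetical and the publication timestamp is left as a placeholder that a real crawler would parse from the feed's pubDate:

from CrawDag.crawling.Crawler import Crawler
from CrawDag.models import News
from datetime import datetime, timezone
from bs4 import BeautifulSoup
import requests


class ExampleRssCrawler(Crawler):
    """Hypothetical extra source; self.topics maps topic name -> RSS URL."""

    def crawl(self) -> list[News]:
        news = []
        for topic, url in self.topics.items():
            response = requests.get(url, timeout=30)
            soup = BeautifulSoup(response.content, 'xml')
            for item in soup.find_all('item'):
                news.append(News(
                    topic=topic,
                    title=item.find('title').text.strip(),
                    link=item.find('link').text,
                    date=datetime.now(timezone.utc),  # placeholder: parse pubDate in a real crawler
                ))
        return news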
dags/CrawDag/crawling/CrawlingTask.py
ADDED
@@ -0,0 +1,46 @@
from CrawDag.models import TaskHandle, DataExchange, News
from .RssCrawler import ThanhNienCrawler, VnexpressCrawler
from .Crawler import Crawler


class CrawlingTask(TaskHandle):
    task_ids = None
    key = 'crawl_news'

    def __init__(self, task_ids: str) -> None:
        super().__init__()
        CrawlingTask.task_ids = task_ids
        self.sources = [
            {
                'source': 'vnexpress',
                'type': 'rss',
                'topic': {
                    'economic': 'https://vnexpress.net/rss/kinh-doanh.rss',
                    'health': 'https://vnexpress.net/rss/suc-khoe.rss',
                    'sport': 'https://vnexpress.net/rss/the-thao.rss',
                    'politic': 'https://vnexpress.net/rss/the-gioi.rss'
                },
            },
            {
                'source': 'thanhnien',
                'type': 'rss',
                'topic': {
                    'economic': 'https://thanhnien.vn/rss/kinh-te.rss',
                    'health': 'https://thanhnien.vn/rss/suc-khoe.rss',
                    'sport': 'https://thanhnien.vn/rss/the-thao.rss',
                    'politic': 'https://thanhnien.vn/rss/chinh-tri.rss'
                },
            }
        ]

    def execute(self, **context: any):
        news: list[News] = []
        for source in self.sources:
            if source['source'] == 'vnexpress':
                crawler: Crawler = VnexpressCrawler(source['topic'])
            elif source['source'] == 'thanhnien':
                crawler: Crawler = ThanhNienCrawler(source['topic'])
            else:
                continue  # skip sources without a matching crawler
            news.extend(crawler.crawl())
        news = news[:40]  # cap the batch pushed over XCom
        dataExchange = DataExchange(context['ti'])
        dataExchange.push(CrawlingTask.key, [new.to_json() for new in news])
dags/CrawDag/crawling/RssCrawler/ThanhNienCrawler.py
ADDED
@@ -0,0 +1,35 @@
from CrawDag.crawling.Crawler import Crawler
from CrawDag.models import News
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
import requests
import pytz
import html


class ThanhNienCrawler(Crawler):
    def __init__(self, topics: dict[str, str]) -> None:
        super().__init__(topics)

    def crawl(self) -> list[News]:
        news = []
        for topic in self.topics:
            response = requests.get(self.topics[topic], verify=False)
            soup = BeautifulSoup(response.content, 'xml')
            # Only keep items published within the last 24 hours.
            time = datetime.now(pytz.timezone('Asia/Ho_Chi_Minh')) - timedelta(hours=24)
            for item in soup.find_all('item'):
                link = item.find('link').text
                pub_date_text = item.find('pubDate').text
                date = datetime.strptime(pub_date_text, '%a, %d %b %y %H:%M:%S %z')
                title = item.find('title').text.strip()
                title = html.unescape(title)
                # The description wraps a thumbnail <img> tag; strip the CDATA markers first.
                description = item.find('description').text
                description = description[9:-3]
                description_soup = BeautifulSoup(description, 'html.parser')
                img_tag = description_soup.find('img')
                image = img_tag['src'] if img_tag else None
                if date >= time:
                    news.append(News(topic=topic, title=title, link=link, date=date, image=image))
        return news
dags/CrawDag/crawling/RssCrawler/VnexpressCrawler.py
ADDED
@@ -0,0 +1,35 @@
from CrawDag.crawling.Crawler import Crawler
from CrawDag.models import News
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
import requests
import pytz
import html


class VnexpressCrawler(Crawler):
    def __init__(self, topics: dict[str, str]) -> None:
        super().__init__(topics)

    def crawl(self) -> list[News]:
        news = []
        for topic in self.topics:
            response = requests.get(self.topics[topic], verify=False)
            soup = BeautifulSoup(response.content, 'xml')
            # Only keep items published within the last 24 hours.
            time = datetime.now(pytz.timezone('Asia/Ho_Chi_Minh')) - timedelta(hours=24)
            for item in soup.find_all('item'):
                link = item.find('link').text
                pub_date_text = item.find('pubDate').text
                date = datetime.strptime(pub_date_text, '%a, %d %b %Y %H:%M:%S %z')
                title = item.find('title').text.strip()
                title = html.unescape(title)
                description = item.find('description').text
                description_soup = BeautifulSoup(description, 'html.parser')
                img_tag = description_soup.find('img')
                image = img_tag['src'] if img_tag else None
                if date >= time:
                    news.append(News(topic=topic, title=title, link=link, date=date, image=image))
        return news
dags/CrawDag/crawling/RssCrawler/__init__.py
ADDED
@@ -0,0 +1,2 @@
from .ThanhNienCrawler import ThanhNienCrawler
from .VnexpressCrawler import VnexpressCrawler
dags/CrawDag/crawling/__init__.py
ADDED
@@ -0,0 +1 @@
from .CrawlingTask import CrawlingTask
dags/CrawDag/models/DataExchange.py
ADDED
@@ -0,0 +1,11 @@
from airflow.models import TaskInstance


class DataExchange:
    """Thin wrapper around XCom so tasks exchange data without touching the TaskInstance API directly."""

    def __init__(self, task_instance: TaskInstance):
        self.task_instance = task_instance

    def push(self, key: str, value: any):
        self.task_instance.xcom_push(key=key, value=value)

    def pull(self, task_ids: str, key: str) -> any:
        return self.task_instance.xcom_pull(task_ids=task_ids, key=key)
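
A minimal sketch of how DataExchange is used from inside a PythonOperator callable; `context['ti']` is the TaskInstance Airflow injects at runtime, and the task id and key shown here are placeholders:

from CrawDag.models import DataExchange

def producer(**context):
    # Serialize to plain dicts/lists so the payload is XCom-friendly.
    DataExchange(context['ti']).push('crawl_news', [{'title': 'example'}])

def consumer(**context):
    payload = DataExchange(context['ti']).pull(task_ids='crawl_task', key='crawl_news')
    print(payload)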
dags/CrawDag/models/TaskHandle.py
ADDED
@@ -0,0 +1,5 @@
from abc import ABC, abstractmethod


class TaskHandle(ABC):
    @abstractmethod
    def execute(self, **context: any):
        pass
dags/CrawDag/models/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .news import News
from .DataExchange import DataExchange
from .TaskHandle import TaskHandle
dags/CrawDag/models/news.py
ADDED
@@ -0,0 +1,42 @@
from datetime import datetime


class News:
    def __init__(self,
                 topic: str = '',
                 title: str = '',
                 content: str = '',
                 link: str = '',
                 date: datetime = None,
                 image: str = '',
                 html: str = ''):
        self.topic = topic
        self.title = title
        self.content = content
        self.link = link
        self.date = date
        self.image = image
        self.html = html

    def __str__(self):
        return f"Topic: {self.topic}, Title: {self.title}, Link: {self.link}, Date: {self.date}"

    def to_json(self):
        return {
            'topic': self.topic,
            'title': self.title,
            'content': self.content,
            'link': self.link,
            'date': self.date.isoformat(),
            'image': self.image,
            'html': self.html,
        }

    def __eq__(self, value: object) -> bool:
        if not isinstance(value, News):
            return False
        return self.topic == value.topic and self.title == value.title and self.content == value.content

    @classmethod
    def from_json(cls, data):
        """Convert JSON data back to a News object."""
        data['date'] = datetime.fromisoformat(data['date'])  # convert string back to datetime
        return cls(**data)
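
The to_json/from_json pair is what lets News objects travel over XCom between tasks. A short round-trip example (the title and link are placeholders):

from datetime import datetime, timezone
from CrawDag.models import News

original = News(topic='sport', title='Example', link='https://example.com/a',
                date=datetime.now(timezone.utc))
payload = original.to_json()        # dict with an ISO-8601 date, XCom-serialisable
restored = News.from_json(payload)  # date string parsed back into a datetime
assert restored == original         # __eq__ compares topic, title and content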
dags/CrawDag/saving/DataLake.py
ADDED
@@ -0,0 +1,10 @@
from abc import ABC, abstractmethod
from CrawDag.models import News


class DataLake(ABC):
    @abstractmethod
    def save(self, listNews: list[News]) -> list[str]:
        pass

    @abstractmethod
    def delete(self, listNewsId: list[str]) -> None:
        pass
dags/CrawDag/saving/SavingMethod/MongoDataLake.py
ADDED
@@ -0,0 +1,45 @@
from CrawDag.saving.DataLake import DataLake
from CrawDag.models import News
from pymongo import MongoClient
from bson.objectid import ObjectId
import os
from pymongo.server_api import ServerApi
from dotenv import load_dotenv

load_dotenv()  # credentials come from the environment, never from the source


class MongoDataLake(DataLake):
    def __init__(self) -> None:
        self.database = self.__connect()

    def __connect(self):
        uri = (
            "mongodb+srv://{}:{}@{}/?retryWrites=true&w=majority&appName=Cluster0".format(
                os.getenv("MONGO_INITDB_ROOT_USERNAME"), os.getenv("MONGO_INITDB_ROOT_PASSWORD"),
                os.getenv("MONGO_HOST"),
            )
        )

        client = MongoClient(uri, server_api=ServerApi('1'))
        database = client.get_database(os.getenv("MONGO_DATABASE"))
        return database

    def save(self, listNews: list[News]) -> list[str]:
        newsCollection = self.database.get_collection('news')
        newsListIds = []
        for new in listNews:
            existing = newsCollection.find_one({'topic': new.topic, 'title': new.title})
            if existing:
                if new.content != existing['content']:
                    newsCollection.update_one({'_id': existing['_id']}, {'$set': new.to_json()})
                newsListIds.append(str(existing['_id']))
            else:
                result = newsCollection.insert_one(new.to_json())
                newsListIds.append(str(result.inserted_id))

        return newsListIds

    def delete(self, listNewsId: list[str]) -> None:
        newsCollection = self.database.get_collection('news')
        for newsId in listNewsId:
            newsCollection.delete_one({'_id': ObjectId(newsId)})
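
A minimal usage sketch, assuming MONGO_INITDB_ROOT_USERNAME, MONGO_INITDB_ROOT_PASSWORD, MONGO_HOST and MONGO_DATABASE are present in the environment (for example loaded from a .env file); the News fields are placeholders:

from datetime import datetime, timezone
from CrawDag.models import News
from CrawDag.saving.SavingMethod import MongoDataLake

lake = MongoDataLake()
ids = lake.save([News(topic='health', title='Example', content='body',
                      link='https://example.com/a', date=datetime.now(timezone.utc))])
print(ids)        # list of inserted/updated ObjectId strings
lake.delete(ids)  # clean up the example document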
dags/CrawDag/saving/SavingMethod/__init__.py
ADDED
@@ -0,0 +1 @@
from .MongoDataLake import MongoDataLake
dags/CrawDag/saving/SavingTask.py
ADDED
@@ -0,0 +1,20 @@
from CrawDag.models import TaskHandle, DataExchange, News
from CrawDag.scraping import ScrapingTask
from CrawDag.saving.SavingMethod import MongoDataLake
from .DataLake import DataLake


class SavingTask(TaskHandle):
    task_ids = None
    key = 'scrape_news'

    def __init__(self, task_ids: str) -> None:
        super().__init__()
        SavingTask.task_ids = task_ids
        self.dataLake: DataLake = MongoDataLake()

    def execute(self, **context: any):
        dataExchange = DataExchange(context['ti'])
        listNewsJson = dataExchange.pull(ScrapingTask.task_ids, ScrapingTask.key)
        listNews = [News.from_json(newsJson) for newsJson in listNewsJson]

        listNewsId = self.dataLake.save(listNews)
        dataExchange.push(SavingTask.key, listNewsId)
dags/CrawDag/saving/__init__.py
ADDED
@@ -0,0 +1 @@
from .SavingTask import SavingTask
dags/CrawDag/scraping/ScrapeMethod/ScrapeArticle.py
ADDED
@@ -0,0 +1,21 @@
from CrawDag.scraping.Scraper import Scraper
from CrawDag.models import News
from newspaper import Article


class ScrapeArticle(Scraper):
    def __init__(self, listNews: list[News]) -> None:
        super().__init__(listNews)

    def scrape(self) -> list[News]:
        newsList: list[News] = []
        for news in self.listNews:
            article = Article(news.link)
            article.download()
            article.parse()
            paragraphs = article.text.split('\n')
            news.content = '\n'.join([para for para in paragraphs[0:-1]]).strip()
            news.html = article.html
            if news.content != '' and len(news.content) > 10:  # keep only articles with real content
                newsList.append(news)

        return newsList
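
A short sketch of using ScrapeArticle on its own to fill in content and html for crawled items (the News link below is a placeholder URL):

from CrawDag.models import News
from CrawDag.scraping.ScrapeMethod import ScrapeArticle

crawled = [News(topic='economic', title='Example', link='https://example.com/a')]
scraped = ScrapeArticle(crawled).scrape()  # downloads and parses each link
for news in scraped:
    print(news.title, len(news.content))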
dags/CrawDag/scraping/ScrapeMethod/ScrapeBasic.py
ADDED
@@ -0,0 +1,23 @@
from CrawDag.scraping.Scraper import Scraper
from CrawDag.models import News
from bs4 import BeautifulSoup
import requests


class ScrapeBasic(Scraper):
    def __init__(self, listNews: list[News]) -> None:
        super().__init__(listNews)

    def scrape(self) -> list[News]:
        newsList: list[News] = []
        for news in self.listNews:
            response = requests.get(news.link, verify=False)
            soup = BeautifulSoup(response.content, 'html.parser')
            paragraphs = soup.find_all('p')
            content = ' '.join([para.get_text() for para in paragraphs[0:-1]])
            html = soup.find('article')
            news.content = content.strip()
            news.html = str(html) if html else ''
            newsList.append(news)

        return newsList
dags/CrawDag/scraping/ScrapeMethod/__init__.py
ADDED
@@ -0,0 +1,2 @@
from .ScrapeArticle import ScrapeArticle
from .ScrapeBasic import ScrapeBasic
dags/CrawDag/scraping/Scraper.py
ADDED
@@ -0,0 +1,10 @@
from abc import ABC, abstractmethod
from CrawDag.models import News


class Scraper(ABC):
    @abstractmethod
    def __init__(self, listNews: list[News]) -> None:
        self.listNews = listNews

    @abstractmethod
    def scrape(self) -> list[News]:
        pass
dags/CrawDag/scraping/ScrapingTask.py
ADDED
@@ -0,0 +1,20 @@
from CrawDag.models import TaskHandle, DataExchange, News
from CrawDag.crawling import CrawlingTask
from .ScrapeMethod import ScrapeArticle
from .Scraper import Scraper


class ScrapingTask(TaskHandle):
    task_ids = None
    key = 'scrape_news'

    def __init__(self, task_ids: str) -> None:
        super().__init__()
        ScrapingTask.task_ids = task_ids

    def execute(self, **context: any):
        dataExchange = DataExchange(context['ti'])
        listNewsJson = dataExchange.pull(CrawlingTask.task_ids, CrawlingTask.key)
        listNews = [News.from_json(newsJson) for newsJson in listNewsJson]

        newsList: list[News] = ScrapeArticle(listNews).scrape()
        dataExchange.push(ScrapingTask.key, [news.to_json() for news in newsList])
dags/CrawDag/scraping/__init__.py
ADDED
@@ -0,0 +1,2 @@
from .basic_scraper import scrape_basic_article, scrape_news_v2
from .ScrapingTask import ScrapingTask
dags/CrawDag/scraping/basic_scraper.py
ADDED
@@ -0,0 +1,25 @@
from bs4 import BeautifulSoup
import requests
from CrawDag.models import News
from newspaper import Article


def clean_content(text: str) -> str:
    """Helper function to clean article content."""
    return text.strip().replace("\n", " ").replace("\t", " ")


def scrape_basic_article(news: News):
    response = requests.get(news.link, verify=False)
    soup = BeautifulSoup(response.content, 'html.parser')
    paragraphs = soup.find_all('p')
    content = ' '.join([para.get_text() for para in paragraphs[0:-1]])
    html = soup.find('article')
    news.content = clean_content(content)
    news.html = str(html) if html else ''


def scrape_news_v2(news: News):
    article = Article(news.link)
    article.download()
    article.parse()
    paragraphs = article.text.split('\n')
    news.content = '\n'.join([para for para in paragraphs[0:-1]]).strip()
    news.html = article.html
dags/CrawDag/sending/SendingTask.py
ADDED
@@ -0,0 +1,44 @@
from CrawDag.models import TaskHandle, DataExchange, News
from CrawDag.saving import SavingTask
from CrawDag.saving.SavingMethod import MongoDataLake
import time
import requests
from dotenv import load_dotenv
import os

load_dotenv()


class SendingTask(TaskHandle):
    task_ids = None
    key = 'send_news'

    def __init__(self, task_ids: str) -> None:
        super().__init__()
        SendingTask.task_ids = task_ids
        self.__maxRetry = 4
        self.__delay = 15  # seconds between attempts

    def execute(self, **context: any):
        dataExchange = DataExchange(context['ti'])
        listNewsId = dataExchange.pull(SavingTask.task_ids, SavingTask.key)

        for attempt in range(self.__maxRetry):
            try:
                response = requests.post(
                    url=os.getenv("SERVER_URL") + '/api/news/summarize',
                    json=listNewsId
                )
                if response.status_code == 202:
                    return
                else:
                    print(f"Attempt {attempt + 1} failed: {response.status_code}")
                    time.sleep(self.__delay)

            except requests.RequestException as e:
                print(f"Attempt {attempt + 1} failed: {e}")
                time.sleep(self.__delay)

        # All retries failed: roll back the saved documents and fail the task.
        dataLake = SavingTask(SavingTask.task_ids).dataLake
        dataLake.delete(listNewsId)
        print('Failed to send data to API')
        raise Exception('Failed to send data to API')
dags/CrawDag/sending/__init__.py
ADDED
@@ -0,0 +1 @@
from .SendingTask import SendingTask
entrypoint.sh
ADDED
@@ -0,0 +1,24 @@
#!/bin/bash
set -e


# Apply Airflow database migrations
echo "Running database migrations..."
airflow db migrate

# Check if the Airflow admin user already exists before creating it
echo "Checking if the admin user exists..."
USER_EXISTS=$(airflow users list | grep -w "${AIRFLOW_USERNAME}" || true)

if [ -z "$USER_EXISTS" ]; then
    echo "Creating Airflow admin user..."
    airflow users create --username "${AIRFLOW_USERNAME}" --password "${AIRFLOW_PASSWORD}" --firstname Admin --lastname Admin --role Admin --email "${AIRFLOW_EMAIL}"
else
    echo "Admin user already exists, skipping user creation."
fi

# Start Airflow services
echo "Starting Airflow webserver, scheduler, and workers..."
airflow webserver --port 8080 &   # start webserver in the background
airflow scheduler &               # start scheduler in the background
exec airflow celery worker        # Celery worker replaces the shell as the main process
requirements.txt
ADDED
Binary file (8.33 kB)