Spaces:
Running
Running
Commit
·
2e6110c
1
Parent(s):
f68e4f8
feat: 🚀 Add initial setup for Datalia
Browse files- Updated `.gitignore` to exclude dbt and data files.
- Revised README for improved clarity.
- Added `data` and `datalia` folder structure with essential initialization files.
- Imported essential packages in `assets.py` and defined assets for IPC data retrieval.
- Created `dbt` configurations including models for IPC data and CFFI setup.
- Included Python dependencies and configurations in `pyproject.toml`.
- Locked dependencies in `uv.lock`.
- .gitignore +9 -0
- README.md +3 -2
- data/.gitkeep +0 -0
- datalia/__init__.py +0 -0
- datalia/assets.py +25 -0
- datalia/dbt.py +12 -0
- datalia/definitions.py +18 -0
- dbt/dbt_project.yml +12 -0
- dbt/models/schema.yml +5 -0
- dbt/models/sources.yml +9 -0
- dbt/models/spain_ipc.sql +40 -0
- dbt/profiles.yml +8 -0
- pyproject.toml +24 -0
- uv.lock +0 -0
.gitignore
CHANGED
@@ -160,3 +160,12 @@ cython_debug/
|
|
160 |
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
161 |
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
162 |
#.idea/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
160 |
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
161 |
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
162 |
#.idea/
|
163 |
+
|
164 |
+
# dbt
|
165 |
+
dbt_packages/
|
166 |
+
logs/
|
167 |
+
dbt/.user.yml
|
168 |
+
|
169 |
+
# data
|
170 |
+
data/*
|
171 |
+
!data/.gitkeep
|
README.md
CHANGED
@@ -1,2 +1,3 @@
|
|
1 |
-
#
|
2 |
-
|
|
|
|
1 |
+
# Datalia 💃
|
2 |
+
|
3 |
+
Plataforma de datos abiertos a nivel de España con el objetivo de unificar y armonizar información proveniente de diferentes fuentes.
|
data/.gitkeep
ADDED
File without changes
|
datalia/__init__.py
ADDED
File without changes
|
datalia/assets.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import polars as pl
|
2 |
+
from dagster import AssetExecutionContext, RetryPolicy, asset
|
3 |
+
from dagster_dbt import DbtCliResource, dbt_assets
|
4 |
+
|
5 |
+
from .dbt import dbt_project
|
6 |
+
|
7 |
+
|
8 |
+
@dbt_assets(manifest=dbt_project.manifest_path)
|
9 |
+
def dbt(context: AssetExecutionContext, dbt: DbtCliResource):
|
10 |
+
yield from dbt.cli(["build"], context=context).stream()
|
11 |
+
|
12 |
+
|
13 |
+
@asset(
|
14 |
+
retry_policy=RetryPolicy(max_retries=5),
|
15 |
+
)
|
16 |
+
def raw_spain_ipc() -> pl.DataFrame:
|
17 |
+
"""
|
18 |
+
Raw IPC data from INE.
|
19 |
+
"""
|
20 |
+
|
21 |
+
df = pl.read_csv(
|
22 |
+
"https://www.ine.es/jaxiT3/files/t/csv_bdsc/50904.csv", separator=";"
|
23 |
+
)
|
24 |
+
|
25 |
+
return df
|
datalia/dbt.py
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pathlib import Path
|
2 |
+
|
3 |
+
from dagster_dbt import DbtProject
|
4 |
+
|
5 |
+
RELATIVE_PATH_TO_MY_DBT_PROJECT = "../dbt"
|
6 |
+
|
7 |
+
dbt_project = DbtProject(
|
8 |
+
project_dir=Path(__file__)
|
9 |
+
.joinpath("..", RELATIVE_PATH_TO_MY_DBT_PROJECT)
|
10 |
+
.resolve(),
|
11 |
+
)
|
12 |
+
dbt_project.prepare_if_dev()
|
datalia/definitions.py
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
from dagster import Definitions, load_assets_from_modules
|
4 |
+
from dagster_dbt import DbtCliResource
|
5 |
+
from dagster_duckdb_polars import DuckDBPolarsIOManager
|
6 |
+
|
7 |
+
from . import assets, dbt
|
8 |
+
|
9 |
+
DATABASE_PATH = os.getenv("DATABASE_PATH", "./data/database.duckdb")
|
10 |
+
|
11 |
+
all_assets = load_assets_from_modules([assets])
|
12 |
+
|
13 |
+
resources = {
|
14 |
+
"io_manager": DuckDBPolarsIOManager(database=DATABASE_PATH),
|
15 |
+
"dbt": DbtCliResource(project_dir=dbt.dbt_project),
|
16 |
+
}
|
17 |
+
|
18 |
+
defs = Definitions(assets=all_assets, resources=resources)
|
dbt/dbt_project.yml
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: "datalia"
|
2 |
+
version: "1.0.0"
|
3 |
+
|
4 |
+
profile: "default"
|
5 |
+
|
6 |
+
clean-targets:
|
7 |
+
- "target"
|
8 |
+
- "dbt_packages"
|
9 |
+
|
10 |
+
models:
|
11 |
+
datalia:
|
12 |
+
+materialized: table
|
dbt/models/schema.yml
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
version: 2
|
2 |
+
models:
|
3 |
+
- name: spain_ipc
|
4 |
+
description: Índice de Precios al Consumo (IPC) en España.
|
5 |
+
columns: []
|
dbt/models/sources.yml
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
version: 2
|
2 |
+
|
3 |
+
sources:
|
4 |
+
- name: public
|
5 |
+
tables:
|
6 |
+
- name: raw_spain_ipc
|
7 |
+
meta:
|
8 |
+
dagster:
|
9 |
+
asset_key: ["raw_spain_ipc"]
|
dbt/models/spain_ipc.sql
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
with source as (
|
2 |
+
select * from {{ source('public', 'raw_spain_ipc') }}
|
3 |
+
),
|
4 |
+
|
5 |
+
renamed as (
|
6 |
+
select
|
7 |
+
{{ adapter.quote("Clases") }} as class,
|
8 |
+
{{ adapter.quote("Tipo de dato") }} as type,
|
9 |
+
{{ adapter.quote("Periodo") }} as date,
|
10 |
+
{{ adapter.quote("Total") }} as value
|
11 |
+
from source
|
12 |
+
),
|
13 |
+
|
14 |
+
parsed as (
|
15 |
+
select
|
16 |
+
cast(strptime(REPLACE(date, 'M', '-'), '%Y-%m') as date) AS date,
|
17 |
+
class,
|
18 |
+
type,
|
19 |
+
try_cast(replace(value, ',', '.') AS FLOAT) AS value,
|
20 |
+
from renamed
|
21 |
+
),
|
22 |
+
|
23 |
+
cleaned as (
|
24 |
+
select
|
25 |
+
date,
|
26 |
+
case
|
27 |
+
when class != 'Índice general' then split_part(class, ' ', 1)
|
28 |
+
else '0000'
|
29 |
+
end as class_id,
|
30 |
+
case
|
31 |
+
when class != 'Índice general' then substring(class from position(' ' in class) + 1)
|
32 |
+
else 'Índice general'
|
33 |
+
end as class_name,
|
34 |
+
value
|
35 |
+
from parsed
|
36 |
+
where type = 'Índice'
|
37 |
+
order by date desc
|
38 |
+
)
|
39 |
+
|
40 |
+
select * from cleaned order by date desc
|
dbt/profiles.yml
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
default:
|
2 |
+
outputs:
|
3 |
+
dev:
|
4 |
+
type: duckdb
|
5 |
+
path: "../data/database.duckdb"
|
6 |
+
threads: 1
|
7 |
+
|
8 |
+
target: dev
|
pyproject.toml
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[project]
|
2 |
+
name = "datalia"
|
3 |
+
version = "0.1.0"
|
4 |
+
description = "Add your description here"
|
5 |
+
readme = "README.md"
|
6 |
+
requires-python = ">=3.12"
|
7 |
+
dependencies = [
|
8 |
+
"dagster>=1.8.6",
|
9 |
+
"duckdb>=1.1.0",
|
10 |
+
"dagster-dbt>=0.24.6",
|
11 |
+
"dagster-duckdb>=0.24.6",
|
12 |
+
"dbt-duckdb>=1.8.3",
|
13 |
+
"ipykernel>=6.29.5",
|
14 |
+
"pyarrow>=17.0.0",
|
15 |
+
"dagster-duckdb-polars>=0.24.6",
|
16 |
+
"python-slugify>=8.0.4",
|
17 |
+
]
|
18 |
+
|
19 |
+
[tool.dagster]
|
20 |
+
module_name = "datalia.definitions"
|
21 |
+
code_location_name = "datalia"
|
22 |
+
|
23 |
+
[tool.uv]
|
24 |
+
dev-dependencies = ["dagster-webserver>=1.8.6"]
|
uv.lock
ADDED
The diff for this file is too large to render.
See raw diff
|
|