davidgasquez committed on
Commit
2e6110c
1 Parent(s): f68e4f8

feat: 🚀 Add initial setup for Datalia

Browse files

- Updated `.gitignore` to exclude dbt and data files.
- Revised README for improved clarity.
- Added `data` and `datalia` folder structure with essential initialization files.
- Imported essential packages in `assets.py` and defined assets for IPC data retrieval.
- Created `dbt` configurations including models for IPC data and CFFI setup.
- Included Python dependencies and configurations in `pyproject.toml`.
- Locked dependencies in `uv.lock`.

.gitignore CHANGED
@@ -160,3 +160,12 @@ cython_debug/
160
  # and can be added to the global gitignore or merged into this file. For a more nuclear
161
  # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162
  #.idea/
 
 
 
 
 
 
 
 
 
 
160
  # and can be added to the global gitignore or merged into this file. For a more nuclear
161
  # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162
  #.idea/
163
+
164
+ # dbt
165
+ dbt_packages/
166
+ logs/
167
+ dbt/.user.yml
168
+
169
+ # data
170
+ data/*
171
+ !data/.gitkeep
README.md CHANGED
@@ -1,2 +1,3 @@
1
- # datalia
2
- 💃 Plataforma de datos abiertos a nivel de España para unificar y armonizar información proveniente de diferentes fuentes.
 
 
1
+ # Datalia 💃
2
+
3
+ Plataforma de datos abiertos a nivel de España con el objetivo de unificar y armonizar información proveniente de diferentes fuentes.
data/.gitkeep ADDED
File without changes
datalia/__init__.py ADDED
File without changes
datalia/assets.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import polars as pl
2
+ from dagster import AssetExecutionContext, RetryPolicy, asset
3
+ from dagster_dbt import DbtCliResource, dbt_assets
4
+
5
+ from .dbt import dbt_project
6
+
7
+
8
+ @dbt_assets(manifest=dbt_project.manifest_path)
9
+ def dbt(context: AssetExecutionContext, dbt: DbtCliResource):
10
+ yield from dbt.cli(["build"], context=context).stream()
11
+
12
+
13
+ @asset(
14
+ retry_policy=RetryPolicy(max_retries=5),
15
+ )
16
+ def raw_spain_ipc() -> pl.DataFrame:
17
+ """
18
+ Raw IPC data from INE.
19
+ """
20
+
21
+ df = pl.read_csv(
22
+ "https://www.ine.es/jaxiT3/files/t/csv_bdsc/50904.csv", separator=";"
23
+ )
24
+
25
+ return df
datalia/dbt.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
+ from dagster_dbt import DbtProject
4
+
5
+ RELATIVE_PATH_TO_MY_DBT_PROJECT = "../dbt"
6
+
7
+ dbt_project = DbtProject(
8
+ project_dir=Path(__file__)
9
+ .joinpath("..", RELATIVE_PATH_TO_MY_DBT_PROJECT)
10
+ .resolve(),
11
+ )
12
+ dbt_project.prepare_if_dev()
datalia/definitions.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from dagster import Definitions, load_assets_from_modules
4
+ from dagster_dbt import DbtCliResource
5
+ from dagster_duckdb_polars import DuckDBPolarsIOManager
6
+
7
+ from . import assets, dbt
8
+
9
+ DATABASE_PATH = os.getenv("DATABASE_PATH", "./data/database.duckdb")
10
+
11
+ all_assets = load_assets_from_modules([assets])
12
+
13
+ resources = {
14
+ "io_manager": DuckDBPolarsIOManager(database=DATABASE_PATH),
15
+ "dbt": DbtCliResource(project_dir=dbt.dbt_project),
16
+ }
17
+
18
+ defs = Definitions(assets=all_assets, resources=resources)
dbt/dbt_project.yml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: "datalia"
2
+ version: "1.0.0"
3
+
4
+ profile: "default"
5
+
6
+ clean-targets:
7
+ - "target"
8
+ - "dbt_packages"
9
+
10
+ models:
11
+ datalia:
12
+ +materialized: table
dbt/models/schema.yml ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ version: 2
2
+ models:
3
+ - name: spain_ipc
4
+ description: Índice de Precios al Consumo (IPC) en España.
5
+ columns: []
dbt/models/sources.yml ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ version: 2
2
+
3
+ sources:
4
+ - name: public
5
+ tables:
6
+ - name: raw_spain_ipc
7
+ meta:
8
+ dagster:
9
+ asset_key: ["raw_spain_ipc"]
dbt/models/spain_ipc.sql ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ with source as (
2
+ select * from {{ source('public', 'raw_spain_ipc') }}
3
+ ),
4
+
5
+ renamed as (
6
+ select
7
+ {{ adapter.quote("Clases") }} as class,
8
+ {{ adapter.quote("Tipo de dato") }} as type,
9
+ {{ adapter.quote("Periodo") }} as date,
10
+ {{ adapter.quote("Total") }} as value
11
+ from source
12
+ ),
13
+
14
+ parsed as (
15
+ select
16
+ cast(strptime(REPLACE(date, 'M', '-'), '%Y-%m') as date) AS date,
17
+ class,
18
+ type,
19
+ try_cast(replace(value, ',', '.') AS FLOAT) AS value,
20
+ from renamed
21
+ ),
22
+
23
+ cleaned as (
24
+ select
25
+ date,
26
+ case
27
+ when class != 'Índice general' then split_part(class, ' ', 1)
28
+ else '0000'
29
+ end as class_id,
30
+ case
31
+ when class != 'Índice general' then substring(class from position(' ' in class) + 1)
32
+ else 'Índice general'
33
+ end as class_name,
34
+ value
35
+ from parsed
36
+ where type = 'Índice'
37
+ order by date desc
38
+ )
39
+
40
+ select * from cleaned order by date desc
dbt/profiles.yml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ default:
2
+ outputs:
3
+ dev:
4
+ type: duckdb
5
+ path: "../data/database.duckdb"
6
+ threads: 1
7
+
8
+ target: dev
pyproject.toml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "datalia"
3
+ version = "0.1.0"
4
+ description = "Add your description here"
5
+ readme = "README.md"
6
+ requires-python = ">=3.12"
7
+ dependencies = [
8
+ "dagster>=1.8.6",
9
+ "duckdb>=1.1.0",
10
+ "dagster-dbt>=0.24.6",
11
+ "dagster-duckdb>=0.24.6",
12
+ "dbt-duckdb>=1.8.3",
13
+ "ipykernel>=6.29.5",
14
+ "pyarrow>=17.0.0",
15
+ "dagster-duckdb-polars>=0.24.6",
16
+ "python-slugify>=8.0.4",
17
+ ]
18
+
19
+ [tool.dagster]
20
+ module_name = "datalia.definitions"
21
+ code_location_name = "datalia"
22
+
23
+ [tool.uv]
24
+ dev-dependencies = ["dagster-webserver>=1.8.6"]
uv.lock ADDED
The diff for this file is too large to render. See raw diff