koaning committed
Commit 4139d6f · verified · 1 Parent(s): c834fd5

Update app.py

Files changed (1)
  1. app.py +48 -13
app.py CHANGED
@@ -11,7 +11,7 @@
 
 import marimo
 
-__generated_with = "0.13.7"
+__generated_with = "0.13.8"
 app = marimo.App(width="full")
 
 
@@ -23,13 +23,24 @@ def _():
     from pathlib import Path
     from pyiceberg.partitioning import PartitionSpec, PartitionField
     from pyiceberg.transforms import IdentityTransform
-    return IdentityTransform, PartitionField, PartitionSpec, mo, pl
+    from zipfile import ZipFile
+    return (
+        IdentityTransform,
+        PartitionField,
+        PartitionSpec,
+        Path,
+        ZipFile,
+        mo,
+        pl,
+    )
 
 
 @app.cell
-def _():
+def _(Path):
     from pyiceberg.catalog import load_catalog
 
+    Path("warehouse").mkdir(exist_ok=True, parents=True)
+
     warehouse_path = "warehouse"
     catalog = load_catalog(
         "default",
@@ -43,9 +54,10 @@ def _():
 
 
 @app.cell
-def _(pl):
-    df_taxi = pl.read_csv("yellow_tripdata_2015-01.csv.zip").to_arrow()
-    return (df_taxi,)
+def _(ZipFile, pl):
+    df_orig = pl.read_csv(ZipFile("yellow_tripdata_2015-01.csv.zip").open("yellow_tripdata_2015-01.csv").read())
+    df_taxi = df_orig.to_arrow()
+    return df_orig, df_taxi
 
 
 @app.cell
@@ -100,40 +112,63 @@ def _(catalog):
     return
 
 
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""Let's write the original zipped data to a `.csv` file. We can read this and perform the same query to compare speeds.""")
+    return
+
+
+@app.cell
+def _(df_orig):
+    df_orig.write_csv("taxi.csv")
+    return
+
+
 @app.cell
 def _(pl):
-    df_orig = pl.read_csv("yellow_tripdata_2015-01.csv.zip")
-    df_orig.group_by("passenger_count").len().sort("passenger_count").collect()
+    pl.scan_csv("taxi.csv").group_by("passenger_count").len().sort("passenger_count").collect()
     return
 
 
 @app.cell
 def _(pl):
-    df_orig.group_by("passenger_count").len().sort("passenger_count")
+    pl.read_csv("taxi.csv").group_by("passenger_count").len().sort("passenger_count")
     return
 
 
 @app.cell(hide_code=True)
 def _(mo):
-    mo.md(r"""The partition is great, but the comparison with `read_csv` is a bit unfair. Let's convert the `.csv` file to `.parquet` and also add a partition in polars with statistics. """)
+    mo.md(
+        r"""
+        That's a bunch slower!
+
+        Part of the reason is that the Iceberg table had partitions in it, which is great, but the comparison with `read_csv` is a bit unfair. Let's convert the `.csv` file to `.parquet` and also add a partition in Polars with statistics. You will now see that we get similar performance.
+        """
+    )
     return
 
 
 @app.cell
-def _(pl):
+def _(df_orig):
     df_orig.write_parquet("taxi.parquet", partition_by=["passenger_count"], statistics=True)
     return
 
 
 @app.cell
 def _(pl):
-    df_orig.group_by("passenger_count").len().sort("passenger_count").collect()
+    pl.scan_parquet("taxi.parquet").group_by("passenger_count").len().sort("passenger_count").collect()
     return
 
 
 @app.cell
 def _(pl):
-    df_orig.group_by("passenger_count").len().sort("passenger_count")
+    pl.read_parquet("taxi.parquet").group_by("passenger_count").len().sort("passenger_count")
+    return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""So keep in mind that Polars can certainly also speed things up if you are aware of what you are doing. But one nice thing about Iceberg is that it can be seen as a catalogue with *a bunch* of good habits for performance later down the line.""")
     return
 
 
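Aside from the marimo version bump, the main change in this diff is how the data is loaded and compared: the CSV is pulled out of the zip archive with `ZipFile` instead of handing the `.zip` path to `pl.read_csv`, the `warehouse` directory is created up front, and the comparison cells query a derived `taxi.csv` and a partitioned `taxi.parquet` rather than re-reading the original download. A rough standalone sketch of that pattern outside marimo (assuming the same file names as in the app) could look like this:

```python
# Standalone sketch of the pattern used in the notebook; file names are assumed
# to match the ones in the diff above, and this is not code from the commit itself.
from zipfile import ZipFile

import polars as pl

# Read the CSV out of the zip archive explicitly instead of passing the
# .zip path to pl.read_csv.
raw = ZipFile("yellow_tripdata_2015-01.csv.zip").open("yellow_tripdata_2015-01.csv").read()
df = pl.read_csv(raw)

# Write a partitioned parquet dataset with column statistics...
df.write_parquet("taxi.parquet", partition_by=["passenger_count"], statistics=True)

# ...and query it lazily, so the engine can use partition and statistics
# metadata instead of re-parsing the full CSV.
result = (
    pl.scan_parquet("taxi.parquet")
    .group_by("passenger_count")
    .len()
    .sort("passenger_count")
    .collect()
)
print(result)
```

Scanning the partitioned parquet lazily is what makes the second comparison fair: like the Iceberg table, it can lean on partition and statistics metadata rather than re-reading the whole CSV.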