Spaces:
Running
Running
Jesse Hartman
commited on
Commit
·
15ebd87
1
Parent(s):
8cc48bf
add initial version of dataframe transformer tutorial
Browse files
polars/tutorial_dataframe_transformer.py
ADDED
@@ -0,0 +1,217 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# /// script
|
2 |
+
# dependencies = [
|
3 |
+
# "marimo",
|
4 |
+
# "polars==1.28.1",
|
5 |
+
# "requests==2.32.3",
|
6 |
+
# ]
|
7 |
+
# [tool.marimo.runtime]
|
8 |
+
# auto_instantiate = false
|
9 |
+
# ///
|
10 |
+
|
11 |
+
import marimo
|
12 |
+
|
13 |
+
__generated_with = "0.13.2"
|
14 |
+
app = marimo.App(width="medium")
|
15 |
+
|
16 |
+
|
17 |
+
@app.cell
def _():
    # Shared imports for the notebook. The returned names are the ones other
    # cells request through their parameters (mo, pl, requests).
    import marimo as mo
    import polars as pl
    import requests

    return mo, pl, requests
|
24 |
+
|
25 |
+
|
26 |
+
@app.cell
def _(requests):
    # Download the sample orders dataset as a raw HTTP response.
    # The timeout keeps the notebook from hanging forever on a stalled
    # connection; raise_for_status() surfaces HTTP errors here rather than as
    # a confusing JSON parse failure in the cell that reads the payload.
    json_data = requests.get(
        "https://raw.githubusercontent.com/jesshart/fake-datasets/refs/heads/main/orders.json",
        timeout=30,
    )
    json_data.raise_for_status()
    return (json_data,)
|
32 |
+
|
33 |
+
|
34 |
+
@app.cell
|
35 |
+
def _(mo):
|
36 |
+
mo.md(
|
37 |
+
r"""
|
38 |
+
# Loading Data
|
39 |
+
Let's start by loading our data and getting into the `.lazy()` format so our transformations and queries are speedy.
|
40 |
+
|
41 |
+
Read more about `.lazy()` here: https://docs.pola.rs/user-guide/lazy/
|
42 |
+
"""
|
43 |
+
)
|
44 |
+
return
|
45 |
+
|
46 |
+
|
47 |
+
@app.cell
def _(json_data, pl):
    # Parse the downloaded JSON bytes eagerly, then switch to the lazy API so
    # later transformations are expressed as a query plan.
    # The underscore prefix keeps the intermediate frame local to this cell.
    _eager_orders = pl.read_json(json_data.content)
    demand: pl.LazyFrame = _eager_orders.lazy()
    # Bare expression: the cell displays the LazyFrame.
    demand
    return (demand,)
|
52 |
+
|
53 |
+
|
54 |
+
@app.cell
|
55 |
+
def _(mo):
|
56 |
+
mo.md(
|
57 |
+
r"""
|
58 |
+
Above, you will notice that when you reference the object as a standalone, you get out-of-the-box convenience from `marimo`. You have the `Table` and `Query Plan` options to choose from.
|
59 |
+
|
60 |
+
- 💡 Try out the `Table` view! You can click the `Preview data` button to get a quick view of your data.
|
61 |
+
- 💡 Take a look at the `Query plan`. Learn more about Polars' query plan here: https://docs.pola.rs/user-guide/lazy/query-plan/
|
62 |
+
"""
|
63 |
+
)
|
64 |
+
return
|
65 |
+
|
66 |
+
|
67 |
+
@app.cell
|
68 |
+
def _(mo):
|
69 |
+
mo.md(
|
70 |
+
r"""
|
71 |
+
# marimo's Native Dataframe UI
|
72 |
+
|
73 |
+
There are a few ways to leverage marimo's native dataframe UI. One is by doing what we saw above—by referencing a `pl.LazyFrame` directly. You can also try,
|
74 |
+
|
75 |
+
- Reference a `pl.LazyFrame` (we already did this!)
|
76 |
+
- Reference a `pl.DataFrame` and see how it differs from its corresponding lazy version
|
77 |
+
- Use `mo.ui.table`
|
78 |
+
- Use `mo.ui.dataframe`
|
79 |
+
"""
|
80 |
+
)
|
81 |
+
return
|
82 |
+
|
83 |
+
|
84 |
+
@app.cell
|
85 |
+
def _(mo):
|
86 |
+
mo.md(
|
87 |
+
r"""
|
88 |
+
## Reference a pl.DataFrame
|
89 |
+
Let's reference the same frame as before, but this time as a `pl.DataFrame` by calling `.collect()` on it.
|
90 |
+
"""
|
91 |
+
)
|
92 |
+
return
|
93 |
+
|
94 |
+
|
95 |
+
@app.cell
|
96 |
+
def _(demand):
|
97 |
+
demand.collect()
|
98 |
+
return
|
99 |
+
|
100 |
+
|
101 |
+
@app.cell
|
102 |
+
def _(mo):
|
103 |
+
mo.md(
|
104 |
+
r"""
|
105 |
+
Note how much functionality we have right out-of-the-box. Click on column names to see rich features like sorting, freezing, filtering, searching, and more!
|
106 |
+
|
107 |
+
Notice how `order_quantity` has a green bar chart under it indicating the distribution of values for the field!
|
108 |
+
|
109 |
+
Don't miss the `Download` feature as well which supports downloading in CSV, json, or parquet format!
|
110 |
+
"""
|
111 |
+
)
|
112 |
+
return
|
113 |
+
|
114 |
+
|
115 |
+
@app.cell
|
116 |
+
def _(mo):
|
117 |
+
mo.md(
|
118 |
+
r"""
|
119 |
+
## Use `mo.ui.table`
|
120 |
+
The `mo.ui.table` allows you to select rows for use downstream. You can select the rows you want, and then use these as filtered rows downstream.
|
121 |
+
"""
|
122 |
+
)
|
123 |
+
return
|
124 |
+
|
125 |
+
|
126 |
+
@app.cell
|
127 |
+
def _(demand, mo):
|
128 |
+
demand_table = mo.ui.table(demand, label="Demand Table")
|
129 |
+
return (demand_table,)
|
130 |
+
|
131 |
+
|
132 |
+
@app.cell
|
133 |
+
def _(demand_table):
|
134 |
+
demand_table
|
135 |
+
return
|
136 |
+
|
137 |
+
|
138 |
+
@app.cell
|
139 |
+
def _(mo):
|
140 |
+
mo.md(r"""I like to use this feature to select groupings based on summary statistics so I can quickly explore subsets of categories. Let me show you what I mean.""")
|
141 |
+
return
|
142 |
+
|
143 |
+
|
144 |
+
@app.cell
def _(demand, pl):
    # Summary statistics of order_quantity for each product_family.
    # Keyword arguments name the output columns, replacing explicit .alias().
    summary: pl.LazyFrame = demand.group_by("product_family").agg(
        mean=pl.mean("order_quantity"),
        sum=pl.sum("order_quantity"),
        std=pl.std("order_quantity"),
        min=pl.min("order_quantity"),
        max=pl.max("order_quantity"),
        null_count=pl.col("order_quantity").null_count(),
    )
    return (summary,)
|
155 |
+
|
156 |
+
|
157 |
+
@app.cell
|
158 |
+
def _(mo, summary):
|
159 |
+
summary_table = mo.ui.table(summary)
|
160 |
+
return (summary_table,)
|
161 |
+
|
162 |
+
|
163 |
+
@app.cell
|
164 |
+
def _(summary_table):
|
165 |
+
summary_table
|
166 |
+
return
|
167 |
+
|
168 |
+
|
169 |
+
@app.cell
|
170 |
+
def _(mo):
|
171 |
+
mo.md(
|
172 |
+
r"""
|
173 |
+
Now, instead of manually creating a filter for what I want to take a closer look at, I simply select from the UI and do a simple join to get that aggregated level with more detail.
|
174 |
+
|
175 |
+
The following cell uses the output of the `mo.ui.table` selection, selects its unique keys, and uses that to join for the selected subset of the original table.
|
176 |
+
"""
|
177 |
+
)
|
178 |
+
return
|
179 |
+
|
180 |
+
|
181 |
+
@app.cell
def _(demand, pl, summary_table):
    # Distinct product families taken from the summary table's current
    # selection (summary_table.value).
    selection_keys: pl.LazyFrame = (
        summary_table.value.lazy().select("product_family").unique()
    )
    # Left join from the selected keys back onto the detail rows, so only
    # orders belonging to the chosen families are returned.
    selection: pl.LazyFrame = selection_keys.join(
        demand, on="product_family", how="left"
    )
    # Materialize the result; as the last expression it is the cell's output.
    selection.collect()
    return
|
191 |
+
|
192 |
+
|
193 |
+
@app.cell
|
194 |
+
def _(mo):
|
195 |
+
mo.md(r"""## Use `mo.ui.dataframe`""")
|
196 |
+
return
|
197 |
+
|
198 |
+
|
199 |
+
@app.cell
|
200 |
+
def _(demand, mo):
|
201 |
+
mo_dateframe = mo.ui.dataframe(demand.collect())
|
202 |
+
return (mo_dateframe,)
|
203 |
+
|
204 |
+
|
205 |
+
@app.cell
|
206 |
+
def _(mo_dateframe):
|
207 |
+
mo_dateframe
|
208 |
+
return
|
209 |
+
|
210 |
+
|
211 |
+
@app.cell
|
212 |
+
def _():
|
213 |
+
return
|
214 |
+
|
215 |
+
|
216 |
+
if __name__ == "__main__":
|
217 |
+
app.run()
|