Jesse Hartman committed on
Commit
8f585d7
·
1 Parent(s): 09df637

make changes and updates per github feedback

Browse files
polars/tutorial_dataframe_transformer.py CHANGED
@@ -1,6 +1,8 @@
1
  # /// script
2
  # dependencies = [
3
  # "marimo",
 
 
4
  # "polars==1.28.1",
5
  # "requests==2.32.3",
6
  # ]
@@ -10,17 +12,22 @@
10
 
11
  import marimo
12
 
13
- __generated_with = "0.13.2"
14
  app = marimo.App(width="medium")
15
 
16
 
17
  @app.cell
18
- def _():
19
- import marimo as mo
20
- import polars as pl
21
- import requests
22
- import json
23
- return mo, pl, requests
 
 
 
 
 
24
 
25
 
26
  @app.cell
@@ -35,11 +42,11 @@ def _(requests):
35
  def _(mo):
36
  mo.md(
37
  r"""
38
- # Loading Data
39
- Let's start by loading our data and getting into the `.lazy()` format so our transformations and queries are speedy.
40
 
41
- Read more about `.lazy()` here: https://docs.pola.rs/user-guide/lazy/
42
- """
43
  )
44
  return
45
 
@@ -55,11 +62,11 @@ def _(json_data, pl):
55
  def _(mo):
56
  mo.md(
57
  r"""
58
- Above, you will notice that when you reference the object as a standalone, you get out-of-the-box convenince from `marimo`. You have the `Table` and `Query Plan` options to choose from.
59
 
60
- - 💡 Try out the `Table` view! You can click the `Preview data` button to get a quick view of your data.
61
- - 💡 Take a look at the `Query plan`. Learn more about Polar's query plan here: https://docs.pola.rs/user-guide/lazy/query-plan/
62
- """
63
  )
64
  return
65
 
@@ -68,15 +75,15 @@ def _(mo):
68
  def _(mo):
69
  mo.md(
70
  r"""
71
- # marimo's Native Dataframe UI
72
 
73
- There are a few ways to leverage marimo's native dataframe UI. One is by doing what we saw above—by referencing a `pl.LazyFrame` directly. You can also try,
74
 
75
- - Reference a `pl.LazyFrame` (we already did this!)
76
- - Referencing a `pl.DataFrame` and see how it different from its corresponding lazy version
77
- - Use `mo.ui.table`
78
- - Use `mo.ui.dataframe`
79
- """
80
  )
81
  return
82
 
@@ -85,15 +92,15 @@ def _(mo):
85
  def _(mo):
86
  mo.md(
87
  r"""
88
- ## Reference a pl.DataFrame
89
- Let's reference the same frame as before, but this time as a `pl.DataFrame` by calling `.collect()` on it.
90
- """
91
  )
92
  return
93
 
94
 
95
  @app.cell
96
- def _(demand):
97
  demand.collect()
98
  return
99
 
@@ -102,12 +109,12 @@ def _(demand):
102
  def _(mo):
103
  mo.md(
104
  r"""
105
- Note how much functionality we have right out-of-the-box. Click on column names to see rich features like sorting, freezing, filtering, searching, and more!
106
 
107
- Notice how `order_quantity` has a green bar chart under it indicating the ditribution of values for the field!
108
 
109
- Don't miss the `Download` feature as well which supports downloading in CSV, json, or parquet format!
110
- """
111
  )
112
  return
113
 
@@ -116,15 +123,15 @@ def _(mo):
116
  def _(mo):
117
  mo.md(
118
  r"""
119
- ## Use `mo.ui.table`
120
- The `mo.ui.table` allows you to select rows for use downstream. You can select the rows you want, and then use these as filtered rows downstream.
121
- """
122
  )
123
  return
124
 
125
 
126
  @app.cell
127
- def _(demand, mo):
128
  demand_table = mo.ui.table(demand, label="Demand Table")
129
  return (demand_table,)
130
 
@@ -137,12 +144,14 @@ def _(demand_table):
137
 
138
  @app.cell
139
  def _(mo):
140
- mo.md(r"""I like to use this feature to select groupings based on summary statistics so I can quickly explore subsets of categories. Let me show you what I mean.""")
 
 
141
  return
142
 
143
 
144
  @app.cell
145
- def _(demand, pl):
146
  summary: pl.LazyFrame = demand.group_by("product_family").agg(
147
  pl.mean("order_quantity").alias("mean"),
148
  pl.sum("order_quantity").alias("sum"),
@@ -155,7 +164,7 @@ def _(demand, pl):
155
 
156
 
157
  @app.cell
158
- def _(mo, summary):
159
  summary_table = mo.ui.table(summary)
160
  return (summary_table,)
161
 
@@ -170,16 +179,16 @@ def _(summary_table):
170
  def _(mo):
171
  mo.md(
172
  r"""
173
- Now, instead of manually creatinga filter for what I want to take a closer look at, I simply select from the ui and do a simple join to get that aggregated level with more detail.
174
 
175
- The following cell uses the output of the `mo.ui.table` selection, selects its unique keys, and uses that to join for the selected subset of the original table.
176
- """
177
  )
178
  return
179
 
180
 
181
  @app.cell
182
- def _(demand, pl, summary_table):
183
  selection_keys: pl.LazyFrame = (
184
  summary_table.value.lazy().select("product_family").unique()
185
  )
@@ -190,6 +199,14 @@ def _(demand, pl, summary_table):
190
  return
191
 
192
 
 
 
 
 
 
 
 
 
193
  @app.cell
194
  def _(mo):
195
  mo.md(r"""## Use `mo.ui.dataframe`""")
@@ -197,21 +214,143 @@ def _(mo):
197
 
198
 
199
  @app.cell
200
- def _(demand, mo):
201
- mo_dateframe = mo.ui.dataframe(demand.collect())
202
- return (mo_dateframe,)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
 
204
 
205
  @app.cell
206
- def _(mo_dateframe):
207
- mo_dateframe
 
 
208
  return
209
 
210
 
211
  @app.cell
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
  def _():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  return
214
 
215
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
216
  if __name__ == "__main__":
217
  app.run()
 
1
  # /// script
2
  # dependencies = [
3
  # "marimo",
4
+ # "numpy==2.2.3",
5
+ # "plotly[express]==6.0.0",
6
  # "polars==1.28.1",
7
  # "requests==2.32.3",
8
  # ]
 
12
 
13
  import marimo
14
 
15
+ __generated_with = "0.13.15"
16
  app = marimo.App(width="medium")
17
 
18
 
19
  @app.cell
20
+ def _(mo):
21
+ mo.md(
22
+ r"""
23
+ # Polars with Marimo's Dataframe Transformer
24
+
25
+ *By [jesshart](https://github.com/jesshart)*
26
+
27
+ The goal of this notebook is to explore Marimo's data exploration capabilities alongside the power of polars. Feel free to reference the latest about this Marimo feature here: https://docs.marimo.io/api/inputs/data_explorer/
28
+ """
29
+ )
30
+ return
31
 
32
 
33
  @app.cell
 
42
  def _(mo):
43
  mo.md(
44
  r"""
45
+ # Loading Data
46
+ Let's start by loading our data and getting into the `.lazy()` format so our transformations and queries are speedy.
47
 
48
+ Read more about `.lazy()` here: https://docs.pola.rs/user-guide/lazy/
49
+ """
50
  )
51
  return
52
 
 
62
  def _(mo):
63
  mo.md(
64
  r"""
65
+ Above, you will notice that when you reference the object as a standalone, you get out-of-the-box convenience from `marimo`. You have the `Table` and `Query Plan` options to choose from.
66
 
67
+ - 💡 Try out the `Table` view! You can click the `Preview data` button to get a quick view of your data.
68
+ - 💡 Take a look at the `Query plan`. Learn more about Polars' query plan here: https://docs.pola.rs/user-guide/lazy/query-plan/
69
+ """
70
  )
71
  return
72
 
 
75
  def _(mo):
76
  mo.md(
77
  r"""
78
+ ## marimo's Native Dataframe UI
79
 
80
+ There are a few ways to leverage marimo's native dataframe UI. One is by doing what we saw above—by referencing a `pl.LazyFrame` directly. You can also try,
81
 
82
+ - Reference a `pl.LazyFrame` (we already did this!)
83
+ - Reference a `pl.DataFrame` and see how it differs from its corresponding lazy version
84
+ - Use `mo.ui.table`
85
+ - Use `mo.ui.dataframe`
86
+ """
87
  )
88
  return
89
 
 
92
  def _(mo):
93
  mo.md(
94
  r"""
95
+ ## Reference a `pl.DataFrame`
96
+ Let's reference the same frame as before, but this time as a `pl.DataFrame` by calling `.collect()` on it.
97
+ """
98
  )
99
  return
100
 
101
 
102
  @app.cell
103
+ def _(demand: "pl.LazyFrame"):
104
  demand.collect()
105
  return
106
 
 
109
  def _(mo):
110
  mo.md(
111
  r"""
112
+ Note how much functionality we have right out-of-the-box. Click on column names to see rich features like sorting, freezing, filtering, searching, and more!
113
 
114
+ Notice how `order_quantity` has a green bar chart under it indicating the distribution of values for the field!
115
 
116
+ Don't miss the `Download` feature as well which supports downloading in CSV, json, or parquet format!
117
+ """
118
  )
119
  return
120
 
 
123
  def _(mo):
124
  mo.md(
125
  r"""
126
+ ## Use `mo.ui.table`
127
+ The `mo.ui.table` allows you to select rows for use downstream. You can select the rows you want, and then use these as filtered rows downstream.
128
+ """
129
  )
130
  return
131
 
132
 
133
  @app.cell
134
+ def _(demand: "pl.LazyFrame", mo):
135
  demand_table = mo.ui.table(demand, label="Demand Table")
136
  return (demand_table,)
137
 
 
144
 
145
  @app.cell
146
  def _(mo):
147
+ mo.md(
148
+ r"""I like to use this feature to select groupings based on summary statistics so I can quickly explore subsets of categories. Let me show you what I mean."""
149
+ )
150
  return
151
 
152
 
153
  @app.cell
154
+ def _(demand: "pl.LazyFrame", pl):
155
  summary: pl.LazyFrame = demand.group_by("product_family").agg(
156
  pl.mean("order_quantity").alias("mean"),
157
  pl.sum("order_quantity").alias("sum"),
 
164
 
165
 
166
  @app.cell
167
+ def _(mo, summary: "pl.LazyFrame"):
168
  summary_table = mo.ui.table(summary)
169
  return (summary_table,)
170
 
 
179
  def _(mo):
180
  mo.md(
181
  r"""
182
+ Now, instead of manually creating a filter for what I want to take a closer look at, I simply select from the UI and do a simple join to get that aggregated level with more detail.
183
 
184
+ The following cell uses the output of the `mo.ui.table` selection, selects its unique keys, and uses that to join for the selected subset of the original table.
185
+ """
186
  )
187
  return
188
 
189
 
190
  @app.cell
191
+ def _(demand: "pl.LazyFrame", pl, summary_table):
192
  selection_keys: pl.LazyFrame = (
193
  summary_table.value.lazy().select("product_family").unique()
194
  )
 
199
  return
200
 
201
 
202
+ @app.cell
203
+ def _(mo):
204
+ mo.md(
205
+ """You can learn more about joins in Polars by checking out my other interactive notebook here: https://marimo.io/p/@jesshart/basic-polars-joins"""
206
+ )
207
+ return
208
+
209
+
210
  @app.cell
211
  def _(mo):
212
  mo.md(r"""## Use `mo.ui.dataframe`""")
 
214
 
215
 
216
  @app.cell
217
+ def _(demand: "pl.LazyFrame", mo):
218
+ demand_cached = demand.collect()
219
+ mo_dataframe = mo.ui.dataframe(demand_cached)
220
+ return demand_cached, mo_dataframe
221
+
222
+
223
+ @app.cell
224
+ def _(mo):
225
+ mo.md(
226
+ r"""Below I simply call the object into view. We will play with it in the following cells."""
227
+ )
228
+ return
229
+
230
+
231
+ @app.cell
232
+ def _(mo_dataframe):
233
+ mo_dataframe
234
+ return
235
 
236
 
237
  @app.cell
238
+ def _(mo):
239
+ mo.md(
240
+ r"""One way to group this data in polars code directly would be to group by product family to get the mean. This is how it is done in polars:"""
241
+ )
242
  return
243
 
244
 
245
  @app.cell
246
+ def _(demand_cached, pl):
247
+ demand_agg: pl.DataFrame = demand_cached.group_by("product_family").agg(
248
+ pl.mean("order_quantity").name.suffix("_mean")
249
+ )
250
+ demand_agg
251
+ return (demand_agg,)
252
+
253
+
254
+ @app.cell
255
+ def _(mo):
256
+ mo.md(
257
+ f"""
258
+ ## Try Before You Buy
259
+
260
+ 1. Now try to do the same summary using Marimo's `mo.ui.dataframe` object above. Also, note how your aggregated column is already renamed! Nice touch!
261
+ 2. Try (1) again but use select statements first (This is actually better polars practice anyway since it reduces the frame as you move to aggregation.)
262
+
263
+ *When you are ready, check the `Python Code` tab at the top of the table to compare your output to the answer below.*
264
+ """
265
+ )
266
+ return
267
+
268
+
269
+ @app.cell(hide_code=True)
270
  def _():
271
+ mean_code = """
272
+ This may seem verbose compared to what I came up with, but quick and dirty outputs like this are really helpful for quickly exploring the data and learning the polars library at the same time.
273
+ ```python
274
+ df_next = df
275
+ df_next = df_next.group_by(
276
+ [pl.col("product_family")], maintain_order=True
277
+ ).agg(
278
+ [
279
+ pl.col("order_date").mean().alias("order_date_mean"),
280
+ pl.col("order_quantity").mean().alias("order_quantity_mean"),
281
+ pl.col("product").mean().alias("product_mean"),
282
+ ]
283
+ )
284
+ ```
285
+ """
286
+
287
+ mean_again_code = """
288
+ ```python
289
+ df_next = df
290
+ df_next = df_next.select(["product_family", "order_quantity"])
291
+ df_next = df_next.group_by(
292
+ [pl.col("product_family")], maintain_order=True
293
+ ).agg(
294
+ [
295
+ pl.col("order_date").mean().alias("order_date_mean"),
296
+ pl.col("order_quantity").mean().alias("order_quantity_mean"),
297
+ pl.col("product").mean().alias("product_mean"),
298
+ ]
299
+ )
300
+ ```
301
+ """
302
+ return mean_again_code, mean_code
303
+
304
+
305
+ @app.cell
306
+ def _(mean_again_code, mean_code, mo):
307
+ mo.accordion(
308
+ {
309
+ "Show Code (1)": mean_code,
310
+ "Show Code (2)": mean_again_code,
311
+ }
312
+ )
313
  return
314
 
315
 
316
+ @app.cell
317
+ def _(demand_agg: "pl.DataFrame", mo, px):
318
+ bar_graph = px.bar(
319
+ demand_agg,
320
+ x="product_family",
321
+ y="order_quantity_mean",
322
+ title="Mean Quantity over Product Family",
323
+ )
324
+
325
+ note: str = """
326
+ Note: This graph will only show if the above mo_dataframe is correct!
327
+
328
+ If you want more on interactive graphs, check out https://github.com/marimo-team/learn/blob/main/polars/05_reactive_plots.py
329
+ """
330
+
331
+ mo.vstack(
332
+ [
333
+ mo.md(note),
334
+ bar_graph,
335
+ ]
336
+ )
337
+ return
338
+
339
+
340
+ @app.cell
341
+ def _():
342
+ import marimo as mo
343
+ return (mo,)
344
+
345
+
346
+ @app.cell
347
+ def _():
348
+ import polars as pl
349
+ import requests
350
+ import json
351
+ import plotly.express as px
352
+ return pl, px, requests
353
+
354
+
355
  if __name__ == "__main__":
356
  app.run()