Joram Mutenge committed · e769793
Parent(s): f7b9fba
added aggregate notebook
polars/12_aggregations.py (ADDED, +360 lines)
# /// script
# requires-python = ">=3.13"
# dependencies = [
#     "marimo",
#     "polars==1.23.0",
# ]
# ///

import marimo

__generated_with = "0.11.13"
app = marimo.App(width="medium")


@app.cell
def _():
    import marimo as mo
    return (mo,)


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
        # Aggregations
        _By [Joram Mutenge](https://www.udemy.com/user/joram-mutenge/)._

        In this notebook, you'll learn how to perform different types of aggregations in Polars, including grouping by categories and time. We'll analyze sales data from a clothing store, focusing on three product categories: hats, socks, and sweaters.
        """
    )
    return


@app.cell
def _():
    import polars as pl

    df = (pl.read_csv('https://raw.githubusercontent.com/jorammutenge/learn-rust/refs/heads/main/sample_sales.csv', try_parse_dates=True)
          .rename(lambda col: col.replace(' ','_').lower())
          )
    df
    return df, pl

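
@app.cell
def _(df):
    # Hedged sketch, not part of the original notebook: a quick look at the
    # inferred schema. `try_parse_dates=True` should have read the `date`
    # column as a datetime, which the time-based group-bys below rely on.
    df.schema
    return
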
@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
        ## Grouping by category
        ### With single category
        Let's find out how many of each product category we sold.
        """
    )
    return


@app.cell
def _(df, pl):
    (df
     .group_by('category')
     .agg(pl.sum('quantity'))
     )
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
        It looks like we sold more sweaters. Maybe this was during the winter season.

        Let's add another aggregation to see how much was spent on the total units for each product.
        """
    )
    return


@app.cell
def _(df, pl):
    (df
     .group_by('category')
     .agg(pl.sum('quantity'),
          pl.sum('ext_price'))
     )
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""We could also write the aggregation for both columns as a single expression.""")
    return


@app.cell
def _(df, pl):
    (df
     .group_by('category')
     .agg(pl.sum('quantity', 'ext_price'))
     )
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""Actually, the way we've been writing these aggregations is syntactic sugar. Here's the longer form, as shown in the Polars documentation.""")
    return


@app.cell
def _(df, pl):
    (df
     .group_by('category')
     .agg(pl.col('quantity').sum(),
          pl.col('ext_price').sum())
     )
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
        ### With multiple categories
        We can also group by multiple categories. Let's find out how many items we sold in each product category for each SKU. This more detailed aggregation will produce more rows than the previous DataFrame.
        """
    )
    return


@app.cell
def _(df, pl):
    (df
     .group_by('category', 'sku')
     .agg(pl.sum('quantity'))
     )
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
        Aggregations when grouping data are not limited to sums. You can also use functions like `max`, `min`, `median`, `first`, and `last`.

        Let's find the largest sale quantity for each product category.
        """
    )
    return


@app.cell
def _(df, pl):
    (df
     .group_by('category')
     .agg(pl.max('quantity'))
     )
    return

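
@app.cell
def _(df, pl):
    # Hedged sketch, not part of the original notebook: the other functions
    # mentioned above (`min`, `median`, `last`) can be combined with `max` in a
    # single `.agg(...)` call, with aliases to tell the results apart.
    (df
     .group_by('category')
     .agg(pl.max('quantity').alias('max_qty'),
          pl.min('quantity').alias('min_qty'),
          pl.median('quantity').alias('median_qty'))
     )
    return
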
@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
        Let's make the aggregation more interesting. We'll identify the first customer to purchase each item, along with the quantity they bought and the amount they spent.

        **Note:** To make this work, we'll have to sort the data by date from earliest to latest.
        """
    )
    return


@app.cell
def _(df, pl):
    (df
     .sort('date')
     .group_by('category')
     .agg(pl.first('account_name', 'quantity', 'ext_price'))
     )
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
        ## Grouping by time
        Since `datetime` is a special data type in Polars, we can perform various group-by aggregations on it.

        Our dataset spans a two-year period. Let's calculate the total dollar sales for each year. We'll do it the naive way first so you can appreciate grouping with time.
        """
    )
    return


@app.cell
def _(df, pl):
    (df
     .with_columns(year=pl.col('date').dt.year())
     .group_by('year')
     .agg(pl.sum('ext_price').round(2))
     )
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
        We had more sales in 2014.

        Now let's perform the above operation by grouping with time. This requires sorting the DataFrame first.
        """
    )
    return


@app.cell
def _(df, pl):
    (df
     .sort('date')
     .group_by_dynamic('date', every='1y')
     .agg(pl.sum('ext_price'))
     )
    return

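
@app.cell
def _(df, pl):
    # Hedged sketch, not part of the original notebook: with `group_by_dynamic`,
    # the `date` column holds the start of each yearly window, so extracting the
    # year makes this output directly comparable to the naive version above.
    (df
     .sort('date')
     .group_by_dynamic('date', every='1y')
     .agg(pl.sum('ext_price').round(2))
     .with_columns(year=pl.col('date').dt.year())
     )
    return
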
@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
        The beauty of grouping with time is that it allows us to resample the data by selecting whatever time interval we want.

        Let's find out what the quarterly sales were for 2014.
        """
    )
    return


@app.cell
def _(df, pl):
    (df
     .filter(pl.col('date').dt.year() == 2014)
     .sort('date')
     .group_by_dynamic('date', every='1q')
     .agg(pl.sum('ext_price'))
     )
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
        Here's an interesting question we can answer that takes advantage of grouping by time.

        Let's find the hour of the day when we had the most sales in dollars.
        """
    )
    return


@app.cell
def _(df, pl):
    (df
     .sort('date')
     .group_by_dynamic('date', every='1h')
     .agg(pl.max('ext_price'))
     .filter(pl.col('ext_price') == pl.col('ext_price').max())
     )
    return

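
@app.cell
def _(df, pl):
    # Hedged sketch, not part of the original notebook: the cell above keeps the
    # hourly window containing the single largest sale. If "most sales" should
    # mean the largest total revenue in an hour, sum per window instead.
    (df
     .sort('date')
     .group_by_dynamic('date', every='1h')
     .agg(pl.sum('ext_price'))
     .filter(pl.col('ext_price') == pl.col('ext_price').max())
     )
    return
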
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""Just for fun, let's find the median number of items sold in each SKU and the total dollar amount in each SKU every six days.""")
    return


@app.cell
def _(df, pl):
    (df
     .sort('date')
     .group_by_dynamic('date', every='6d')
     .agg(pl.first('sku'),
          pl.median('quantity'),
          pl.sum('ext_price'))
     )
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""Let's rename the columns to clearly indicate the type of aggregation performed. This will help us identify the aggregation method used on a column without needing to check the code.""")
    return


@app.cell
def _(df, pl):
    (df
     .sort('date')
     .group_by_dynamic('date', every='6d')
     .agg(pl.first('sku'),
          pl.median('quantity').alias('median_qty'),
          pl.sum('ext_price').alias('total_dollars'))
     )
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
        ## Grouping with over

        Sometimes, we may want to perform an aggregation but also keep all the columns and rows of the DataFrame.

        Let's assign a value to indicate the number of times each customer visited and bought something.
        """
    )
    return


@app.cell
def _(df, pl):
    (df
     .with_columns(buy_freq=pl.col('account_name').len().over('account_name'))
     )
    return

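
@app.cell
def _(df, pl):
    # Hedged sketch, not part of the original notebook: `over` works with other
    # aggregations too, e.g. attaching each customer's total quantity purchased
    # to every one of their rows while keeping the DataFrame's full shape.
    (df
     .with_columns(total_qty=pl.col('quantity').sum().over('account_name'))
     )
    return
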
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""Finally, let's determine which customers visited the store the most and bought something.""")
    return


@app.cell
def _(df, pl):
    (df
     .with_columns(buy_freq=pl.col('account_name').len().over('account_name'))
     .filter(pl.col('buy_freq') == pl.col('buy_freq').max())
     .select('account_name', 'buy_freq')
     .unique()
     )
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""There's more you can do with aggregations in Polars. We hope this notebook has armed you with the tools to get started.""")
    return


@app.cell
def _():
    return


if __name__ == "__main__":
    app.run()