akshayka committed on
Commit
f7b9fba
·
unverified ·
2 Parent(s): cbef791 f57b8b5

Merge pull request #44 from jorammutenge/basic_ops

Browse files
Files changed (1) hide show
  1. polars/04_basic_operations.py +631 -0
polars/04_basic_operations.py ADDED
@@ -0,0 +1,631 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # /// script
2
+ # requires-python = ">=3.13"
3
+ # dependencies = [
4
+ # "marimo",
5
+ # "polars==1.23.0",
6
+ # ]
7
+ # ///
8
+
9
+ import marimo
10
+
11
# Version string recorded in the file header (presumably the marimo release
# that wrote this notebook -- confirm against marimo's file format docs).
__generated_with = "0.11.13"
# Notebook application object; every cell below registers on it via @app.cell.
app = marimo.App(width="medium")
13
+
14
+
15
@app.cell
def _():
    # Import marimo under the alias `mo` and hand it to the cells that take
    # `mo` as a parameter (the markdown cells).
    import marimo as mo
    return (mo,)
19
+
20
+
21
@app.cell(hide_code=True)
def _(mo):
    # Title, author credit, and a short description of what the notebook covers.
    mo.md(
        r"""
        # Basic operations on data
        _By [Joram Mutenge](https://www.udemy.com/user/joram-mutenge/)._

        In this notebook, you'll learn how to perform arithmetic operations, comparisons, and conditionals on a Polars dataframe. We'll work with a DataFrame that tracks software usage by year, categorized as either Vintage (old) or Modern (new).
        """
    )
    return
32
+
33
+
34
@app.cell
def _():
    import polars as pl

    # Build each column's values first, then assemble them into the frame in
    # the order: software, users, category, year.
    _software = [
        "Lotus-123",
        "WordStar",
        "dBase III",
        "VisiCalc",
        "WinZip",
        "MS-DOS",
        "HyperCard",
        "WordPerfect",
        "Excel",
        "Photoshop",
        "Visual Studio",
        "Slack",
        "Zoom",
        "Notion",
        "Figma",
        "Spotify",
        "VSCode",
        "Docker",
    ]
    _users = [
        10_000,
        4_500,
        2_500,
        3_000,
        1_800,
        17_000,
        2_200,
        1_900,
        500_000,
        12_000_000,
        1_500_000,
        3_000_000,
        4_000_000,
        2_000_000,
        2_500_000,
        4_500_000,
        6_000_000,
        3_500_000,
    ]
    # The first 8 rows are labelled "Vintage", the remaining 10 "Modern".
    _category = ["Vintage"] * 8 + ["Modern"] * 10
    _year = [
        1985,
        1980,
        1984,
        1979,
        1991,
        1981,
        1987,
        1982,
        1987,
        1990,
        1997,
        2013,
        2011,
        2016,
        2016,
        2008,
        2015,
        2013,
    ]

    df = pl.DataFrame(
        {
            "software": _software,
            "users": _users,
            "category": _category,
            "year": _year,
        }
    )

    df
    return df, pl
106
+
107
+
108
@app.cell(hide_code=True)
def _(mo):
    # Section headings plus the description of the addition example.
    mo.md(
        r"""
        ## Arithmetic
        ### Addition
        Let's add 42 users to each piece of software. This means adding 42 to each value under **users**.
        """
    )
    return
118
+
119
+
120
@app.cell
def _(df, pl):
    # Operator form: add 42 to every value in the "users" column.
    _users_plus_42 = pl.col("users") + 42
    df.with_columns(_users_plus_42)
    return
124
+
125
+
126
@app.cell(hide_code=True)
def _(mo):
    # Narrative text rendered as markdown.
    _text = r"""Another way to perform the above operation is using the built-in function."""
    mo.md(_text)
    return
130
+
131
+
132
@app.cell
def _(df, pl):
    # Method form of the same addition: Expr.add instead of the `+` operator.
    _users_plus_42 = pl.col("users").add(42)
    df.with_columns(_users_plus_42)
    return
136
+
137
+
138
@app.cell(hide_code=True)
def _(mo):
    # Section heading and description of the subtraction example.
    mo.md(
        r"""
        ### Subtraction
        Let's subtract 42 users from each piece of software.
        """
    )
    return
147
+
148
+
149
@app.cell
def _(df, pl):
    # Operator form: subtract 42 from every value in the "users" column.
    _users_minus_42 = pl.col("users") - 42
    df.with_columns(_users_minus_42)
    return
153
+
154
+
155
@app.cell(hide_code=True)
def _(mo):
    # Narrative text rendered as markdown.
    _text = r"""Alternatively, you could subtract like this:"""
    mo.md(_text)
    return
159
+
160
+
161
@app.cell
def _(df, pl):
    # Method form of the same subtraction: Expr.sub instead of `-`.
    _users_minus_42 = pl.col("users").sub(42)
    df.with_columns(_users_minus_42)
    return
165
+
166
+
167
@app.cell(hide_code=True)
def _(mo):
    # Section heading and description of the division example.
    mo.md(
        r"""
        ### Division
        Suppose the **users** values are inflated. We can reduce them by dividing by 1000. Here's how to do it.
        """
    )
    return
176
+
177
+
178
@app.cell
def _(df, pl):
    # Operator form: true division of "users" by 1000.
    _users_in_thousands = pl.col("users") / 1000
    df.with_columns(_users_in_thousands)
    return
182
+
183
+
184
@app.cell(hide_code=True)
def _(mo):
    # Narrative text rendered as markdown.
    _text = r"""Or we could do it with a built-in expression."""
    mo.md(_text)
    return
188
+
189
+
190
@app.cell
def _(df, pl):
    # Method form of the same division: Expr.truediv instead of `/`.
    _users_in_thousands = pl.col("users").truediv(1000)
    df.with_columns(_users_in_thousands)
    return
194
+
195
+
196
@app.cell(hide_code=True)
def _(mo):
    # Lead-in text for the floor-division example.
    mo.md(r"""If we didn't care about the remainder after division (i.e., we want to drop everything after the decimal point), we could do it like this.""")
    return
200
+
201
+
202
@app.cell
def _(df, pl):
    # Floor division: "users" divided by 1000 with the remainder discarded.
    _whole_thousands = pl.col("users").floordiv(1000)
    df.with_columns(_whole_thousands)
    return
206
+
207
+
208
@app.cell(hide_code=True)
def _(mo):
    # Section heading and description of the multiplication example.
    mo.md(
        r"""
        ### Multiplication
        Let's pretend the **users** values are deflated and increase them by multiplying by 100.
        """
    )
    return
217
+
218
+
219
@app.cell
def _(df, pl):
    # Operator form: multiply every value in "users" by 100.
    _users_times_100 = pl.col("users") * 100
    df.with_columns(_users_times_100)
    return
223
+
224
+
225
@app.cell(hide_code=True)
def _(mo):
    # Narrative text rendered as markdown.
    _text = r"""Polars also has a built-in function for multiplication."""
    mo.md(_text)
    return
229
+
230
+
231
@app.cell
def _(df, pl):
    # Method form of the same multiplication: Expr.mul instead of `*`.
    _users_times_100 = pl.col("users").mul(100)
    df.with_columns(_users_times_100)
    return
235
+
236
+
237
@app.cell(hide_code=True)
def _(mo):
    # Narrative text rendered as markdown.
    _text = r"""So far, we've only modified the values in an existing column. Let's create a column **decade** that will represent the years as decades. Thus 1985 will be 1980 and 2008 will be 2000."""
    mo.md(_text)
    return
241
+
242
+
243
@app.cell
def _(df, pl):
    # Floor-divide by 10, then multiply by 10, so a year maps to the start of
    # its decade (e.g. 1985 -> 1980). The keyword argument names the new column.
    _decade = pl.col("year").floordiv(10).mul(10)
    df.with_columns(decade=_decade)
    return
247
+
248
+
249
@app.cell(hide_code=True)
def _(mo):
    # Narrative text rendered as markdown.
    _text = r"""We could create a new column another way as follows:"""
    mo.md(_text)
    return
253
+
254
+
255
@app.cell
def _(df, pl):
    # Same decade computation, but the new column is named with .alias()
    # instead of a keyword argument.
    _decade = pl.col("year").floordiv(10).mul(10).alias("decade")
    df.with_columns(_decade)
    return
259
+
260
+
261
@app.cell(hide_code=True)
def _(mo):
    # Chaining tip, followed by the headings and description for the first
    # comparison example.
    mo.md(
        r"""
        **Tip**
        Polars encourages you to perform your operations as a chain. This enables you to take advantage of the query optimizer. We'll build upon the above code as a chain.

        ## Comparison
        ### Equal
        Let's get all the software categorized as Vintage.
        """
    )
    return
274
+
275
+
276
@app.cell
def _(df, pl):
    # Add the decade column, then keep only rows whose category equals "Vintage".
    _with_decade = df.with_columns(decade=pl.col("year").floordiv(10).mul(10))
    _with_decade.filter(pl.col("category") == "Vintage")
    return
283
+
284
+
285
@app.cell(hide_code=True)
def _(mo):
    # Lead-in text for the two-condition filter; the product name matches the
    # "VisiCalc" entry in the dataframe.
    mo.md(r"""We could also do a double comparison. VisiCalc is the only software that's vintage and in the decade 1970s. Let's perform this comparison operation.""")
    return
289
+
290
+
291
@app.cell
def _(df, pl):
    # Two conditions applied as two successive filters.
    _with_decade = df.with_columns(decade=pl.col("year").floordiv(10).mul(10))
    _vintage = _with_decade.filter(pl.col("category") == "Vintage")
    _vintage.filter(pl.col("decade") == 1970)
    return
299
+
300
+
301
@app.cell(hide_code=True)
def _(mo):
    # Lead-in text for the single-filter form that combines conditions with `&`.
    mo.md(
        r"""
        We could also do this comparison in one line, if readability is not a concern.

        **Notice** that we must enclose the two expressions on either side of the `&` in parentheses.
        """
    )
    return
311
+
312
+
313
@app.cell
def _(df, pl):
    # Both conditions in a single filter. Each comparison stays wrapped in its
    # own parentheses because `&` binds more tightly than `==` in Python.
    _with_decade = df.with_columns(decade=pl.col("year").floordiv(10).mul(10))
    _with_decade.filter(
        (pl.col("category") == "Vintage") & (pl.col("decade") == 1970)
    )
    return
320
+
321
+
322
@app.cell(hide_code=True)
def _(mo):
    # Narrative text rendered as markdown.
    _text = r"""We can also use the built-in function for equal to comparisons."""
    mo.md(_text)
    return
326
+
327
+
328
@app.cell
def _(df, pl):
    # Method form of the equality filter: Expr.eq instead of `==`.
    _with_decade = df.with_columns(decade=pl.col("year").floordiv(10).mul(10))
    _with_decade.filter(pl.col("category").eq("Vintage"))
    return
335
+
336
+
337
@app.cell(hide_code=True)
def _(mo):
    # Section heading and description of the not-equal example.
    mo.md(
        r"""
        ### Not equal
        We can also compare if something is `not` equal to something. In this case, category is not vintage.
        """
    )
    return
346
+
347
+
348
@app.cell
def _(df, pl):
    # Operator form: keep rows whose category differs from "Vintage".
    _with_decade = df.with_columns(decade=pl.col("year").floordiv(10).mul(10))
    _with_decade.filter(pl.col("category") != "Vintage")
    return
355
+
356
+
357
@app.cell(hide_code=True)
def _(mo):
    # Narrative text rendered as markdown.
    _text = r"""Or with the built-in function."""
    mo.md(_text)
    return
361
+
362
+
363
@app.cell
def _(df, pl):
    # Method form of the not-equal filter: Expr.ne instead of `!=`.
    _with_decade = df.with_columns(decade=pl.col("year").floordiv(10).mul(10))
    _with_decade.filter(pl.col("category").ne("Vintage"))
    return
370
+
371
+
372
@app.cell(hide_code=True)
def _(mo):
    # Narrative text rendered as markdown.
    _text = r"""Or if you want to be extra clever, you can use the negation symbol `~` used in logic."""
    mo.md(_text)
    return
376
+
377
+
378
@app.cell
def _(df, pl):
    # Negate the equality test with `~` to keep the non-"Vintage" rows.
    _with_decade = df.with_columns(decade=pl.col("year").floordiv(10).mul(10))
    _with_decade.filter(~pl.col("category").eq("Vintage"))
    return
385
+
386
+
387
@app.cell(hide_code=True)
def _(mo):
    # Section heading and description of the greater-than example.
    mo.md(
        r"""
        ### Greater than
        Let's get the software where the year is greater than 2008 from the above dataframe.
        """
    )
    return
396
+
397
+
398
@app.cell
def _(df, pl):
    # Non-"Vintage" rows whose year is strictly greater than 2008 (operator form).
    _not_vintage = ~pl.col("category").eq("Vintage")
    _with_decade = df.with_columns(decade=pl.col("year").floordiv(10).mul(10))
    _with_decade.filter(_not_vintage).filter(pl.col("year") > 2008)
    return
406
+
407
+
408
@app.cell(hide_code=True)
def _(mo):
    # Lead-in text for the greater-than-or-equal example.
    mo.md(r"""Or if we wanted the year 2008 to be included, we could use greater than or equal to.""")
    return
412
+
413
+
414
@app.cell
def _(df, pl):
    # Non-"Vintage" rows whose year is 2008 or later (operator form).
    _not_vintage = ~pl.col("category").eq("Vintage")
    _with_decade = df.with_columns(decade=pl.col("year").floordiv(10).mul(10))
    _with_decade.filter(_not_vintage).filter(pl.col("year") >= 2008)
    return
422
+
423
+
424
@app.cell(hide_code=True)
def _(mo):
    # Narrative text rendered as markdown.
    _text = r"""We could do the previous two operations with built-in functions. Here's with greater than."""
    mo.md(_text)
    return
428
+
429
+
430
@app.cell
def _(df, pl):
    # Method form of the greater-than filter: Expr.gt instead of `>`.
    _not_vintage = ~pl.col("category").eq("Vintage")
    _with_decade = df.with_columns(decade=pl.col("year").floordiv(10).mul(10))
    _with_decade.filter(_not_vintage).filter(pl.col("year").gt(2008))
    return
438
+
439
+
440
@app.cell(hide_code=True)
def _(mo):
    # Lead-in text for the method form of greater-than-or-equal.
    mo.md(r"""And here's with greater than or equal to.""")
    return
444
+
445
+
446
@app.cell
def _(df, pl):
    # Method form of the greater-than-or-equal filter: Expr.ge instead of `>=`.
    _not_vintage = ~pl.col("category").eq("Vintage")
    _with_decade = df.with_columns(decade=pl.col("year").floordiv(10).mul(10))
    _with_decade.filter(_not_vintage).filter(pl.col("year").ge(2008))
    return
454
+
455
+
456
@app.cell(hide_code=True)
def _(mo):
    # Note on the less-than variants, then the heading and description for the
    # range-filter example.
    mo.md(
        r"""
        **Note**: For "less than" and "less than or equal to", you can use the operators `<` and `<=`. Alternatively, you can use the built-in functions `lt` and `le`, respectively.

        ### Is between
        Polars also allows us to filter between a range of values. Let's get the modern software where the year is between 2013 and 2016. This is inclusive on both ends (i.e. both years are part of the result).
        """
    )
    return
467
+
468
+
469
@app.cell
def _(df, pl):
    # "Modern" rows whose year falls in the range 2013..2016.
    _with_decade = df.with_columns(decade=pl.col("year").floordiv(10).mul(10))
    _modern = _with_decade.filter(pl.col("category").eq("Modern"))
    _modern.filter(pl.col("year").is_between(2013, 2016))
    return
477
+
478
+
479
@app.cell(hide_code=True)
def _(mo):
    # Section heading and description of the `|` (or) example.
    mo.md(
        r"""
        ### Or operator
        If we only want either one of the conditions in the comparison to be met, we could use `|`, which is the `or` operator.

        Let's get software that is either modern or used in the decade 1980s.
        """
    )
    return
490
+
491
+
492
@app.cell
def _(df, pl):
    # Keep rows that satisfy either condition. Each comparison is wrapped in
    # parentheses because `|` binds more tightly than `==` in Python.
    _with_decade = df.with_columns(decade=pl.col("year").floordiv(10).mul(10))
    _with_decade.filter(
        (pl.col("category") == "Modern") | (pl.col("decade") == 1980)
    )
    return
499
+
500
+
501
@app.cell(hide_code=True)
def _(mo):
    # Section heading and description of the conditional-column example.
    mo.md(
        r"""
        ## Conditionals
        Polars also allows you to create new columns based on a condition. Let's create a column *status* that will indicate if the software is "discontinued" or "in use".

        Here's a list of products that are no longer in use.
        """
    )
    return
512
+
513
+
514
@app.cell
def _():
    # Product names that the later cells treat as discontinued.
    discontinued_list = [
        "Lotus-123",
        "WordStar",
        "dBase III",
        "VisiCalc",
        "MS-DOS",
        "HyperCard",
    ]
    return (discontinued_list,)
518
+
519
+
520
@app.cell(hide_code=True)
def _(mo):
    # Narrative text rendered as markdown.
    _text = r"""Here's how we can get a dataframe of the products that are discontinued."""
    mo.md(_text)
    return
524
+
525
+
526
@app.cell
def _(df, discontinued_list, pl):
    # Keep only the rows whose software name appears in discontinued_list.
    _is_discontinued = pl.col("software").is_in(discontinued_list)
    _with_decade = df.with_columns(decade=pl.col("year").floordiv(10).mul(10))
    _with_decade.filter(_is_discontinued)
    return
533
+
534
+
535
@app.cell(hide_code=True)
def _(mo):
    # Narrative text rendered as markdown.
    _text = r"""Now, let's create the **status** column."""
    mo.md(_text)
    return
539
+
540
+
541
@app.cell
def _(df, discontinued_list, pl):
    # when/then/otherwise: label a row "Discontinued" if its software is in
    # discontinued_list, otherwise "In use".
    _status = (
        pl.when(pl.col("software").is_in(discontinued_list))
        .then(pl.lit("Discontinued"))
        .otherwise(pl.lit("In use"))
    )
    _with_decade = df.with_columns(decade=pl.col("year").floordiv(10).mul(10))
    _with_decade.with_columns(status=_status)
    return
552
+
553
+
554
@app.cell(hide_code=True)
def _(mo):
    # Section heading and description of the unique-values example.
    mo.md(
        r"""
        ## Unique counts
        Sometimes you may want to see only the unique values in a column. Let's check the unique decades we have in our DataFrame.
        """
    )
    return
563
+
564
+
565
@app.cell
def _(df, discontinued_list, pl):
    # Distinct decades in the data. `unique()` makes no promise about row
    # order unless maintain_order=True, so the result is sorted to keep the
    # displayed table stable (and chronological) from run to run.
    (
        df.with_columns(decade=pl.col("year").floordiv(10).mul(10))
        .with_columns(
            pl.when(pl.col("software").is_in(discontinued_list))
            .then(pl.lit("Discontinued"))
            .otherwise(pl.lit("In use"))
            .alias("status")
        )
        .select("decade")
        .unique()
        .sort("decade")
    )
    return
577
+
578
+
579
@app.cell(hide_code=True)
def _(mo):
    # Narrative text rendered as markdown.
    _text = r"""Finally, let's find out the number of software used in each decade."""
    mo.md(_text)
    return
583
+
584
+
585
@app.cell
def _(df, discontinued_list, pl):
    # Count rows per decade by indexing the frame with ["decade"] to get a
    # Series. value_counts() returns its rows in no guaranteed order by
    # default, so sort by decade for a stable, chronological table.
    (
        df.with_columns(decade=pl.col("year").floordiv(10).mul(10))
        .with_columns(
            pl.when(pl.col("software").is_in(discontinued_list))
            .then(pl.lit("Discontinued"))
            .otherwise(pl.lit("In use"))
            .alias("status")
        )["decade"]
        .value_counts()
        .sort("decade")
    )
    return
597
+
598
+
599
@app.cell(hide_code=True)
def _(mo):
    # Narrative text rendered as markdown.
    _text = r"""We could also rewrite the above code as follows:"""
    mo.md(_text)
    return
603
+
604
+
605
@app.cell
def _(df, discontinued_list, pl):
    # Same per-decade count, reaching the Series through select().to_series()
    # instead of bracket indexing. value_counts() returns its rows in no
    # guaranteed order by default, so sort by decade for a stable table.
    (
        df.with_columns(decade=pl.col("year").floordiv(10).mul(10))
        .with_columns(
            pl.when(pl.col("software").is_in(discontinued_list))
            .then(pl.lit("Discontinued"))
            .otherwise(pl.lit("In use"))
            .alias("status")
        )
        .select("decade")
        .to_series()
        .value_counts()
        .sort("decade")
    )
    return
617
+
618
+
619
@app.cell(hide_code=True)
def _(mo):
    # Closing remark for the notebook.
    mo.md(r"""Hopefully, we've piqued your interest to try out Polars the next time you analyze your data.""")
    return
623
+
624
+
625
@app.cell
def _():
    # Empty trailing cell: it takes no inputs, runs no code, and defines nothing.
    return
628
+
629
+
630
# Run the notebook app only when this file is executed directly, not on import.
if __name__ == "__main__":
    app.run()