Akshay Agrawal committed on
Commit
5e743f8
·
1 Parent(s): 7b36ccd

author: attribute peter-gy

Browse files

And nit for hiding markdown code.

Files changed (2) hide show
  1. polars/09_strings.py +35 -35
  2. polars/README.md +1 -0
polars/09_strings.py CHANGED
@@ -14,7 +14,7 @@ __generated_with = "0.11.17"
14
  app = marimo.App(width="medium")
15
 
16
 
17
- @app.cell
18
  def _(mo):
19
  mo.md(
20
  r"""
@@ -30,7 +30,7 @@ def _(mo):
30
  return
31
 
32
 
33
- @app.cell
34
  def _(mo):
35
  mo.md(
36
  r"""
@@ -43,7 +43,7 @@ def _(mo):
43
  return
44
 
45
 
46
- @app.cell(hide_code=True)
47
  def _(pl):
48
  pip_metadata_raw_df = pl.DataFrame(
49
  [
@@ -56,7 +56,7 @@ def _(pl):
56
  return (pip_metadata_raw_df,)
57
 
58
 
59
- @app.cell
60
  def _(mo):
61
  mo.md(r"""We can use the [`json_decode`](https://docs.pola.rs/api/python/stable/reference/series/api/polars.Series.str.json_decode.html) expression to parse the raw JSON strings into Polars-native structs and we can use the [unnest](https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.unnest.html) dataframe operation to have a dedicated column per parsed attribute.""")
62
  return
@@ -69,13 +69,13 @@ def _(pip_metadata_raw_df, pl):
69
  return (pip_metadata_df,)
70
 
71
 
72
- @app.cell
73
  def _(mo):
74
  mo.md(r"""This is already a much friendlier representation of the data we started out with, but note that since the JSON entries had only string attributes, all values are strings, even the temporal `released_at` and numerical `size_mb` columns.""")
75
  return
76
 
77
 
78
- @app.cell
79
  def _(mo):
80
  mo.md(r"""As we know that the `size_mb` column should have a decimal representation, we go ahead and use [`to_decimal`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.to_decimal.html#polars.Expr.str.to_decimal) to perform the conversion.""")
81
  return
@@ -91,7 +91,7 @@ def _(pip_metadata_df, pl):
91
  return
92
 
93
 
94
- @app.cell
95
  def _(mo):
96
  mo.md(
97
  r"""
@@ -127,7 +127,7 @@ def _(pip_metadata_df, pl):
127
  return
128
 
129
 
130
- @app.cell
131
  def _(mo):
132
  mo.md(r"""Alternatively, instead of using three different functions to perform the conversion to date, we can use a single one, [`strptime`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.strptime.html) which takes the desired temporal data type as its first parameter.""")
133
  return
@@ -145,7 +145,7 @@ def _(pip_metadata_df, pl):
145
  return
146
 
147
 
148
- @app.cell
149
  def _(mo):
150
  mo.md(r"""And to wrap up this section on parsing and conversion, let's consider a final scenario. What if we don't want to parse the entire raw JSON string, because we only need a subset of its attributes? Well, in this case we can leverage the [`json_path_match`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.json_path_match.html) expression to extract only the desired attributes using standard [JSONPath](https://goessner.net/articles/JsonPath/) syntax.""")
151
  return
@@ -163,7 +163,7 @@ def _(pip_metadata_raw_df, pl):
163
  return
164
 
165
 
166
- @app.cell
167
  def _(mo):
168
  mo.md(
169
  r"""
@@ -217,7 +217,7 @@ def _(pl):
217
  return expressions_df, list_expr_meta, list_members
218
 
219
 
220
- @app.cell
221
  def _(mo):
222
  mo.md(r"""As the following visualization shows, `str` is one of the richest Polars expression namespaces with multiple dozens of functions in it.""")
223
  return
@@ -232,7 +232,7 @@ def _(alt, expressions_df):
232
  return
233
 
234
 
235
- @app.cell
236
  def _(mo):
237
  mo.md(
238
  r"""
@@ -260,7 +260,7 @@ def _(expressions_df, pl):
260
  return (docstring_length_df,)
261
 
262
 
263
- @app.cell
264
  def _(mo):
265
  mo.md(r"""As the dataframe preview above and the scatterplot below show, the docstring length measured in bytes is almost always bigger than the length expressed in characters. This is due to the fact that the docstrings include characters which require more than a single byte to represent, such as "╞" for displaying dataframe header and body separators.""")
266
  return
@@ -276,7 +276,7 @@ def _(alt, docstring_length_df):
276
  return
277
 
278
 
279
- @app.cell
280
  def _(mo):
281
  mo.md(
282
  r"""
@@ -298,7 +298,7 @@ def _(expressions_df, pl):
298
  return
299
 
300
 
301
- @app.cell
302
  def _(mo):
303
  mo.md(
304
  r"""
@@ -338,7 +338,7 @@ def _(mo, padded_df, padding):
338
  return
339
 
340
 
341
- @app.cell
342
  def _(mo):
343
  mo.md(
344
  r"""
@@ -362,7 +362,7 @@ def _(expressions_df, pl):
362
  return
363
 
364
 
365
- @app.cell
366
  def _(mo):
367
  mo.md(
368
  r"""
@@ -388,7 +388,7 @@ def _(expressions_df, pl):
388
  return
389
 
390
 
391
- @app.cell
392
  def _(mo):
393
  mo.md(
394
  r"""
@@ -412,7 +412,7 @@ def _(expressions_df, pl):
412
  return
413
 
414
 
415
- @app.cell
416
  def _(mo):
417
  mo.md(
418
  r"""
@@ -434,7 +434,7 @@ def _(expressions_df, pl):
434
  return
435
 
436
 
437
- @app.cell
438
  def _(mo):
439
  mo.md(
440
  r"""
@@ -460,7 +460,7 @@ def _(expressions_df, pl):
460
  return
461
 
462
 
463
- @app.cell
464
  def _(mo):
465
  mo.md(r"""For scenarios where we want to combine multiple substrings to check for, we can use the [`contains`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.contains.html) expression to check for the presence of various patterns.""")
466
  return
@@ -476,7 +476,7 @@ def _(expressions_df, pl):
476
  return
477
 
478
 
479
- @app.cell
480
  def _(mo):
481
  mo.md(
482
  r"""
@@ -506,7 +506,7 @@ def _(expressions_df, pl):
506
  return
507
 
508
 
509
- @app.cell
510
  def _(mo):
511
  mo.md(r"""A related application example is to *find* the first index where a particular pattern is present, so that it can be used for downstream processing such as slicing. Below we use the [`find`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.find.html) expression to determine the index at which a code example starts in the docstring - identified by the Python shell substring `">>>"`.""")
512
  return
@@ -522,7 +522,7 @@ def _(expressions_df, pl):
522
  return
523
 
524
 
525
- @app.cell
526
  def _(mo):
527
  mo.md(
528
  r"""
@@ -562,7 +562,7 @@ def _(mo, slice, sliced_df):
562
  return
563
 
564
 
565
- @app.cell
566
  def _(mo):
567
  mo.md(
568
  r"""
@@ -589,7 +589,7 @@ def _(expressions_df, pl):
589
  return
590
 
591
 
592
- @app.cell
593
  def _(mo):
594
  mo.md(r"""As a more practical example, we can use the `split` expression with some aggregation to count the number of times a particular word occurs in member names across all namespaces. This enables us to create a word cloud of the API members' constituents!""")
595
  return
@@ -643,7 +643,7 @@ def _(alt, expressions_df, pl, random, wordcloud_height, wordcloud_width):
643
  return wordcloud, wordcloud_df
644
 
645
 
646
- @app.cell
647
  def _(mo):
648
  mo.md(
649
  r"""
@@ -677,7 +677,7 @@ def _(expressions_df, pl):
677
  return (descriptions_df,)
678
 
679
 
680
- @app.cell
681
  def _(mo):
682
  mo.md(
683
  r"""
@@ -706,7 +706,7 @@ def _(descriptions_df, mo, pl):
706
  return
707
 
708
 
709
- @app.cell
710
  def _(mo):
711
  mo.md(
712
  r"""
@@ -734,7 +734,7 @@ def _(expressions_df, pl):
734
  return (url_pattern,)
735
 
736
 
737
- @app.cell
738
  def _(mo):
739
  mo.md(
740
  r"""
@@ -758,7 +758,7 @@ def _(expressions_df, pl):
758
  return
759
 
760
 
761
- @app.cell
762
  def _(mo):
763
  mo.md(
764
  r"""
@@ -783,7 +783,7 @@ def _(expressions_df, pl):
783
  return
784
 
785
 
786
- @app.cell
787
  def _(mo):
788
  mo.md(
789
  r"""
@@ -807,7 +807,7 @@ def _(expressions_df, pl):
807
  return
808
 
809
 
810
- @app.cell
811
  def _(mo):
812
  mo.md(
813
  r"""
@@ -830,7 +830,7 @@ def _(expressions_df, pl):
830
  return (encoded_df,)
831
 
832
 
833
- @app.cell
834
  def _(mo):
835
  mo.md(r"""And of course, you can convert back into a human-readable representation using the [`decode`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.decode.html) expression.""")
836
  return
@@ -845,7 +845,7 @@ def _(encoded_df, pl):
845
  return
846
 
847
 
848
- @app.cell
849
  def _(mo):
850
  mo.md(
851
  r"""
 
14
  app = marimo.App(width="medium")
15
 
16
 
17
+ @app.cell(hide_code=True)
18
  def _(mo):
19
  mo.md(
20
  r"""
 
30
  return
31
 
32
 
33
+ @app.cell(hide_code=True)
34
  def _(mo):
35
  mo.md(
36
  r"""
 
43
  return
44
 
45
 
46
+ @app.cell
47
  def _(pl):
48
  pip_metadata_raw_df = pl.DataFrame(
49
  [
 
56
  return (pip_metadata_raw_df,)
57
 
58
 
59
+ @app.cell(hide_code=True)
60
  def _(mo):
61
  mo.md(r"""We can use the [`json_decode`](https://docs.pola.rs/api/python/stable/reference/series/api/polars.Series.str.json_decode.html) expression to parse the raw JSON strings into Polars-native structs and we can use the [unnest](https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.unnest.html) dataframe operation to have a dedicated column per parsed attribute.""")
62
  return
 
69
  return (pip_metadata_df,)
70
 
71
 
72
+ @app.cell(hide_code=True)
73
  def _(mo):
74
  mo.md(r"""This is already a much friendlier representation of the data we started out with, but note that since the JSON entries had only string attributes, all values are strings, even the temporal `released_at` and numerical `size_mb` columns.""")
75
  return
76
 
77
 
78
+ @app.cell(hide_code=True)
79
  def _(mo):
80
  mo.md(r"""As we know that the `size_mb` column should have a decimal representation, we go ahead and use [`to_decimal`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.to_decimal.html#polars.Expr.str.to_decimal) to perform the conversion.""")
81
  return
 
91
  return
92
 
93
 
94
+ @app.cell(hide_code=True)
95
  def _(mo):
96
  mo.md(
97
  r"""
 
127
  return
128
 
129
 
130
+ @app.cell(hide_code=True)
131
  def _(mo):
132
  mo.md(r"""Alternatively, instead of using three different functions to perform the conversion to date, we can use a single one, [`strptime`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.strptime.html) which takes the desired temporal data type as its first parameter.""")
133
  return
 
145
  return
146
 
147
 
148
+ @app.cell(hide_code=True)
149
  def _(mo):
150
  mo.md(r"""And to wrap up this section on parsing and conversion, let's consider a final scenario. What if we don't want to parse the entire raw JSON string, because we only need a subset of its attributes? Well, in this case we can leverage the [`json_path_match`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.json_path_match.html) expression to extract only the desired attributes using standard [JSONPath](https://goessner.net/articles/JsonPath/) syntax.""")
151
  return
 
163
  return
164
 
165
 
166
+ @app.cell(hide_code=True)
167
  def _(mo):
168
  mo.md(
169
  r"""
 
217
  return expressions_df, list_expr_meta, list_members
218
 
219
 
220
+ @app.cell(hide_code=True)
221
  def _(mo):
222
  mo.md(r"""As the following visualization shows, `str` is one of the richest Polars expression namespaces with multiple dozens of functions in it.""")
223
  return
 
232
  return
233
 
234
 
235
+ @app.cell(hide_code=True)
236
  def _(mo):
237
  mo.md(
238
  r"""
 
260
  return (docstring_length_df,)
261
 
262
 
263
+ @app.cell(hide_code=True)
264
  def _(mo):
265
  mo.md(r"""As the dataframe preview above and the scatterplot below show, the docstring length measured in bytes is almost always bigger than the length expressed in characters. This is due to the fact that the docstrings include characters which require more than a single byte to represent, such as "╞" for displaying dataframe header and body separators.""")
266
  return
 
276
  return
277
 
278
 
279
+ @app.cell(hide_code=True)
280
  def _(mo):
281
  mo.md(
282
  r"""
 
298
  return
299
 
300
 
301
+ @app.cell(hide_code=True)
302
  def _(mo):
303
  mo.md(
304
  r"""
 
338
  return
339
 
340
 
341
+ @app.cell(hide_code=True)
342
  def _(mo):
343
  mo.md(
344
  r"""
 
362
  return
363
 
364
 
365
+ @app.cell(hide_code=True)
366
  def _(mo):
367
  mo.md(
368
  r"""
 
388
  return
389
 
390
 
391
+ @app.cell(hide_code=True)
392
  def _(mo):
393
  mo.md(
394
  r"""
 
412
  return
413
 
414
 
415
+ @app.cell(hide_code=True)
416
  def _(mo):
417
  mo.md(
418
  r"""
 
434
  return
435
 
436
 
437
+ @app.cell(hide_code=True)
438
  def _(mo):
439
  mo.md(
440
  r"""
 
460
  return
461
 
462
 
463
+ @app.cell(hide_code=True)
464
  def _(mo):
465
  mo.md(r"""For scenarios where we want to combine multiple substrings to check for, we can use the [`contains`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.contains.html) expression to check for the presence of various patterns.""")
466
  return
 
476
  return
477
 
478
 
479
+ @app.cell(hide_code=True)
480
  def _(mo):
481
  mo.md(
482
  r"""
 
506
  return
507
 
508
 
509
+ @app.cell(hide_code=True)
510
  def _(mo):
511
  mo.md(r"""A related application example is to *find* the first index where a particular pattern is present, so that it can be used for downstream processing such as slicing. Below we use the [`find`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.find.html) expression to determine the index at which a code example starts in the docstring - identified by the Python shell substring `">>>"`.""")
512
  return
 
522
  return
523
 
524
 
525
+ @app.cell(hide_code=True)
526
  def _(mo):
527
  mo.md(
528
  r"""
 
562
  return
563
 
564
 
565
+ @app.cell(hide_code=True)
566
  def _(mo):
567
  mo.md(
568
  r"""
 
589
  return
590
 
591
 
592
+ @app.cell(hide_code=True)
593
  def _(mo):
594
  mo.md(r"""As a more practical example, we can use the `split` expression with some aggregation to count the number of times a particular word occurs in member names across all namespaces. This enables us to create a word cloud of the API members' constituents!""")
595
  return
 
643
  return wordcloud, wordcloud_df
644
 
645
 
646
+ @app.cell(hide_code=True)
647
  def _(mo):
648
  mo.md(
649
  r"""
 
677
  return (descriptions_df,)
678
 
679
 
680
+ @app.cell(hide_code=True)
681
  def _(mo):
682
  mo.md(
683
  r"""
 
706
  return
707
 
708
 
709
+ @app.cell(hide_code=True)
710
  def _(mo):
711
  mo.md(
712
  r"""
 
734
  return (url_pattern,)
735
 
736
 
737
+ @app.cell(hide_code=True)
738
  def _(mo):
739
  mo.md(
740
  r"""
 
758
  return
759
 
760
 
761
+ @app.cell(hide_code=True)
762
  def _(mo):
763
  mo.md(
764
  r"""
 
783
  return
784
 
785
 
786
+ @app.cell(hide_code=True)
787
  def _(mo):
788
  mo.md(
789
  r"""
 
807
  return
808
 
809
 
810
+ @app.cell(hide_code=True)
811
  def _(mo):
812
  mo.md(
813
  r"""
 
830
  return (encoded_df,)
831
 
832
 
833
+ @app.cell(hide_code=True)
834
  def _(mo):
835
  mo.md(r"""And of course, you can convert back into a human-readable representation using the [`decode`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.decode.html) expression.""")
836
  return
 
845
  return
846
 
847
 
848
+ @app.cell(hide_code=True)
849
  def _(mo):
850
  mo.md(
851
  r"""
polars/README.md CHANGED
@@ -23,3 +23,4 @@ You can also open notebooks in our online playground by appending marimo.app/ to
23
  Thanks to all our notebook authors!
24
 
25
  * [Koushik Khan](https://github.com/koushikkhan)
 
 
23
  Thanks to all our notebook authors!
24
 
25
  * [Koushik Khan](https://github.com/koushikkhan)
26
+ * [Péter Gyarmati](https://github.com/peter-gy)