Spaces:
Running
Running
Akshay Agrawal
committed on
Commit
·
5e743f8
1
Parent(s):
7b36ccd
author: attribute peter-gy
Browse files
And nit for hiding markdown code.
- polars/09_strings.py +35 -35
- polars/README.md +1 -0
polars/09_strings.py
CHANGED
@@ -14,7 +14,7 @@ __generated_with = "0.11.17"
|
|
14 |
app = marimo.App(width="medium")
|
15 |
|
16 |
|
17 |
-
@app.cell
|
18 |
def _(mo):
|
19 |
mo.md(
|
20 |
r"""
|
@@ -30,7 +30,7 @@ def _(mo):
|
|
30 |
return
|
31 |
|
32 |
|
33 |
-
@app.cell
|
34 |
def _(mo):
|
35 |
mo.md(
|
36 |
r"""
|
@@ -43,7 +43,7 @@ def _(mo):
|
|
43 |
return
|
44 |
|
45 |
|
46 |
-
@app.cell
|
47 |
def _(pl):
|
48 |
pip_metadata_raw_df = pl.DataFrame(
|
49 |
[
|
@@ -56,7 +56,7 @@ def _(pl):
|
|
56 |
return (pip_metadata_raw_df,)
|
57 |
|
58 |
|
59 |
-
@app.cell
|
60 |
def _(mo):
|
61 |
mo.md(r"""We can use the [`json_decode`](https://docs.pola.rs/api/python/stable/reference/series/api/polars.Series.str.json_decode.html) expression to parse the raw JSON strings into Polars-native structs and we can use the [unnest](https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.unnest.html) dataframe operation to have a dedicated column per parsed attribute.""")
|
62 |
return
|
@@ -69,13 +69,13 @@ def _(pip_metadata_raw_df, pl):
|
|
69 |
return (pip_metadata_df,)
|
70 |
|
71 |
|
72 |
-
@app.cell
|
73 |
def _(mo):
|
74 |
mo.md(r"""This is already a much friendlier representation of the data we started out with, but note that since the JSON entries had only string attributes, all values are strings, even the temporal `released_at` and numerical `size_mb` columns.""")
|
75 |
return
|
76 |
|
77 |
|
78 |
-
@app.cell
|
79 |
def _(mo):
|
80 |
mo.md(r"""As we know that the `size_mb` column should have a decimal representation, we go ahead and use [`to_decimal`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.to_decimal.html#polars.Expr.str.to_decimal) to perform the conversion.""")
|
81 |
return
|
@@ -91,7 +91,7 @@ def _(pip_metadata_df, pl):
|
|
91 |
return
|
92 |
|
93 |
|
94 |
-
@app.cell
|
95 |
def _(mo):
|
96 |
mo.md(
|
97 |
r"""
|
@@ -127,7 +127,7 @@ def _(pip_metadata_df, pl):
|
|
127 |
return
|
128 |
|
129 |
|
130 |
-
@app.cell
|
131 |
def _(mo):
|
132 |
mo.md(r"""Alternatively, instead of using three different functions to perform the conversion to date, we can use a single one, [`strptime`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.strptime.html) which takes the desired temporal data type as its first parameter.""")
|
133 |
return
|
@@ -145,7 +145,7 @@ def _(pip_metadata_df, pl):
|
|
145 |
return
|
146 |
|
147 |
|
148 |
-
@app.cell
|
149 |
def _(mo):
|
150 |
mo.md(r"""And to wrap up this section on parsing and conversion, let's consider a final scenario. What if we don't want to parse the entire raw JSON string, because we only need a subset of its attributes? Well, in this case we can leverage the [`json_path_match`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.json_path_match.html) expression to extract only the desired attributes using standard [JSONPath](https://goessner.net/articles/JsonPath/) syntax.""")
|
151 |
return
|
@@ -163,7 +163,7 @@ def _(pip_metadata_raw_df, pl):
|
|
163 |
return
|
164 |
|
165 |
|
166 |
-
@app.cell
|
167 |
def _(mo):
|
168 |
mo.md(
|
169 |
r"""
|
@@ -217,7 +217,7 @@ def _(pl):
|
|
217 |
return expressions_df, list_expr_meta, list_members
|
218 |
|
219 |
|
220 |
-
@app.cell
|
221 |
def _(mo):
|
222 |
mo.md(r"""As the following visualization shows, `str` is one of the richest Polars expression namespaces with multiple dozens of functions in it.""")
|
223 |
return
|
@@ -232,7 +232,7 @@ def _(alt, expressions_df):
|
|
232 |
return
|
233 |
|
234 |
|
235 |
-
@app.cell
|
236 |
def _(mo):
|
237 |
mo.md(
|
238 |
r"""
|
@@ -260,7 +260,7 @@ def _(expressions_df, pl):
|
|
260 |
return (docstring_length_df,)
|
261 |
|
262 |
|
263 |
-
@app.cell
|
264 |
def _(mo):
|
265 |
mo.md(r"""As the dataframe preview above and the scatterplot below show, the docstring length measured in bytes is almost always bigger than the length expressed in characters. This is due to the fact that the docstrings include characters which require more than a single byte to represent, such as "╞" for displaying dataframe header and body separators.""")
|
266 |
return
|
@@ -276,7 +276,7 @@ def _(alt, docstring_length_df):
|
|
276 |
return
|
277 |
|
278 |
|
279 |
-
@app.cell
|
280 |
def _(mo):
|
281 |
mo.md(
|
282 |
r"""
|
@@ -298,7 +298,7 @@ def _(expressions_df, pl):
|
|
298 |
return
|
299 |
|
300 |
|
301 |
-
@app.cell
|
302 |
def _(mo):
|
303 |
mo.md(
|
304 |
r"""
|
@@ -338,7 +338,7 @@ def _(mo, padded_df, padding):
|
|
338 |
return
|
339 |
|
340 |
|
341 |
-
@app.cell
|
342 |
def _(mo):
|
343 |
mo.md(
|
344 |
r"""
|
@@ -362,7 +362,7 @@ def _(expressions_df, pl):
|
|
362 |
return
|
363 |
|
364 |
|
365 |
-
@app.cell
|
366 |
def _(mo):
|
367 |
mo.md(
|
368 |
r"""
|
@@ -388,7 +388,7 @@ def _(expressions_df, pl):
|
|
388 |
return
|
389 |
|
390 |
|
391 |
-
@app.cell
|
392 |
def _(mo):
|
393 |
mo.md(
|
394 |
r"""
|
@@ -412,7 +412,7 @@ def _(expressions_df, pl):
|
|
412 |
return
|
413 |
|
414 |
|
415 |
-
@app.cell
|
416 |
def _(mo):
|
417 |
mo.md(
|
418 |
r"""
|
@@ -434,7 +434,7 @@ def _(expressions_df, pl):
|
|
434 |
return
|
435 |
|
436 |
|
437 |
-
@app.cell
|
438 |
def _(mo):
|
439 |
mo.md(
|
440 |
r"""
|
@@ -460,7 +460,7 @@ def _(expressions_df, pl):
|
|
460 |
return
|
461 |
|
462 |
|
463 |
-
@app.cell
|
464 |
def _(mo):
|
465 |
mo.md(r"""For scenarios where we want to combine multiple substrings to check for, we can use the [`contains`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.contains.html) expression to check for the presence of various patterns.""")
|
466 |
return
|
@@ -476,7 +476,7 @@ def _(expressions_df, pl):
|
|
476 |
return
|
477 |
|
478 |
|
479 |
-
@app.cell
|
480 |
def _(mo):
|
481 |
mo.md(
|
482 |
r"""
|
@@ -506,7 +506,7 @@ def _(expressions_df, pl):
|
|
506 |
return
|
507 |
|
508 |
|
509 |
-
@app.cell
|
510 |
def _(mo):
|
511 |
mo.md(r"""A related application example is to *find* the first index where a particular pattern is present, so that it can be used for downstream processing such as slicing. Below we use the [`find`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.find.html) expression to determine the index at which a code example starts in the docstring - identified by the Python shell substring `">>>"`.""")
|
512 |
return
|
@@ -522,7 +522,7 @@ def _(expressions_df, pl):
|
|
522 |
return
|
523 |
|
524 |
|
525 |
-
@app.cell
|
526 |
def _(mo):
|
527 |
mo.md(
|
528 |
r"""
|
@@ -562,7 +562,7 @@ def _(mo, slice, sliced_df):
|
|
562 |
return
|
563 |
|
564 |
|
565 |
-
@app.cell
|
566 |
def _(mo):
|
567 |
mo.md(
|
568 |
r"""
|
@@ -589,7 +589,7 @@ def _(expressions_df, pl):
|
|
589 |
return
|
590 |
|
591 |
|
592 |
-
@app.cell
|
593 |
def _(mo):
|
594 |
mo.md(r"""As a more practical example, we can use the `split` expression with some aggregation to count the number of times a particular word occurs in member names across all namespaces. This enables us to create a word cloud of the API members' constituents!""")
|
595 |
return
|
@@ -643,7 +643,7 @@ def _(alt, expressions_df, pl, random, wordcloud_height, wordcloud_width):
|
|
643 |
return wordcloud, wordcloud_df
|
644 |
|
645 |
|
646 |
-
@app.cell
|
647 |
def _(mo):
|
648 |
mo.md(
|
649 |
r"""
|
@@ -677,7 +677,7 @@ def _(expressions_df, pl):
|
|
677 |
return (descriptions_df,)
|
678 |
|
679 |
|
680 |
-
@app.cell
|
681 |
def _(mo):
|
682 |
mo.md(
|
683 |
r"""
|
@@ -706,7 +706,7 @@ def _(descriptions_df, mo, pl):
|
|
706 |
return
|
707 |
|
708 |
|
709 |
-
@app.cell
|
710 |
def _(mo):
|
711 |
mo.md(
|
712 |
r"""
|
@@ -734,7 +734,7 @@ def _(expressions_df, pl):
|
|
734 |
return (url_pattern,)
|
735 |
|
736 |
|
737 |
-
@app.cell
|
738 |
def _(mo):
|
739 |
mo.md(
|
740 |
r"""
|
@@ -758,7 +758,7 @@ def _(expressions_df, pl):
|
|
758 |
return
|
759 |
|
760 |
|
761 |
-
@app.cell
|
762 |
def _(mo):
|
763 |
mo.md(
|
764 |
r"""
|
@@ -783,7 +783,7 @@ def _(expressions_df, pl):
|
|
783 |
return
|
784 |
|
785 |
|
786 |
-
@app.cell
|
787 |
def _(mo):
|
788 |
mo.md(
|
789 |
r"""
|
@@ -807,7 +807,7 @@ def _(expressions_df, pl):
|
|
807 |
return
|
808 |
|
809 |
|
810 |
-
@app.cell
|
811 |
def _(mo):
|
812 |
mo.md(
|
813 |
r"""
|
@@ -830,7 +830,7 @@ def _(expressions_df, pl):
|
|
830 |
return (encoded_df,)
|
831 |
|
832 |
|
833 |
-
@app.cell
|
834 |
def _(mo):
|
835 |
mo.md(r"""And of course, you can convert back into a human-readable representation using the [`decode`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.decode.html) expression.""")
|
836 |
return
|
@@ -845,7 +845,7 @@ def _(encoded_df, pl):
|
|
845 |
return
|
846 |
|
847 |
|
848 |
-
@app.cell
|
849 |
def _(mo):
|
850 |
mo.md(
|
851 |
r"""
|
|
|
14 |
app = marimo.App(width="medium")
|
15 |
|
16 |
|
17 |
+
@app.cell(hide_code=True)
|
18 |
def _(mo):
|
19 |
mo.md(
|
20 |
r"""
|
|
|
30 |
return
|
31 |
|
32 |
|
33 |
+
@app.cell(hide_code=True)
|
34 |
def _(mo):
|
35 |
mo.md(
|
36 |
r"""
|
|
|
43 |
return
|
44 |
|
45 |
|
46 |
+
@app.cell
|
47 |
def _(pl):
|
48 |
pip_metadata_raw_df = pl.DataFrame(
|
49 |
[
|
|
|
56 |
return (pip_metadata_raw_df,)
|
57 |
|
58 |
|
59 |
+
@app.cell(hide_code=True)
|
60 |
def _(mo):
|
61 |
mo.md(r"""We can use the [`json_decode`](https://docs.pola.rs/api/python/stable/reference/series/api/polars.Series.str.json_decode.html) expression to parse the raw JSON strings into Polars-native structs and we can use the [unnest](https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.unnest.html) dataframe operation to have a dedicated column per parsed attribute.""")
|
62 |
return
|
|
|
69 |
return (pip_metadata_df,)
|
70 |
|
71 |
|
72 |
+
@app.cell(hide_code=True)
|
73 |
def _(mo):
|
74 |
mo.md(r"""This is already a much friendlier representation of the data we started out with, but note that since the JSON entries had only string attributes, all values are strings, even the temporal `released_at` and numerical `size_mb` columns.""")
|
75 |
return
|
76 |
|
77 |
|
78 |
+
@app.cell(hide_code=True)
|
79 |
def _(mo):
|
80 |
mo.md(r"""As we know that the `size_mb` column should have a decimal representation, we go ahead and use [`to_decimal`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.to_decimal.html#polars.Expr.str.to_decimal) to perform the conversion.""")
|
81 |
return
|
|
|
91 |
return
|
92 |
|
93 |
|
94 |
+
@app.cell(hide_code=True)
|
95 |
def _(mo):
|
96 |
mo.md(
|
97 |
r"""
|
|
|
127 |
return
|
128 |
|
129 |
|
130 |
+
@app.cell(hide_code=True)
|
131 |
def _(mo):
|
132 |
mo.md(r"""Alternatively, instead of using three different functions to perform the conversion to date, we can use a single one, [`strptime`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.strptime.html) which takes the desired temporal data type as its first parameter.""")
|
133 |
return
|
|
|
145 |
return
|
146 |
|
147 |
|
148 |
+
@app.cell(hide_code=True)
|
149 |
def _(mo):
|
150 |
mo.md(r"""And to wrap up this section on parsing and conversion, let's consider a final scenario. What if we don't want to parse the entire raw JSON string, because we only need a subset of its attributes? Well, in this case we can leverage the [`json_path_match`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.json_path_match.html) expression to extract only the desired attributes using standard [JSONPath](https://goessner.net/articles/JsonPath/) syntax.""")
|
151 |
return
|
|
|
163 |
return
|
164 |
|
165 |
|
166 |
+
@app.cell(hide_code=True)
|
167 |
def _(mo):
|
168 |
mo.md(
|
169 |
r"""
|
|
|
217 |
return expressions_df, list_expr_meta, list_members
|
218 |
|
219 |
|
220 |
+
@app.cell(hide_code=True)
|
221 |
def _(mo):
|
222 |
mo.md(r"""As the following visualization shows, `str` is one of the richest Polars expression namespaces with multiple dozens of functions in it.""")
|
223 |
return
|
|
|
232 |
return
|
233 |
|
234 |
|
235 |
+
@app.cell(hide_code=True)
|
236 |
def _(mo):
|
237 |
mo.md(
|
238 |
r"""
|
|
|
260 |
return (docstring_length_df,)
|
261 |
|
262 |
|
263 |
+
@app.cell(hide_code=True)
|
264 |
def _(mo):
|
265 |
mo.md(r"""As the dataframe preview above and the scatterplot below show, the docstring length measured in bytes is almost always bigger than the length expressed in characters. This is due to the fact that the docstrings include characters which require more than a single byte to represent, such as "╞" for displaying dataframe header and body separators.""")
|
266 |
return
|
|
|
276 |
return
|
277 |
|
278 |
|
279 |
+
@app.cell(hide_code=True)
|
280 |
def _(mo):
|
281 |
mo.md(
|
282 |
r"""
|
|
|
298 |
return
|
299 |
|
300 |
|
301 |
+
@app.cell(hide_code=True)
|
302 |
def _(mo):
|
303 |
mo.md(
|
304 |
r"""
|
|
|
338 |
return
|
339 |
|
340 |
|
341 |
+
@app.cell(hide_code=True)
|
342 |
def _(mo):
|
343 |
mo.md(
|
344 |
r"""
|
|
|
362 |
return
|
363 |
|
364 |
|
365 |
+
@app.cell(hide_code=True)
|
366 |
def _(mo):
|
367 |
mo.md(
|
368 |
r"""
|
|
|
388 |
return
|
389 |
|
390 |
|
391 |
+
@app.cell(hide_code=True)
|
392 |
def _(mo):
|
393 |
mo.md(
|
394 |
r"""
|
|
|
412 |
return
|
413 |
|
414 |
|
415 |
+
@app.cell(hide_code=True)
|
416 |
def _(mo):
|
417 |
mo.md(
|
418 |
r"""
|
|
|
434 |
return
|
435 |
|
436 |
|
437 |
+
@app.cell(hide_code=True)
|
438 |
def _(mo):
|
439 |
mo.md(
|
440 |
r"""
|
|
|
460 |
return
|
461 |
|
462 |
|
463 |
+
@app.cell(hide_code=True)
|
464 |
def _(mo):
|
465 |
mo.md(r"""For scenarios where we want to combine multiple substrings to check for, we can use the [`contains`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.contains.html) expression to check for the presence of various patterns.""")
|
466 |
return
|
|
|
476 |
return
|
477 |
|
478 |
|
479 |
+
@app.cell(hide_code=True)
|
480 |
def _(mo):
|
481 |
mo.md(
|
482 |
r"""
|
|
|
506 |
return
|
507 |
|
508 |
|
509 |
+
@app.cell(hide_code=True)
|
510 |
def _(mo):
|
511 |
mo.md(r"""A related application example is to *find* the first index where a particular pattern is present, so that it can be used for downstream processing such as slicing. Below we use the [`find`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.find.html) expression to determine the index at which a code example starts in the docstring - identified by the Python shell substring `">>>"`.""")
|
512 |
return
|
|
|
522 |
return
|
523 |
|
524 |
|
525 |
+
@app.cell(hide_code=True)
|
526 |
def _(mo):
|
527 |
mo.md(
|
528 |
r"""
|
|
|
562 |
return
|
563 |
|
564 |
|
565 |
+
@app.cell(hide_code=True)
|
566 |
def _(mo):
|
567 |
mo.md(
|
568 |
r"""
|
|
|
589 |
return
|
590 |
|
591 |
|
592 |
+
@app.cell(hide_code=True)
|
593 |
def _(mo):
|
594 |
mo.md(r"""As a more practical example, we can use the `split` expression with some aggregation to count the number of times a particular word occurs in member names across all namespaces. This enables us to create a word cloud of the API members' constituents!""")
|
595 |
return
|
|
|
643 |
return wordcloud, wordcloud_df
|
644 |
|
645 |
|
646 |
+
@app.cell(hide_code=True)
|
647 |
def _(mo):
|
648 |
mo.md(
|
649 |
r"""
|
|
|
677 |
return (descriptions_df,)
|
678 |
|
679 |
|
680 |
+
@app.cell(hide_code=True)
|
681 |
def _(mo):
|
682 |
mo.md(
|
683 |
r"""
|
|
|
706 |
return
|
707 |
|
708 |
|
709 |
+
@app.cell(hide_code=True)
|
710 |
def _(mo):
|
711 |
mo.md(
|
712 |
r"""
|
|
|
734 |
return (url_pattern,)
|
735 |
|
736 |
|
737 |
+
@app.cell(hide_code=True)
|
738 |
def _(mo):
|
739 |
mo.md(
|
740 |
r"""
|
|
|
758 |
return
|
759 |
|
760 |
|
761 |
+
@app.cell(hide_code=True)
|
762 |
def _(mo):
|
763 |
mo.md(
|
764 |
r"""
|
|
|
783 |
return
|
784 |
|
785 |
|
786 |
+
@app.cell(hide_code=True)
|
787 |
def _(mo):
|
788 |
mo.md(
|
789 |
r"""
|
|
|
807 |
return
|
808 |
|
809 |
|
810 |
+
@app.cell(hide_code=True)
|
811 |
def _(mo):
|
812 |
mo.md(
|
813 |
r"""
|
|
|
830 |
return (encoded_df,)
|
831 |
|
832 |
|
833 |
+
@app.cell(hide_code=True)
|
834 |
def _(mo):
|
835 |
mo.md(r"""And of course, you can convert back into a human-readable representation using the [`decode`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.decode.html) expression.""")
|
836 |
return
|
|
|
845 |
return
|
846 |
|
847 |
|
848 |
+
@app.cell(hide_code=True)
|
849 |
def _(mo):
|
850 |
mo.md(
|
851 |
r"""
|
polars/README.md
CHANGED
@@ -23,3 +23,4 @@ You can also open notebooks in our online playground by appending marimo.app/ to
|
|
23 |
Thanks to all our notebook authors!
|
24 |
|
25 |
* [Koushik Khan](https://github.com/koushikkhan)
|
|
|
|
23 |
Thanks to all our notebook authors!
|
24 |
|
25 |
* [Koushik Khan](https://github.com/koushikkhan)
|
26 |
+
* [Péter Gyarmati](https://github.com/peter-gy)
|