Spaces:
Sleeping
Sleeping
Merge remote-tracking branch 'upstream/main' into fp/functors
Browse files- .github/workflows/typos.yaml +0 -1
- optimization/05_portfolio_optimization.py +1 -1
- polars/10_strings.py +1004 -0
- polars/14_user_defined_functions.py +946 -0
- polars/README.md +1 -0
- probability/08_bayes_theorem.py +1 -1
.github/workflows/typos.yaml
CHANGED
@@ -13,4 +13,3 @@ jobs:
|
|
13 |
uses: styfle/[email protected]
|
14 |
- uses: actions/checkout@v4
|
15 |
- uses: crate-ci/[email protected]
|
16 |
-
name: Tests
|
|
|
13 |
uses: styfle/[email protected]
|
14 |
- uses: actions/checkout@v4
|
15 |
- uses: crate-ci/[email protected]
|
|
optimization/05_portfolio_optimization.py
CHANGED
@@ -47,7 +47,7 @@ def _(mo):
|
|
47 |
r"""
|
48 |
## Asset returns and risk
|
49 |
|
50 |
-
We will only model investments held for one period. The initial prices are $p_i > 0$. The end of period prices are $p_i^+ >0$. The asset (fractional) returns are $r_i = (p_i^+-p_i)/p_i$. The
|
51 |
|
52 |
A common model is that $r$ is a random variable with mean ${\bf E}r = \mu$ and covariance ${\bf E{(r-\mu)(r-\mu)^T}} = \Sigma$.
|
53 |
It follows that $R$ is a random variable with ${\bf E}R = \mu^T w$ and ${\bf var}(R) = w^T\Sigma w$. In real-world applications, $\mu$ and $\Sigma$ are estimated from data and models, and $w$ is chosen using a library like CVXPY.
|
|
|
47 |
r"""
|
48 |
## Asset returns and risk
|
49 |
|
50 |
+
We will only model investments held for one period. The initial prices are $p_i > 0$. The end of period prices are $p_i^+ >0$. The asset (fractional) returns are $r_i = (p_i^+-p_i)/p_i$. The portfolio (fractional) return is $R = r^Tw$.
|
51 |
|
52 |
A common model is that $r$ is a random variable with mean ${\bf E}r = \mu$ and covariance ${\bf E{(r-\mu)(r-\mu)^T}} = \Sigma$.
|
53 |
It follows that $R$ is a random variable with ${\bf E}R = \mu^T w$ and ${\bf var}(R) = w^T\Sigma w$. In real-world applications, $\mu$ and $\Sigma$ are estimated from data and models, and $w$ is chosen using a library like CVXPY.
|
polars/10_strings.py
ADDED
@@ -0,0 +1,1004 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# /// script
|
2 |
+
# requires-python = ">=3.12"
|
3 |
+
# dependencies = [
|
4 |
+
# "altair==5.5.0",
|
5 |
+
# "marimo",
|
6 |
+
# "numpy==2.2.3",
|
7 |
+
# "polars==1.24.0",
|
8 |
+
# ]
|
9 |
+
# ///
|
10 |
+
|
11 |
+
import marimo
|
12 |
+
|
13 |
+
__generated_with = "0.11.17"
|
14 |
+
app = marimo.App(width="medium")
|
15 |
+
|
16 |
+
|
17 |
+
@app.cell(hide_code=True)
|
18 |
+
def _(mo):
|
19 |
+
mo.md(
|
20 |
+
r"""
|
21 |
+
# Strings
|
22 |
+
|
23 |
+
_By [Péter Ferenc Gyarmati](http://github.com/peter-gy)_.
|
24 |
+
|
25 |
+
In this chapter we're going to dig into string manipulation. For a fun twist, we'll be mostly playing around with a dataset that every Polars user has bumped into without really thinking about it—the source code of the `polars` module itself. More precisely, we'll use a dataframe that pulls together all the Polars expressions and their docstrings, giving us a cool, hands-on way to explore the expression API in a truly data-driven manner.
|
26 |
+
|
27 |
+
We'll cover parsing, length calculation, case conversion, and much more, with practical examples and visualizations. Finally, we will combine various techniques you learned in prior chapters to build a fully interactive playground in which you can execute the official code examples of Polars expressions.
|
28 |
+
"""
|
29 |
+
)
|
30 |
+
return
|
31 |
+
|
32 |
+
|
33 |
+
@app.cell(hide_code=True)
|
34 |
+
def _(mo):
|
35 |
+
mo.md(
|
36 |
+
r"""
|
37 |
+
## 🛠️ Parsing & Conversion
|
38 |
+
|
39 |
+
Let's warm up with one of the most frequent use cases: parsing raw strings into various formats.
|
40 |
+
We'll take a tiny dataframe with metadata about Python packages represented as raw JSON strings and we'll use Polars string expressions to parse the attributes into their true data types.
|
41 |
+
"""
|
42 |
+
)
|
43 |
+
return
|
44 |
+
|
45 |
+
|
46 |
+
@app.cell
|
47 |
+
def _(pl):
|
48 |
+
pip_metadata_raw_df = pl.DataFrame(
|
49 |
+
[
|
50 |
+
'{"package": "polars", "version": "1.24.0", "released_at": "2025-03-02T20:31:12+0000", "size_mb": "30.9"}',
|
51 |
+
'{"package": "marimo", "version": "0.11.14", "released_at": "2025-03-04T00:28:57+0000", "size_mb": "10.7"}',
|
52 |
+
],
|
53 |
+
schema={"raw_json": pl.String},
|
54 |
+
)
|
55 |
+
pip_metadata_raw_df
|
56 |
+
return (pip_metadata_raw_df,)
|
57 |
+
|
58 |
+
|
59 |
+
@app.cell(hide_code=True)
|
60 |
+
def _(mo):
|
61 |
+
mo.md(r"""We can use the [`json_decode`](https://docs.pola.rs/api/python/stable/reference/series/api/polars.Series.str.json_decode.html) expression to parse the raw JSON strings into Polars-native structs and we can use the [unnest](https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.unnest.html) dataframe operation to have a dedicated column per parsed attribute.""")
|
62 |
+
return
|
63 |
+
|
64 |
+
|
65 |
+
@app.cell
|
66 |
+
def _(pip_metadata_raw_df, pl):
|
67 |
+
pip_metadata_df = pip_metadata_raw_df.select(json=pl.col('raw_json').str.json_decode()).unnest('json')
|
68 |
+
pip_metadata_df
|
69 |
+
return (pip_metadata_df,)
|
70 |
+
|
71 |
+
|
72 |
+
@app.cell(hide_code=True)
|
73 |
+
def _(mo):
|
74 |
+
mo.md(r"""This is already a much friendlier representation of the data we started out with, but note that since the JSON entries had only string attributes, all values are strings, even the temporal `released_at` and numerical `size_mb` columns.""")
|
75 |
+
return
|
76 |
+
|
77 |
+
|
78 |
+
@app.cell(hide_code=True)
|
79 |
+
def _(mo):
|
80 |
+
mo.md(r"""As we know that the `size_mb` column should have a decimal representation, we go ahead and use [`to_decimal`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.to_decimal.html#polars.Expr.str.to_decimal) to perform the conversion.""")
|
81 |
+
return
|
82 |
+
|
83 |
+
|
84 |
+
@app.cell
|
85 |
+
def _(pip_metadata_df, pl):
|
86 |
+
pip_metadata_df.select(
|
87 |
+
'package',
|
88 |
+
'version',
|
89 |
+
pl.col('size_mb').str.to_decimal(),
|
90 |
+
)
|
91 |
+
return
|
92 |
+
|
93 |
+
|
94 |
+
@app.cell(hide_code=True)
|
95 |
+
def _(mo):
|
96 |
+
mo.md(
|
97 |
+
r"""
|
98 |
+
Moving on to the `released_at` attribute which indicates the exact time when a given Python package got released, we have a bit more options to consider. We can convert to `Date`, `DateTime`, and `Time` types based on the desired temporal granularity. The [`to_date`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.to_date.html), [`to_datetime`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.to_datetime.html), and [`to_time`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.to_time.html) expressions are here to help us with the conversion, all we need is to provide the desired format string.
|
99 |
+
|
100 |
+
Since Polars uses Rust under the hood to implement all its expressions, we need to consult the [`chrono::format`](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) reference to come up with appropriate format strings.
|
101 |
+
|
102 |
+
Here's a quick reference:
|
103 |
+
|
104 |
+
| Specifier | Meaning |
|
105 |
+
|-----------|--------------------|
|
106 |
+
| `%Y` | Year (e.g., 2025) |
|
107 |
+
| `%m` | Month (01-12) |
|
108 |
+
| `%d` | Day (01-31) |
|
109 |
+
        | `%H`      | Hour (00-23)       |
        | `%M`      | Minute (00-59)     |
        | `%S`      | Second (00-60)     |
|
110 |
+
| `%z` | UTC offset |
|
111 |
+
|
112 |
+
The raw strings we are working with look like `"2025-03-02T20:31:12+0000"`. We can match this using the `"%Y-%m-%dT%H:%M:%S%z"` format string.
|
113 |
+
"""
|
114 |
+
)
|
115 |
+
return
|
116 |
+
|
117 |
+
|
118 |
+
@app.cell
|
119 |
+
def _(pip_metadata_df, pl):
|
120 |
+
pip_metadata_df.select(
|
121 |
+
'package',
|
122 |
+
'version',
|
123 |
+
release_date=pl.col('released_at').str.to_date('%Y-%m-%dT%H:%M:%S%z'),
|
124 |
+
release_datetime=pl.col('released_at').str.to_datetime('%Y-%m-%dT%H:%M:%S%z'),
|
125 |
+
release_time=pl.col('released_at').str.to_time('%Y-%m-%dT%H:%M:%S%z'),
|
126 |
+
)
|
127 |
+
return
|
128 |
+
|
129 |
+
|
130 |
+
@app.cell(hide_code=True)
|
131 |
+
def _(mo):
|
132 |
+
mo.md(r"""Alternatively, instead of using three different functions to perform the conversion to date, we can use a single one, [`strptime`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.strptime.html) which takes the desired temporal data type as its first parameter.""")
|
133 |
+
return
|
134 |
+
|
135 |
+
|
136 |
+
@app.cell
|
137 |
+
def _(pip_metadata_df, pl):
|
138 |
+
pip_metadata_df.select(
|
139 |
+
'package',
|
140 |
+
'version',
|
141 |
+
release_date=pl.col('released_at').str.strptime(pl.Date, '%Y-%m-%dT%H:%M:%S%z'),
|
142 |
+
release_datetime=pl.col('released_at').str.strptime(pl.Datetime, '%Y-%m-%dT%H:%M:%S%z'),
|
143 |
+
release_time=pl.col('released_at').str.strptime(pl.Time, '%Y-%m-%dT%H:%M:%S%z'),
|
144 |
+
)
|
145 |
+
return
|
146 |
+
|
147 |
+
|
148 |
+
@app.cell(hide_code=True)
|
149 |
+
def _(mo):
|
150 |
+
mo.md(r"""And to wrap up this section on parsing and conversion, let's consider a final scenario. What if we don't want to parse the entire raw JSON string, because we only need a subset of its attributes? Well, in this case we can leverage the [`json_path_match`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.json_path_match.html) expression to extract only the desired attributes using standard [JSONPath](https://goessner.net/articles/JsonPath/) syntax.""")
|
151 |
+
return
|
152 |
+
|
153 |
+
|
154 |
+
@app.cell
|
155 |
+
def _(pip_metadata_raw_df, pl):
|
156 |
+
pip_metadata_raw_df.select(
|
157 |
+
package=pl.col("raw_json").str.json_path_match("$.package"),
|
158 |
+
version=pl.col("raw_json").str.json_path_match("$.version"),
|
159 |
+
        size_mb=pl.col("raw_json")
|
160 |
+
.str.json_path_match("$.size_mb")
|
161 |
+
.str.to_decimal(),
|
162 |
+
)
|
163 |
+
return
|
164 |
+
|
165 |
+
|
166 |
+
@app.cell(hide_code=True)
|
167 |
+
def _(mo):
|
168 |
+
mo.md(
|
169 |
+
r"""
|
170 |
+
## 📊 Dataset Overview
|
171 |
+
|
172 |
+
Now that we got our hands dirty, let's consider a somewhat wilder dataset for the subsequent sections: a dataframe of metadata about every single expression in your current Polars module.
|
173 |
+
|
174 |
+
At the risk of stating the obvious, in the previous section, when we typed `pl.col('raw_json').str.json_decode()`, we accessed the `json_decode` member of the `str` expression namespace through the `pl.col('raw_json')` expression *instance*. Under the hood, deep inside the Polars source code, there is a corresponding `def json_decode(...)` method with a carefully authored docstring explaining the purpose and signature of the member.
|
175 |
+
|
176 |
+
Since Python makes module introspection simple, we can easily enumerate all Polars expressions and organize their metadata in `expressions_df`, to be used for all the upcoming string manipulation examples.
|
177 |
+
"""
|
178 |
+
)
|
179 |
+
return
|
180 |
+
|
181 |
+
|
182 |
+
@app.cell(hide_code=True)
|
183 |
+
def _(pl):
|
184 |
+
def list_members(expr, namespace) -> list[dict]:
|
185 |
+
"""Iterates through the attributes of `expr` and returns their metadata"""
|
186 |
+
members = []
|
187 |
+
for attrname in expr.__dir__():
|
188 |
+
is_namespace = attrname in pl.Expr._accessors
|
189 |
+
is_private = attrname.startswith("_")
|
190 |
+
if is_namespace or is_private:
|
191 |
+
continue
|
192 |
+
|
193 |
+
attr = getattr(expr, attrname)
|
194 |
+
members.append(
|
195 |
+
{
|
196 |
+
"namespace": namespace,
|
197 |
+
"member": attrname,
|
198 |
+
"docstring": attr.__doc__,
|
199 |
+
}
|
200 |
+
)
|
201 |
+
return members
|
202 |
+
|
203 |
+
|
204 |
+
def list_expr_meta() -> list[dict]:
|
205 |
+
# Dummy expression instance to 'crawl'
|
206 |
+
expr = pl.lit("")
|
207 |
+
root_members = list_members(expr, "root")
|
208 |
+
namespaced_members: list[list[dict]] = [
|
209 |
+
list_members(getattr(expr, namespace), namespace)
|
210 |
+
for namespace in pl.Expr._accessors
|
211 |
+
]
|
212 |
+
return sum(namespaced_members, root_members)
|
213 |
+
|
214 |
+
|
215 |
+
expressions_df = pl.from_dicts(list_expr_meta(), infer_schema_length=None).sort('namespace', 'member')
|
216 |
+
expressions_df
|
217 |
+
return expressions_df, list_expr_meta, list_members
|
218 |
+
|
219 |
+
|
220 |
+
@app.cell(hide_code=True)
|
221 |
+
def _(mo):
|
222 |
+
mo.md(r"""As the following visualization shows, `str` is one of the richest Polars expression namespaces with multiple dozens of functions in it.""")
|
223 |
+
return
|
224 |
+
|
225 |
+
|
226 |
+
@app.cell(hide_code=True)
|
227 |
+
def _(alt, expressions_df):
|
228 |
+
expressions_df.plot.bar(
|
229 |
+
x=alt.X("count(member):Q", title='Count of Expressions'),
|
230 |
+
y=alt.Y("namespace:N", title='Namespace').sort("-x"),
|
231 |
+
)
|
232 |
+
return
|
233 |
+
|
234 |
+
|
235 |
+
@app.cell(hide_code=True)
|
236 |
+
def _(mo):
|
237 |
+
mo.md(
|
238 |
+
r"""
|
239 |
+
## 📏 Length Calculation
|
240 |
+
|
241 |
+
A common use case is to compute the length of a string. Most people associate string length exclusively with the number of characters the said string consists of; however, in certain scenarios it is useful to also know how much memory is required for storing, so how many bytes are required to represent the textual data.
|
242 |
+
|
243 |
+
The expressions [`len_chars`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.len_chars.html) and [`len_bytes`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.len_bytes.html) are here to help us with these calculations.
|
244 |
+
|
245 |
+
Below, we compute `docstring_len_chars` and `docstring_len_bytes` columns to see how many characters and bytes the documentation of each expression is made up of.
|
246 |
+
"""
|
247 |
+
)
|
248 |
+
return
|
249 |
+
|
250 |
+
|
251 |
+
@app.cell
|
252 |
+
def _(expressions_df, pl):
|
253 |
+
docstring_length_df = expressions_df.select(
|
254 |
+
'namespace',
|
255 |
+
'member',
|
256 |
+
docstring_len_chars=pl.col("docstring").str.len_chars(),
|
257 |
+
docstring_len_bytes=pl.col("docstring").str.len_bytes(),
|
258 |
+
)
|
259 |
+
docstring_length_df
|
260 |
+
return (docstring_length_df,)
|
261 |
+
|
262 |
+
|
263 |
+
@app.cell(hide_code=True)
|
264 |
+
def _(mo):
|
265 |
+
mo.md(r"""As the dataframe preview above and the scatterplot below show, the docstring length measured in bytes is almost always bigger than the length expressed in characters. This is due to the fact that the docstrings include characters which require more than a single byte to represent, such as "╞" for displaying dataframe header and body separators.""")
|
266 |
+
return
|
267 |
+
|
268 |
+
|
269 |
+
@app.cell
|
270 |
+
def _(alt, docstring_length_df):
|
271 |
+
docstring_length_df.plot.point(
|
272 |
+
x=alt.X('docstring_len_chars', title='Docstring Length (Chars)'),
|
273 |
+
y=alt.Y('docstring_len_bytes', title='Docstring Length (Bytes)'),
|
274 |
+
tooltip=['namespace', 'member', 'docstring_len_chars', 'docstring_len_bytes'],
|
275 |
+
)
|
276 |
+
return
|
277 |
+
|
278 |
+
|
279 |
+
@app.cell(hide_code=True)
|
280 |
+
def _(mo):
|
281 |
+
mo.md(
|
282 |
+
r"""
|
283 |
+
## 🔠 Case Conversion
|
284 |
+
|
285 |
+
Another frequent string transformation is lowercasing, uppercasing, and titlecasing. We can use [`to_lowercase`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.to_lowercase.html), [`to_uppercase`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.to_uppercase.html) and [`to_titlecase`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.to_titlecase.html) for doing so.
|
286 |
+
"""
|
287 |
+
)
|
288 |
+
return
|
289 |
+
|
290 |
+
|
291 |
+
@app.cell
|
292 |
+
def _(expressions_df, pl):
|
293 |
+
expressions_df.select(
|
294 |
+
member_lower=pl.col('member').str.to_lowercase(),
|
295 |
+
member_upper=pl.col('member').str.to_uppercase(),
|
296 |
+
member_title=pl.col('member').str.to_titlecase(),
|
297 |
+
)
|
298 |
+
return
|
299 |
+
|
300 |
+
|
301 |
+
@app.cell(hide_code=True)
|
302 |
+
def _(mo):
|
303 |
+
mo.md(
|
304 |
+
r"""
|
305 |
+
## ➕ Padding
|
306 |
+
|
307 |
+
Sometimes we need to ensure that strings have a fixed-size character length. [`pad_start`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.pad_start.html) and [`pad_end`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.pad_end.html) can be used to fill the "front" or "back" of a string with a supplied character, while [`zfill`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.zfill.html) is a utility for padding the start of a string with `"0"` until it reaches a particular length. In other words, `zfill` is a more specific version of `pad_start`, where the `fill_char` parameter is explicitly set to `"0"`.
|
308 |
+
|
309 |
+
In the example below we take the unique Polars expression namespaces and pad them so that they have a uniform length which you can control via a slider.
|
310 |
+
"""
|
311 |
+
)
|
312 |
+
return
|
313 |
+
|
314 |
+
|
315 |
+
@app.cell(hide_code=True)
|
316 |
+
def _(mo):
|
317 |
+
padding = mo.ui.slider(0, 16, step=1, value=8, label="Padding Size")
|
318 |
+
return (padding,)
|
319 |
+
|
320 |
+
|
321 |
+
@app.cell
|
322 |
+
def _(expressions_df, padding, pl):
|
323 |
+
padded_df = expressions_df.select("namespace").unique().select(
|
324 |
+
"namespace",
|
325 |
+
namespace_front_padded=pl.col("namespace").str.pad_start(padding.value, "_"),
|
326 |
+
namespace_back_padded=pl.col("namespace").str.pad_end(padding.value, "_"),
|
327 |
+
namespace_zfilled=pl.col("namespace").str.zfill(padding.value),
|
328 |
+
)
|
329 |
+
return (padded_df,)
|
330 |
+
|
331 |
+
|
332 |
+
@app.cell(hide_code=True)
|
333 |
+
def _(mo, padded_df, padding):
|
334 |
+
mo.vstack([
|
335 |
+
padding,
|
336 |
+
padded_df,
|
337 |
+
])
|
338 |
+
return
|
339 |
+
|
340 |
+
|
341 |
+
@app.cell(hide_code=True)
|
342 |
+
def _(mo):
|
343 |
+
mo.md(
|
344 |
+
r"""
|
345 |
+
## 🔄 Replacing
|
346 |
+
|
347 |
+
Let's say we want to convert from `snake_case` API member names to `kebab-case`, that is, we need to replace the underscore character with a hyphen. For operations like that, we can use [`replace`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.replace.html) and [`replace_all`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.replace_all.html).
|
348 |
+
|
349 |
+
As the example below demonstrates, `replace` stops after the first occurrence of the to-be-replaced pattern, while `replace_all` goes all the way through and changes all underscores to hyphens resulting in the `kebab-case` representation we were looking for.
|
350 |
+
"""
|
351 |
+
)
|
352 |
+
return
|
353 |
+
|
354 |
+
|
355 |
+
@app.cell
|
356 |
+
def _(expressions_df, pl):
|
357 |
+
expressions_df.select(
|
358 |
+
"member",
|
359 |
+
member_kebab_case_partial=pl.col("member").str.replace("_", "-"),
|
360 |
+
member_kebab_case=pl.col("member").str.replace_all("_", "-"),
|
361 |
+
).sort(pl.col("member").str.len_chars(), descending=True)
|
362 |
+
return
|
363 |
+
|
364 |
+
|
365 |
+
@app.cell(hide_code=True)
|
366 |
+
def _(mo):
|
367 |
+
mo.md(
|
368 |
+
r"""
|
369 |
+
A related expression is [`replace_many`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.replace_many.html), which accepts *many* pairs of to-be-matched patterns and corresponding replacements and uses the [Aho–Corasick algorithm](https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm) to carry out the operation with great performance.
|
370 |
+
|
371 |
+
In the example below we replace all instances of `"min"` with `"minimum"` and `"max"` with `"maximum"` using a single expression.
|
372 |
+
"""
|
373 |
+
)
|
374 |
+
return
|
375 |
+
|
376 |
+
|
377 |
+
@app.cell
|
378 |
+
def _(expressions_df, pl):
|
379 |
+
expressions_df.select(
|
380 |
+
"member",
|
381 |
+
member_modified=pl.col("member").str.replace_many(
|
382 |
+
{
|
383 |
+
"min": "minimum",
|
384 |
+
"max": "maximum",
|
385 |
+
}
|
386 |
+
),
|
387 |
+
)
|
388 |
+
return
|
389 |
+
|
390 |
+
|
391 |
+
@app.cell(hide_code=True)
|
392 |
+
def _(mo):
|
393 |
+
mo.md(
|
394 |
+
r"""
|
395 |
+
## 🔍 Searching & Matching
|
396 |
+
|
397 |
+
A common need when working with strings is to determine whether their content satisfies some condition: whether it starts or ends with a particular substring or contains a certain pattern.
|
398 |
+
|
399 |
+
Let's suppose we want to determine whether a member of the Polars expression API is a "converter", such as `to_decimal`, identified by its `"to_"` prefix. We can use [`starts_with`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.starts_with.html) to perform this check.
|
400 |
+
"""
|
401 |
+
)
|
402 |
+
return
|
403 |
+
|
404 |
+
|
405 |
+
@app.cell
|
406 |
+
def _(expressions_df, pl):
|
407 |
+
expressions_df.select(
|
408 |
+
"namespace",
|
409 |
+
"member",
|
410 |
+
is_converter=pl.col("member").str.starts_with("to_"),
|
411 |
+
).sort(-pl.col("is_converter").cast(pl.Int8))
|
412 |
+
return
|
413 |
+
|
414 |
+
|
415 |
+
@app.cell(hide_code=True)
|
416 |
+
def _(mo):
|
417 |
+
mo.md(
|
418 |
+
r"""
|
419 |
+
Throughout this course as you have gained familiarity with the expression API you might have noticed that some members end with an underscore such as `or_`, since their "body" is a reserved Python keyword.
|
420 |
+
|
421 |
+
Let's use [`ends_with`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.ends_with.html) to find all the members which are named after such keywords.
|
422 |
+
"""
|
423 |
+
)
|
424 |
+
return
|
425 |
+
|
426 |
+
|
427 |
+
@app.cell
|
428 |
+
def _(expressions_df, pl):
|
429 |
+
expressions_df.select(
|
430 |
+
"namespace",
|
431 |
+
"member",
|
432 |
+
is_escaped_keyword=pl.col("member").str.ends_with("_"),
|
433 |
+
).sort(-pl.col("is_escaped_keyword").cast(pl.Int8))
|
434 |
+
return
|
435 |
+
|
436 |
+
|
437 |
+
@app.cell(hide_code=True)
|
438 |
+
def _(mo):
|
439 |
+
mo.md(
|
440 |
+
r"""
|
441 |
+
Now let's move on to analyzing the docstrings in a bit more detail. Based on their content we can determine whether a member is deprecated, accepts parameters, comes with examples, or references external URL(s) & related members.
|
442 |
+
|
443 |
+
As demonstrated below, we can compute all these boolean attributes using [`contains`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.contains.html) to check whether the docstring includes a particular substring.
|
444 |
+
"""
|
445 |
+
)
|
446 |
+
return
|
447 |
+
|
448 |
+
|
449 |
+
@app.cell
|
450 |
+
def _(expressions_df, pl):
|
451 |
+
expressions_df.select(
|
452 |
+
'namespace',
|
453 |
+
'member',
|
454 |
+
is_deprecated=pl.col('docstring').str.contains('.. deprecated', literal=True),
|
455 |
+
has_parameters=pl.col('docstring').str.contains('Parameters'),
|
456 |
+
has_examples=pl.col('docstring').str.contains('Examples'),
|
457 |
+
has_related_members=pl.col('docstring').str.contains('See Also'),
|
458 |
+
has_url=pl.col('docstring').str.contains('https?://'),
|
459 |
+
)
|
460 |
+
return
|
461 |
+
|
462 |
+
|
463 |
+
@app.cell(hide_code=True)
|
464 |
+
def _(mo):
|
465 |
+
    mo.md(r"""For scenarios where we want to combine multiple substrings to check for, we can use the [`contains_any`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.contains_any.html) expression to check for the presence of various patterns.""")
|
466 |
+
return
|
467 |
+
|
468 |
+
|
469 |
+
@app.cell
|
470 |
+
def _(expressions_df, pl):
|
471 |
+
expressions_df.select(
|
472 |
+
'namespace',
|
473 |
+
'member',
|
474 |
+
has_reference=pl.col('docstring').str.contains_any(['See Also', 'https://'])
|
475 |
+
)
|
476 |
+
return
|
477 |
+
|
478 |
+
|
479 |
+
@app.cell(hide_code=True)
|
480 |
+
def _(mo):
|
481 |
+
mo.md(
|
482 |
+
r"""
|
483 |
+
From the above analysis we could see that almost all the members come with code examples. It would be interesting to know how many variable assignments are going on within each of these examples, right? That's not as simple as checking for a pre-defined literal string containment though, because variables can have arbitrary names - any valid Python identifier is allowed. While the `contains` function supports checking for regular expressions instead of literal strings too, it would not suffice for this exercise because it only tells us whether there is at least a single occurrence of the sought pattern rather than telling us the exact number of matches.
|
484 |
+
|
485 |
+
Fortunately, we can take advantage of [`count_matches`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.count_matches.html) to achieve exactly what we want. We specify the regular expression `r'[a-zA-Z_][a-zA-Z0-9_]* = '` according to the [`regex` Rust crate](https://docs.rs/regex/latest/regex/) to match Python identifiers and we leave the rest to Polars.
|
486 |
+
|
487 |
+
In `count_matches(r'[a-zA-Z_][a-zA-Z0-9_]* = ')`:
|
488 |
+
|
489 |
+
- `[a-zA-Z_]` matches a letter or underscore (start of a Python identifier).
|
490 |
+
- `[a-zA-Z0-9_]*` matches zero or more letters, digits, or underscores.
|
491 |
+
- ` = ` matches a space, equals sign, and space (indicating assignment).
|
492 |
+
|
493 |
+
This finds variable assignments like `x = ` or `df_result = ` in docstrings.
|
494 |
+
"""
|
495 |
+
)
|
496 |
+
return
|
497 |
+
|
498 |
+
|
499 |
+
@app.cell
|
500 |
+
def _(expressions_df, pl):
|
501 |
+
expressions_df.select(
|
502 |
+
'namespace',
|
503 |
+
'member',
|
504 |
+
variable_assignment_count=pl.col('docstring').str.count_matches(r'[a-zA-Z_][a-zA-Z0-9_]* = '),
|
505 |
+
)
|
506 |
+
return
|
507 |
+
|
508 |
+
|
509 |
+
@app.cell(hide_code=True)
|
510 |
+
def _(mo):
|
511 |
+
mo.md(r"""A related application example is to *find* the first index where a particular pattern is present, so that it can be used for downstream processing such as slicing. Below we use the [`find`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.find.html) expression to determine the index at which a code example starts in the docstring - identified by the Python shell substring `">>>"`.""")
|
512 |
+
return
|
513 |
+
|
514 |
+
|
515 |
+
@app.cell
|
516 |
+
def _(expressions_df, pl):
|
517 |
+
expressions_df.select(
|
518 |
+
'namespace',
|
519 |
+
'member',
|
520 |
+
code_example_start=pl.col('docstring').str.find('>>>'),
|
521 |
+
)
|
522 |
+
return
|
523 |
+
|
524 |
+
|
525 |
+
@app.cell(hide_code=True)
|
526 |
+
def _(mo):
|
527 |
+
mo.md(
|
528 |
+
r"""
|
529 |
+
## ✂️ Slicing and Substrings
|
530 |
+
|
531 |
+
Sometimes we are only interested in a particular substring. We can use [`head`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.head.html), [`tail`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.tail.html) and [`slice`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.slice.html) to extract a substring from the start, end, or between arbitrary indices.
|
532 |
+
"""
|
533 |
+
)
|
534 |
+
return
|
535 |
+
|
536 |
+
|
537 |
+
@app.cell
|
538 |
+
def _(mo):
|
539 |
+
slice = mo.ui.slider(1, 50, step=1, value=25, label="Slice Size")
|
540 |
+
return (slice,)
|
541 |
+
|
542 |
+
|
543 |
+
@app.cell
def _(expressions_df, pl, slice):
    # Fix: the previous comments hard-coded "25"/"50 chars", but the counts
    # follow the slider value, not fixed constants.
    sliced_df = expressions_df.select(
        # First `slice.value` characters.
        docstring_head=pl.col("docstring").str.head(slice.value),
        # `2 * slice.value` characters starting at offset `slice.value`.
        docstring_slice=pl.col("docstring").str.slice(slice.value, 2 * slice.value),
        # Last `slice.value` characters.
        docstring_tail=pl.col("docstring").str.tail(slice.value),
    )
    return (sliced_df,)
|
554 |
+
|
555 |
+
|
556 |
+
@app.cell
|
557 |
+
def _(mo, slice, sliced_df):
|
558 |
+
mo.vstack([
|
559 |
+
slice,
|
560 |
+
sliced_df,
|
561 |
+
])
|
562 |
+
return
|
563 |
+
|
564 |
+
|
565 |
+
@app.cell(hide_code=True)
|
566 |
+
def _(mo):
|
567 |
+
mo.md(
|
568 |
+
r"""
|
569 |
+
## ➗ Splitting
|
570 |
+
|
571 |
+
Certain strings follow a well-defined structure and we might be only interested in some parts of them. For example, when dealing with `snake_cased_expression` member names we might be curious to get only the first, second, or $n^{\text{th}}$ word before an underscore. We would need to *split* the string at a particular pattern for downstream processing.
|
572 |
+
|
573 |
+
The [`split`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.split.html), [`split_exact`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.split_exact.html) and [`splitn`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.splitn.html) expressions enable us to achieve this.
|
574 |
+
|
575 |
+
The primary difference between these string splitting utilities is that `split` produces a list of variadic length based on the number of resulting segments, `splitn` returns a struct with at least `0` and at most `n` fields while `split_exact` returns a struct of exactly `n` fields.
|
576 |
+
"""
|
577 |
+
)
|
578 |
+
return
|
579 |
+
|
580 |
+
|
581 |
+
@app.cell
def _(expressions_df, pl):
    # Compare the three splitting flavours on snake_cased member names:
    #   split       -> variable-length list of segments
    #   splitn      -> struct with at most n fields
    #   split_exact -> struct with exactly n fields
    _member = pl.col("member")
    expressions_df.select(
        "member",
        member_name_parts=_member.str.split("_"),
        member_name_parts_n=_member.str.splitn("_", n=2),
        member_name_parts_exact=_member.str.split_exact("_", n=2),
    )
    return
|
590 |
+
|
591 |
+
|
592 |
+
@app.cell(hide_code=True)
|
593 |
+
def _(mo):
|
594 |
+
mo.md(r"""As a more practical example, we can use the `split` expression with some aggregation to count the number of times a particular word occurs in member names across all namespaces. This enables us to create a word cloud of the API members' constituents!""")
|
595 |
+
return
|
596 |
+
|
597 |
+
|
598 |
+
@app.cell(hide_code=True)
|
599 |
+
def _(mo, wordcloud, wordcloud_height, wordcloud_width):
|
600 |
+
mo.vstack([
|
601 |
+
wordcloud_width,
|
602 |
+
wordcloud_height,
|
603 |
+
wordcloud,
|
604 |
+
])
|
605 |
+
return
|
606 |
+
|
607 |
+
|
608 |
+
@app.cell(hide_code=True)
|
609 |
+
def _(mo):
|
610 |
+
wordcloud_width = mo.ui.slider(0, 64, step=1, value=32, label="Word Cloud Width")
|
611 |
+
wordcloud_height = mo.ui.slider(0, 32, step=1, value=16, label="Word Cloud Height")
|
612 |
+
return wordcloud_height, wordcloud_width
|
613 |
+
|
614 |
+
|
615 |
+
@app.cell(hide_code=True)
def _(alt, expressions_df, pl, random, wordcloud_height, wordcloud_width):
    # Word frequencies across all member names, split on underscores.
    wordcloud_df = (
        expressions_df.select(pl.col("member").str.split("_"))
        .explode("member")
        .group_by("member")
        .agg(pl.len())
        # Random x/y coordinates scatter the words across the 2D canvas
        # (bounds come from the UI sliders, so values fit in UInt8).
        .with_columns(
            x=pl.col("member").map_elements(
                lambda _: random.randint(0, wordcloud_width.value),
                return_dtype=pl.UInt8,
            ),
            y=pl.col("member").map_elements(
                lambda _: random.randint(0, wordcloud_height.value),
                return_dtype=pl.UInt8,
            ),
        )
    )

    # Each word is drawn as text at its random position; colour and size
    # both encode the word's frequency.
    _base = alt.Chart(wordcloud_df).mark_text(baseline="middle")
    wordcloud = _base.encode(
        x=alt.X("x:O", axis=None),
        y=alt.Y("y:O", axis=None),
        text="member:N",
        color=alt.Color("len:Q", scale=alt.Scale(scheme="bluepurple")),
        size=alt.Size("len:Q", legend=None),
        tooltip=["member", "len"],
    ).configure_view(strokeWidth=0)
    return wordcloud, wordcloud_df
|
644 |
+
|
645 |
+
|
646 |
+
@app.cell(hide_code=True)
|
647 |
+
def _(mo):
|
648 |
+
mo.md(
|
649 |
+
r"""
|
650 |
+
## 🔗 Concatenation & Joining
|
651 |
+
|
652 |
+
Often we would like to create longer strings from strings we already have. We might want to create a formatted, sentence-like string or join multiple existing strings in our dataframe into a single one.
|
653 |
+
|
654 |
+
The top-level [`concat_str`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.concat_str.html) expression enables us to combine strings *horizontally* in a dataframe. As the example below shows, we can take the `member` and `namespace` column of each row and construct a `description` column in which each row will correspond to the value ``f"- Expression `{member}` belongs to namespace `{namespace}`"``.
|
655 |
+
"""
|
656 |
+
)
|
657 |
+
return
|
658 |
+
|
659 |
+
|
660 |
+
@app.cell
def _(expressions_df, pl):
    # Horizontal concatenation: build one markdown bullet per row, shaped
    # like "- Expression `{member}` belongs to namespace `{namespace}`".
    _backtick = pl.lit("`")
    descriptions_df = expressions_df.sample(5).select(
        description=pl.concat_str(
            [
                pl.lit("- Expression "),
                _backtick,
                "member",
                _backtick,
                pl.lit(" belongs to namespace "),
                _backtick,
                "namespace",
                _backtick,
            ],
        )
    )
    descriptions_df
    return (descriptions_df,)
|
678 |
+
|
679 |
+
|
680 |
+
@app.cell(hide_code=True)
def _(mo):
    # Fix: the docs link previously pointed at the top-level `polars.join`
    # page; the expression demonstrated below is `Expr.str.join`.
    mo.md(
        r"""
        Now that we have constructed these bullet points through *horizontal* concatenation of strings, we can perform a *vertical* one so that we end up with a single string in which we have a bullet point on each line.

        We will use the [`join`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.join.html) expression to do so.
        """
    )
    return
|
690 |
+
|
691 |
+
|
692 |
+
@app.cell
def _(descriptions_df, pl):
    # Vertical concatenation: fold every row into one newline-separated string.
    descriptions_df.select(pl.col("description").str.join("\n"))
    return
|
696 |
+
|
697 |
+
|
698 |
+
@app.cell(hide_code=True)
|
699 |
+
def _(descriptions_df, mo, pl):
|
700 |
+
mo.md(f"""In fact, since the string we constructed dynamically is valid markdown, we can display it dynamically using Marimo's `mo.md` utility!
|
701 |
+
|
702 |
+
---
|
703 |
+
|
704 |
+
{descriptions_df.select(pl.col('description').str.join('\n')).to_numpy().squeeze().tolist()}
|
705 |
+
""")
|
706 |
+
return
|
707 |
+
|
708 |
+
|
709 |
+
@app.cell(hide_code=True)
|
710 |
+
def _(mo):
|
711 |
+
mo.md(
|
712 |
+
r"""
|
713 |
+
## 🔍 Pattern-based Extraction
|
714 |
+
|
715 |
+
In the vast majority of the cases, when dealing with unstructured text data, all we really want is to extract something structured from it. A common use case is to extract URLs from text to get a better understanding of related content.
|
716 |
+
|
717 |
+
In the example below that's exactly what we do. We scan the `docstring` of each API member and extract URLs from them using [`extract`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.extract.html) and [`extract_all`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.extract_all.html) using a simple regular expression to match http and https URLs.
|
718 |
+
|
719 |
+
Note that `extract` stops after a first match and returns a scalar result (or `null` if there was no match) while `extract_all` returns a - potentially empty - list of matches.
|
720 |
+
"""
|
721 |
+
)
|
722 |
+
return
|
723 |
+
|
724 |
+
|
725 |
+
@app.cell
def _(expressions_df, pl):
    # http(s) scheme followed by non-whitespace; ">" is excluded so URLs
    # embedded in markup don't swallow the closing bracket.
    url_pattern = r'(https?://[^\s>]+)'
    _docstring = pl.col("docstring")
    expressions_df.select(
        "namespace",
        "member",
        # `extract` stops at the first match; `extract_all` returns a list.
        url_match=_docstring.str.extract(url_pattern),
        url_matches=_docstring.str.extract_all(url_pattern),
    ).filter(pl.col("url_match").is_not_null())
    return (url_pattern,)
|
735 |
+
|
736 |
+
|
737 |
+
@app.cell(hide_code=True)
|
738 |
+
def _(mo):
|
739 |
+
mo.md(
|
740 |
+
r"""
|
741 |
+
Note that in each `docstring` where a code example involving dataframes is present, we will see an output such as "shape: (5, 2)" indicating the number of rows and columns of the dataframe produced by the sample code. Let's say we would like to *capture* this information in a structured way.
|
742 |
+
|
743 |
+
[`extract_groups`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.extract_groups.html) is a really powerful expression allowing us to achieve exactly that.
|
744 |
+
|
745 |
+
Below we define the regular expression `r"shape:\s*\((?<height>\S+),\s*(?<width>\S+)\)"` with two capture groups, named `height` and `width` and pass it as the parameter of `extract_groups`. After execution, for each `docstring`, we end up with fully structured data we can further process downstream!
|
746 |
+
"""
|
747 |
+
)
|
748 |
+
return
|
749 |
+
|
750 |
+
|
751 |
+
@app.cell
def _(expressions_df, pl):
    # Named capture groups (`height`, `width`) become fields of the
    # resulting struct column.
    _shape_pattern = r"shape:\s*\((?<height>\S+),\s*(?<width>\S+)\)"
    expressions_df.select(
        "namespace",
        "member",
        example_df_shape=pl.col("docstring").str.extract_groups(_shape_pattern),
    )
    return
|
759 |
+
|
760 |
+
|
761 |
+
@app.cell(hide_code=True)
|
762 |
+
def _(mo):
|
763 |
+
mo.md(
|
764 |
+
r"""
|
765 |
+
## 🧹 Stripping
|
766 |
+
|
767 |
+
Strings might require some cleaning before further processing, such as the removal of some characters from the beginning or end of the text. [`strip_chars_start`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.strip_chars_start.html), [`strip_chars_end`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.strip_chars_end.html) and [`strip_chars`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.strip_chars.html) are here to facilitate this.
|
768 |
+
|
769 |
+
All we need to do is to specify a set of characters we would like to get rid of and Polars handles the rest for us.
|
770 |
+
"""
|
771 |
+
)
|
772 |
+
return
|
773 |
+
|
774 |
+
|
775 |
+
@app.cell
def _(expressions_df, pl):
    # The argument to `strip_chars*` is treated as a *set* of characters,
    # not as a substring sequence.
    _member = pl.col("member")
    expressions_df.select(
        "member",
        member_front_stripped=_member.str.strip_chars_start("a"),
        member_back_stripped=_member.str.strip_chars_end("n"),
        member_fully_stripped=_member.str.strip_chars("na"),
    )
    return
|
784 |
+
|
785 |
+
|
786 |
+
@app.cell(hide_code=True)
|
787 |
+
def _(mo):
|
788 |
+
mo.md(
|
789 |
+
r"""
|
790 |
+
Note that when using the above expressions, the specified characters do not need to form a sequence; they are handled as a set. However, in certain use cases we only want to strip complete substrings, so we would need our input to be strictly treated as a sequence rather than as a set.
|
791 |
+
|
792 |
+
That's exactly the rationale behind [`strip_prefix`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.strip_prefix.html) and [`strip_suffix`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.strip_suffix.html).
|
793 |
+
|
794 |
+
Below we use these to remove the `"to_"` prefixes and `"_with"` suffixes from each member name.
|
795 |
+
"""
|
796 |
+
)
|
797 |
+
return
|
798 |
+
|
799 |
+
|
800 |
+
@app.cell
def _(expressions_df, pl):
    # Unlike `strip_chars*`, these treat their argument as an exact
    # substring (prefix/suffix), not as a character set.
    _stripped = expressions_df.select(
        "member",
        member_prefix_stripped=pl.col("member").str.strip_prefix("to_"),
        member_suffix_stripped=pl.col("member").str.strip_suffix("_with"),
    )
    _stripped.slice(20)
    return
|
808 |
+
|
809 |
+
|
810 |
+
@app.cell(hide_code=True)
|
811 |
+
def _(mo):
|
812 |
+
mo.md(
|
813 |
+
r"""
|
814 |
+
## 🔑 Encoding & Decoding
|
815 |
+
|
816 |
+
Should you find yourself in the need of encoding your strings into [base64](https://en.wikipedia.org/wiki/Base64) or [hexadecimal](https://en.wikipedia.org/wiki/Hexadecimal) format, then Polars has your back with its [`encode`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.encode.html) expression.
|
817 |
+
"""
|
818 |
+
)
|
819 |
+
return
|
820 |
+
|
821 |
+
|
822 |
+
@app.cell
def _(expressions_df, pl):
    # Encode member names into base64 and hexadecimal representations.
    _member = pl.col("member")
    encoded_df = expressions_df.select(
        "member",
        member_base64=_member.str.encode('base64'),
        member_hex=_member.str.encode('hex'),
    )
    encoded_df
    return (encoded_df,)
|
831 |
+
|
832 |
+
|
833 |
+
@app.cell(hide_code=True)
|
834 |
+
def _(mo):
|
835 |
+
mo.md(r"""And of course, you can convert back into a human-readable representation using the [`decode`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.decode.html) expression.""")
|
836 |
+
return
|
837 |
+
|
838 |
+
|
839 |
+
@app.cell
def _(encoded_df, pl):
    # Round-trip back to readable text; `decode` yields binary data, hence
    # the explicit cast to String.
    encoded_df.with_columns(
        member_base64_decoded=pl.col('member_base64')
        .str.decode('base64')
        .cast(pl.String),
        member_hex_decoded=pl.col('member_hex')
        .str.decode('hex')
        .cast(pl.String),
    )
    return
|
846 |
+
|
847 |
+
|
848 |
+
@app.cell(hide_code=True)
|
849 |
+
def _(mo):
|
850 |
+
mo.md(
|
851 |
+
r"""
|
852 |
+
## 🚀 Application: Dynamic Execution of Polars Examples
|
853 |
+
|
854 |
+
Now that we are familiar with string expressions, we can combine them with other Polars operations to build a fully interactive playground where code examples of Polars expressions can be explored.
|
855 |
+
|
856 |
+
We make use of string expressions to extract the raw Python source code of examples from the docstrings and we leverage the interactive Marimo environment to enable the selection of expressions via a searchable dropdown and a fully functional code editor whose output is rendered with Marimo's rich display utilities.
|
857 |
+
|
858 |
+
In other words, we will use Polars to execute Polars. ❄️ How cool is that?
|
859 |
+
|
860 |
+
---
|
861 |
+
"""
|
862 |
+
)
|
863 |
+
return
|
864 |
+
|
865 |
+
|
866 |
+
@app.cell(hide_code=True)
|
867 |
+
def _(
|
868 |
+
example_editor,
|
869 |
+
execution_result,
|
870 |
+
expression,
|
871 |
+
expression_description,
|
872 |
+
expression_docs_link,
|
873 |
+
mo,
|
874 |
+
):
|
875 |
+
mo.vstack(
|
876 |
+
[
|
877 |
+
mo.md(f'### {expression.value}'),
|
878 |
+
expression,
|
879 |
+
mo.hstack([expression_description, expression_docs_link]),
|
880 |
+
example_editor,
|
881 |
+
execution_result,
|
882 |
+
]
|
883 |
+
)
|
884 |
+
return
|
885 |
+
|
886 |
+
|
887 |
+
@app.cell(hide_code=True)
|
888 |
+
def _(mo, selected_expression_record):
|
889 |
+
expression_description = mo.md(selected_expression_record["description"])
|
890 |
+
expression_docs_link = mo.md(
|
891 |
+
f"🐻❄️ [Official Docs](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.{selected_expression_record['expr']}.html)"
|
892 |
+
)
|
893 |
+
return expression_description, expression_docs_link
|
894 |
+
|
895 |
+
|
896 |
+
@app.cell(hide_code=True)
|
897 |
+
def _(example_editor, execute_code):
|
898 |
+
execution_result = execute_code(example_editor.value)
|
899 |
+
return (execution_result,)
|
900 |
+
|
901 |
+
|
902 |
+
@app.cell(hide_code=True)
|
903 |
+
def _(code_df, mo):
|
904 |
+
expression = mo.ui.dropdown(code_df.get_column('expr'), value='arr.all', searchable=True)
|
905 |
+
return (expression,)
|
906 |
+
|
907 |
+
|
908 |
+
@app.cell(hide_code=True)
|
909 |
+
def _(code_df, expression):
|
910 |
+
selected_expression_record = code_df.filter(expr=expression.value).to_dicts()[0]
|
911 |
+
return (selected_expression_record,)
|
912 |
+
|
913 |
+
|
914 |
+
@app.cell(hide_code=True)
|
915 |
+
def _(mo, selected_expression_record):
|
916 |
+
example_editor = mo.ui.code_editor(value=selected_expression_record["code"])
|
917 |
+
return (example_editor,)
|
918 |
+
|
919 |
+
|
920 |
+
@app.cell(hide_code=True)
def _(expressions_df, pl):
    # One row per API member with its fully-qualified name, a short
    # description, and its doctest example reassembled as runnable code.
    _per_member = expressions_df.select(
        # Fully-qualified expression name, e.g. "str.split"; root members
        # keep their bare name.
        expr=pl.when(pl.col("namespace") == "root")
        .then("member")
        .otherwise(pl.concat_str(["namespace", "member"], separator=".")),
        # First docstring paragraph, minus its leading label.
        description=pl.col("docstring")
        .str.split("\n\n")
        .list.get(0)
        .str.slice(9),
        docstring_lines=pl.col("docstring").str.split("\n"),
    )
    code_df = (
        _per_member.with_row_index()
        # One row per docstring line.
        .explode("docstring_lines")
        .rename({"docstring_lines": "docstring_line"})
        .with_columns(pl.col("docstring_line").str.strip_chars(" "))
        # Keep only doctest lines (">>> " prompts and "... " continuations)...
        .filter(pl.col("docstring_line").str.contains_any([">>> ", "... "]))
        # ...then drop the 4-character prompt prefix.
        .with_columns(pl.col("docstring_line").str.slice(4))
        # Reassemble each member's example into a single code string.
        .group_by(pl.exclude("docstring_line"), maintain_order=True)
        .agg(code=pl.col("docstring_line").str.join("\n"))
        .drop("index")
    )
    return (code_df,)
|
944 |
+
|
945 |
+
|
946 |
+
@app.cell(hide_code=True)
def _():
    def execute_code(code: str):
        """Execute `code` and return the value of its trailing expression.

        Mimics REPL semantics: every statement runs, and if the final
        statement is an expression its value is returned; otherwise None.

        SECURITY: this uses `exec`/`eval` and therefore runs arbitrary code
        with this module's globals. Only feed it trusted input (here: the
        doctest examples shipped inside Polars' own docstrings).
        """
        import ast

        # Namespace receiving names the executed code defines; module
        # globals remain readable so examples can reference shared imports.
        local_namespace = {}

        parsed_code = ast.parse(code)
        if not parsed_code.body:
            return None

        if not isinstance(parsed_code.body[-1], ast.Expr):
            # Last statement is not an expression (e.g. an assignment):
            # run everything, nothing to report back.
            exec(code, globals(), local_namespace)
            return None

        # Split off the trailing expression so its value can be captured.
        last_expr = ast.Expression(parsed_code.body[-1].value)
        parsed_code.body = parsed_code.body[:-1]

        if parsed_code.body:
            exec(
                compile(parsed_code, "<string>", "exec"),
                globals(),
                local_namespace,
            )

        # NOTE(review): with separate globals/locals dicts, functions defined
        # inside `code` cannot see other names `code` defined; acceptable for
        # the flat doctest snippets exercised here — confirm if inputs change.
        return eval(
            compile(last_expr, "<string>", "eval"), globals(), local_namespace
        )
    return (execute_code,)
|
990 |
+
|
991 |
+
|
992 |
+
@app.cell(hide_code=True)
def _():
    import random

    import altair as alt
    import marimo as mo
    import polars as pl

    # Fixed seed keeps the word-cloud layout reproducible across runs.
    random.seed(42)
    return alt, mo, pl, random
|
1001 |
+
|
1002 |
+
|
1003 |
+
if __name__ == "__main__":
|
1004 |
+
app.run()
|
polars/14_user_defined_functions.py
ADDED
@@ -0,0 +1,946 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# /// script
|
2 |
+
# requires-python = ">=3.12"
|
3 |
+
# dependencies = [
|
4 |
+
# "altair==5.5.0",
|
5 |
+
# "beautifulsoup4==4.13.3",
|
6 |
+
# "httpx==0.28.1",
|
7 |
+
# "marimo",
|
8 |
+
# "nest-asyncio==1.6.0",
|
9 |
+
# "numba==0.61.0",
|
10 |
+
# "numpy==2.1.3",
|
11 |
+
# "polars==1.24.0",
|
12 |
+
# ]
|
13 |
+
# ///
|
14 |
+
|
15 |
+
import marimo
|
16 |
+
|
17 |
+
__generated_with = "0.11.17"
|
18 |
+
app = marimo.App(width="medium")
|
19 |
+
|
20 |
+
|
21 |
+
@app.cell(hide_code=True)
|
22 |
+
def _(mo):
|
23 |
+
mo.md(
|
24 |
+
r"""
|
25 |
+
# User-Defined Functions
|
26 |
+
|
27 |
+
_By [Péter Ferenc Gyarmati](http://github.com/peter-gy)_.
|
28 |
+
|
29 |
+
Throughout the previous chapters, you've seen how Polars provides a comprehensive set of built-in expressions for flexible data transformation. But what happens when you need something *more*? Perhaps your project has unique requirements, or you need to integrate functionality from an external Python library. This is where User-Defined Functions (UDFs) come into play, allowing you to extend Polars with your own custom logic.
|
30 |
+
|
31 |
+
In this chapter, we'll weigh the performance trade-offs of UDFs, pinpoint situations where they're truly beneficial, and explore different ways to effectively incorporate them into your Polars workflows. We'll walk through a complete, practical example.
|
32 |
+
"""
|
33 |
+
)
|
34 |
+
return
|
35 |
+
|
36 |
+
|
37 |
+
@app.cell(hide_code=True)
|
38 |
+
def _(mo):
|
39 |
+
mo.md(
|
40 |
+
r"""
|
41 |
+
## ⚖️ The Cost of UDFs
|
42 |
+
|
43 |
+
> Performance vs. Flexibility
|
44 |
+
|
45 |
+
Polars' built-in expressions are highly optimized for speed and parallel processing. User-defined functions (UDFs), however, introduce a significant performance overhead because they rely on standard Python code, which often runs in a single thread and bypasses Polars' logical optimizations. Therefore, always prioritize native Polars operations *whenever possible*.
|
46 |
+
|
47 |
+
However, UDFs become inevitable when you need to:
|
48 |
+
|
49 |
+
- **Integrate external libraries:** Use functionality not directly available in Polars.
|
50 |
+
- **Implement custom logic:** Handle complex transformations that can't be easily expressed with Polars' built-in functions.
|
51 |
+
|
52 |
+
Let's dive into a real-world project where UDFs were the only way to get the job done, demonstrating a scenario where native Polars expressions simply weren't sufficient.
|
53 |
+
"""
|
54 |
+
)
|
55 |
+
return
|
56 |
+
|
57 |
+
|
58 |
+
@app.cell(hide_code=True)
|
59 |
+
def _(mo):
|
60 |
+
mo.md(
|
61 |
+
r"""
|
62 |
+
## 📊 Project Overview
|
63 |
+
|
64 |
+
> Scraping and Analyzing Observable Notebook Statistics
|
65 |
+
|
66 |
+
If you're into data visualization, you've probably seen [D3.js](https://d3js.org/) and [Observable Plot](https://observablehq.com/plot/). Both have extensive galleries showcasing amazing visualizations. Each gallery item is a standalone [Observable notebook](https://observablehq.com/documentation/notebooks/), with metrics like stars, comments, and forks – indicators of popularity. But getting and analyzing these statistics directly isn't straightforward. We'll need to scrape the web.
|
67 |
+
"""
|
68 |
+
)
|
69 |
+
return
|
70 |
+
|
71 |
+
|
72 |
+
@app.cell(hide_code=True)
|
73 |
+
def _(mo):
|
74 |
+
mo.hstack(
|
75 |
+
[
|
76 |
+
mo.image(
|
77 |
+
"https://minio.peter.gy/static/assets/marimo/learn/polars/14_d3-gallery.png?0",
|
78 |
+
width=600,
|
79 |
+
caption="Screenshot of https://observablehq.com/@d3/gallery",
|
80 |
+
),
|
81 |
+
mo.image(
|
82 |
+
"https://minio.peter.gy/static/assets/marimo/learn/polars/14_plot-gallery.png?0",
|
83 |
+
width=600,
|
84 |
+
caption="Screenshot of https://observablehq.com/@observablehq/plot-gallery",
|
85 |
+
),
|
86 |
+
]
|
87 |
+
)
|
88 |
+
return
|
89 |
+
|
90 |
+
|
91 |
+
@app.cell(hide_code=True)
|
92 |
+
def _(mo):
|
93 |
+
mo.md(r"""Our goal is to use Polars UDFs to fetch the HTML content of these gallery pages. Then, we'll use the `BeautifulSoup` Python library to parse the HTML and extract the relevant metadata. After some data wrangling with native Polars expressions, we'll have a DataFrame listing each visualization notebook. Then, we'll use another UDF to retrieve the number of likes, forks, and comments for each notebook. Finally, we will create our own high-performance UDF to implement a custom notebook ranking scheme. This will involve multiple steps, showcasing different UDF approaches.""")
|
94 |
+
return
|
95 |
+
|
96 |
+
|
97 |
+
@app.cell(hide_code=True)
|
98 |
+
def _(mo):
|
99 |
+
mo.mermaid('''
|
100 |
+
graph LR;
|
101 |
+
url_df --> |"UDF: Fetch HTML"| html_df
|
102 |
+
html_df --> |"UDF: Parse with BeautifulSoup"| parsed_html_df
|
103 |
+
parsed_html_df --> |"Native Polars: Extract Data"| notebooks_df
|
104 |
+
notebooks_df --> |"UDF: Get Notebook Stats"| notebook_stats_df
|
105 |
+
notebook_stats_df --> |"Numba UDF: Compute Popularity"| notebook_popularity_df
|
106 |
+
''')
|
107 |
+
return
|
108 |
+
|
109 |
+
|
110 |
+
@app.cell(hide_code=True)
|
111 |
+
def _(mo):
|
112 |
+
mo.md(r"""Our starting point, `url_df`, is a simple DataFrame with a single `url` column containing the URLs of the D3 and Observable Plot gallery notebooks.""")
|
113 |
+
return
|
114 |
+
|
115 |
+
|
116 |
+
@app.cell(hide_code=True)
|
117 |
+
def _(pl):
|
118 |
+
url_df = pl.from_dict(
|
119 |
+
{
|
120 |
+
"url": [
|
121 |
+
"https://observablehq.com/@d3/gallery",
|
122 |
+
"https://observablehq.com/@observablehq/plot-gallery",
|
123 |
+
]
|
124 |
+
}
|
125 |
+
)
|
126 |
+
url_df
|
127 |
+
return (url_df,)
|
128 |
+
|
129 |
+
|
130 |
+
@app.cell(hide_code=True)
|
131 |
+
def _(mo):
|
132 |
+
mo.md(
|
133 |
+
r"""
|
134 |
+
## 🔂 Element-Wise UDFs
|
135 |
+
|
136 |
+
> Processing Value by Value
|
137 |
+
|
138 |
+
The most common way to use UDFs is to apply them element-wise. This means our custom function will execute for *each individual row* in a specified column. Our first task is to fetch the HTML content for each URL in `url_df`.
|
139 |
+
|
140 |
+
We'll define a Python function that takes a `url` (a string) as input, uses the `httpx` library (an HTTP client) to fetch the content, and returns the HTML as a string. We then integrate this function into Polars using the [`map_elements`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.map_elements.html) expression.
|
141 |
+
|
142 |
+
You'll notice we have to explicitly specify the `return_dtype`. This is *crucial*. Polars doesn't automatically know what our custom function will return. We're responsible for defining the function's logic and, therefore, its output type. By providing the `return_dtype`, we help Polars maintain its internal representation of the DataFrame's schema, enabling query optimization. Think of it as giving Polars a "heads-up" about the data type it should expect.
|
143 |
+
"""
|
144 |
+
)
|
145 |
+
return
|
146 |
+
|
147 |
+
|
148 |
+
@app.cell(hide_code=True)
def _(httpx, pl, url_df):
    # Fetch each gallery page's HTML element-wise. Network-bound and
    # sequential — one blocking request per row; relies on httpx's default
    # timeout (presumably sufficient here — confirm for larger URL lists).
    def _fetch_html(url: str) -> str:
        return httpx.get(url).text

    html_df = url_df.with_columns(
        html=pl.col("url").map_elements(_fetch_html, return_dtype=pl.String)
    )
    html_df
    return (html_df,)
|
158 |
+
|
159 |
+
|
160 |
+
@app.cell(hide_code=True)
|
161 |
+
def _(mo):
|
162 |
+
mo.md(
|
163 |
+
r"""
|
164 |
+
Now, `html_df` holds the HTML for each URL. We need to parse it. Again, a UDF is the way to go. Parsing HTML with native Polars expressions would be a nightmare! Instead, we'll use the [`beautifulsoup4`](https://pypi.org/project/beautifulsoup4/) library, a standard tool for this.
|
165 |
+
|
166 |
+
These Observable pages are built with [Next.js](https://nextjs.org/), which helpfully serializes page properties as JSON within the HTML. This simplifies our UDF: we'll extract the raw JSON from the `<script id="__NEXT_DATA__" type="application/json">` tag. We'll use [`map_elements`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.map_elements.html) again. For clarity, we'll define this UDF as a named function, `extract_nextjs_data`, since it's a bit more complex than a simple HTTP request.
|
167 |
+
"""
|
168 |
+
)
|
169 |
+
return
|
170 |
+
|
171 |
+
|
172 |
+
@app.cell(hide_code=True)
|
173 |
+
def _(BeautifulSoup):
|
174 |
+
def extract_nextjs_data(html: str) -> str:
|
175 |
+
soup = BeautifulSoup(html, "html.parser")
|
176 |
+
script_tag = soup.find("script", id="__NEXT_DATA__")
|
177 |
+
return script_tag.text
|
178 |
+
return (extract_nextjs_data,)
|
179 |
+
|
180 |
+
|
181 |
+
@app.cell(hide_code=True)
def _(extract_nextjs_data, html_df, pl):
    # Swap the raw HTML column for the embedded Next.js JSON payload.
    parsed_html_df = html_df.select(
        "url",
        next_data=pl.col("html").map_elements(
            extract_nextjs_data, return_dtype=pl.String
        ),
    )
    parsed_html_df
    return (parsed_html_df,)
|
192 |
+
|
193 |
+
|
194 |
+
@app.cell(hide_code=True)
def _(mo):
    # Narrative cell: introduces the JSON-wrangling step that yields notebooks_df.
    mo.md(r"""With some data wrangling of the raw JSON (using *native* Polars expressions!), we get `notebooks_df`, containing the metadata for each notebook.""")
    return
|
198 |
+
|
199 |
+
|
200 |
+
@app.cell(hide_code=True)
def _(parsed_html_df, pl):
    # Wrangle the raw Next.js JSON into one tidy row per gallery notebook,
    # using only native Polars expressions.
    notebooks_df = (
        parsed_html_df.select(
            "url",
            # We extract the content of every cell present in the gallery notebooks
            cell=pl.col("next_data")
            .str.json_path_match("$.props.pageProps.initialNotebook.nodes")
            .str.json_decode()
            .list.eval(pl.element().struct.field("value")),
        )
        # We want one row per cell
        .explode("cell")
        # Only keep categorized notebook listing cells starting with H3
        .filter(pl.col("cell").str.starts_with("### "))
        # Split up the cells into [heading, description, config] sections
        .with_columns(pl.col("cell").str.split("\n\n"))
        .select(
            gallery_url="url",
            # Text after the '### ' heading, ignore '<!--' comments
            category=pl.col("cell").list.get(0).str.extract(r"###\s+(.*?)(?:\s+<!--.*?-->|$)"),
            # Paragraph after heading
            description=pl.col("cell")
            .list.get(1)
            .str.strip_chars(" ")
            .str.replace_all("](/", "](https://observablehq.com/", literal=True),
            # Parsed notebook config from ${previews([{...}])}
            notebooks=pl.col("cell")
            .list.get(2)
            .str.strip_prefix("${previews([")
            .str.strip_suffix("]})}")
            .str.strip_chars(" \n")
            .str.split("},")
            # Simple regex-based attribute extraction from JS/JSON objects like
            # ```js
            # {
            #   path: "@d3/spilhaus-shoreline-map",
            #   "thumbnail": "66a87355e205d820...",
            #   title: "Spilhaus shoreline map",
            #   "author": "D3"
            # }
            # ```
            .list.eval(
                pl.struct(
                    *(
                        pl.element()
                        # rf-string: keeps `\s` as a regex escape rather than an
                        # invalid Python string escape (SyntaxWarning on 3.12+).
                        .str.extract(rf'(?:"{key}"|{key})\s*:\s*"([^"]*)"')
                        .alias(key)
                        for key in ["path", "thumbnail", "title"]
                    )
                )
            ),
        )
        .explode("notebooks")
        .unnest("notebooks")
        .filter(pl.col("path").is_not_null())
        # Final projection to end up with directly usable values
        .select(
            pl.concat_str(
                [
                    pl.lit("https://static.observableusercontent.com/thumbnail/"),
                    "thumbnail",
                    pl.lit(".jpg"),
                ],
            ).alias("notebook_thumbnail_src"),
            "category",
            "title",
            "description",
            pl.concat_str(
                [pl.lit("https://observablehq.com"), "path"], separator="/"
            ).alias("notebook_url"),
        )
    )
    notebooks_df
    return (notebooks_df,)
|
275 |
+
|
276 |
+
|
277 |
+
@app.cell(hide_code=True)
|
278 |
+
def _(mo):
|
279 |
+
mo.md(
|
280 |
+
r"""
|
281 |
+
## 📦 Batch-Wise UDFs
|
282 |
+
|
283 |
+
> Processing Entire Series
|
284 |
+
|
285 |
+
`map_elements` calls the UDF for *each row*. Fine for our tiny, two-rows-tall `url_df`. But `notebooks_df` has almost 400 rows! Individual HTTP requests for each would be painfully slow.
|
286 |
+
|
287 |
+
We want stats for each notebook in `notebooks_df`. To avoid sequential requests, we'll use Polars' [`map_batches`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.map_batches.html). This lets us process an *entire Series* (a column) at once.
|
288 |
+
|
289 |
+
Our UDF, `fetch_html_batch`, will take a *Series* of URLs and use `asyncio` to make concurrent requests – a huge performance boost.
|
290 |
+
"""
|
291 |
+
)
|
292 |
+
return
|
293 |
+
|
294 |
+
|
295 |
+
@app.cell(hide_code=True)
def _(Iterable, asyncio, httpx, mo):
    async def _gather_pages(urls: Iterable[str]) -> tuple[str, ...]:
        """Download all URLs concurrently and return their HTML bodies in order."""
        async with httpx.AsyncClient(timeout=15) as client:
            pending = [client.get(u) for u in urls]
            responses = await asyncio.gather(*pending)
            return tuple(r.text for r in responses)

    @mo.cache
    def fetch_html_batch(urls: Iterable[str]) -> tuple[str, ...]:
        """Synchronous, cached entry point usable as a batch-wise Polars UDF."""
        return asyncio.run(_gather_pages(urls))
    return (fetch_html_batch,)
|
307 |
+
|
308 |
+
|
309 |
+
@app.cell(hide_code=True)
|
310 |
+
def _(mo):
|
311 |
+
mo.callout(
|
312 |
+
mo.md("""
|
313 |
+
Since `fetch_html_batch` is a pure Python function and performs multiple network requests, it's a good candidate for caching. We use [`mo.cache`](https://docs.marimo.io/api/caching/#marimo.cache) to avoid redundant requests to the same URL. This is a simple way to improve performance without modifying the core logic.
|
314 |
+
"""
|
315 |
+
),
|
316 |
+
kind="info",
|
317 |
+
)
|
318 |
+
return
|
319 |
+
|
320 |
+
|
321 |
+
@app.cell(hide_code=True)
|
322 |
+
def _(mo, notebooks_df):
|
323 |
+
category = mo.ui.dropdown(
|
324 |
+
notebooks_df.sort("category").get_column("category"),
|
325 |
+
value="Maps",
|
326 |
+
)
|
327 |
+
return (category,)
|
328 |
+
|
329 |
+
|
330 |
+
@app.cell(hide_code=True)
def _(category, extract_nextjs_data, fetch_html_batch, notebooks_df, pl):
    # Fetch each notebook page (batched, concurrent) and derive engagement stats.
    notebook_stats_df = (
        # Setting filter upstream to limit number of concurrent HTTP requests
        notebooks_df.filter(category=category.value)
        .with_columns(
            # map_batches hands the whole column to fetch_html_batch; the
            # returned tuple comes back as one list value, hence explode().
            notebook_html=pl.col("notebook_url")
            .map_batches(fetch_html_batch, return_dtype=pl.List(pl.String))
            .explode()
        )
        .with_columns(
            # Extract and decode the embedded Next.js JSON for each page.
            notebook_data=pl.col("notebook_html")
            .map_elements(
                extract_nextjs_data,
                return_dtype=pl.String,
            )
            .str.json_path_match("$.props.pageProps.initialNotebook")
            .str.json_decode()
        )
        .drop("notebook_html")
        .with_columns(
            # Promote the stats we care about to top-level columns.
            *[
                pl.col("notebook_data").struct.field(key).alias(key)
                for key in ["likes", "forks", "comments", "license"]
            ]
        )
        .drop("notebook_data")
        # `comments` arrives as a list of comment objects; keep only the count.
        .with_columns(pl.col("comments").list.len())
        .select(
            # Move the long text columns to the end for nicer table display.
            pl.exclude("description", "notebook_url"),
            "description",
            "notebook_url",
        )
        .sort("likes", descending=True)
    )
    return (notebook_stats_df,)
|
366 |
+
|
367 |
+
|
368 |
+
@app.cell(hide_code=True)
|
369 |
+
def _(mo, notebook_stats_df):
|
370 |
+
notebooks = mo.ui.table(notebook_stats_df, selection='single', initial_selection=[2], page_size=5)
|
371 |
+
notebook_height = mo.ui.slider(start=400, stop=2000, value=825, step=25, show_value=True, label='Notebook Height')
|
372 |
+
return notebook_height, notebooks
|
373 |
+
|
374 |
+
|
375 |
+
@app.cell(hide_code=True)
def _():
    def nb_iframe(notebook_url: str, height=825) -> str:
        """Render an Observable notebook URL as an embeddable iframe snippet."""
        src = notebook_url.replace(
            "https://observablehq.com", "https://observablehq.com/embed"
        )
        return f'<iframe width="100%" height="{height}" frameborder="0" src="{src}?cell=*"></iframe>'
    return (nb_iframe,)
|
383 |
+
|
384 |
+
|
385 |
+
@app.cell(hide_code=True)
|
386 |
+
def _(mo):
|
387 |
+
mo.md(r"""Now that we have access to notebook-level statistics, we can rank the visualizations by the number of likes they received & display them interactively.""")
|
388 |
+
return
|
389 |
+
|
390 |
+
|
391 |
+
@app.cell(hide_code=True)
|
392 |
+
def _(mo):
|
393 |
+
mo.callout("💡 Explore the visualizations by paging through the table below and selecting any of its rows.")
|
394 |
+
return
|
395 |
+
|
396 |
+
|
397 |
+
@app.cell(hide_code=True)
|
398 |
+
def _(category, mo, nb_iframe, notebook_height, notebooks):
|
399 |
+
notebook = notebooks.value.to_dicts()[0]
|
400 |
+
mo.vstack(
|
401 |
+
[
|
402 |
+
mo.hstack([category, notebook_height]),
|
403 |
+
notebooks,
|
404 |
+
mo.md(f"{notebook['description']}"),
|
405 |
+
mo.md('---'),
|
406 |
+
mo.md(nb_iframe(notebook["notebook_url"], notebook_height.value)),
|
407 |
+
]
|
408 |
+
)
|
409 |
+
return (notebook,)
|
410 |
+
|
411 |
+
|
412 |
+
@app.cell(hide_code=True)
|
413 |
+
def _(mo):
|
414 |
+
mo.md(
|
415 |
+
r"""
|
416 |
+
## ⚙️ Row-Wise UDFs
|
417 |
+
|
418 |
+
> Accessing All Columns at Once
|
419 |
+
|
420 |
+
Sometimes, you need to work with *all* columns of a row at once. This is where [`map_rows`](https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.map_rows.html) comes in. It operates directly on the DataFrame, passing each row to your UDF *as a tuple*.
|
421 |
+
|
422 |
+
Below, `create_notebook_summary` takes a row from `notebook_stats_df` (as a tuple) and returns a formatted Markdown string summarizing the notebook's key stats. We're essentially reducing the DataFrame to a single column. While this *could* be done with native Polars expressions, it would be much more cumbersome. This example demonstrates a case where a row-wise UDF simplifies the code, even if the underlying operation isn't inherently complex.
|
423 |
+
"""
|
424 |
+
)
|
425 |
+
return
|
426 |
+
|
427 |
+
|
428 |
+
@app.cell(hide_code=True)
def _():
    def create_notebook_summary(row: tuple) -> str:
        """Format one notebook_stats_df row (passed as a tuple) as a Markdown summary.

        Tuple order must match notebook_stats_df's column order:
        (thumbnail_src, category, title, likes, forks, comments, license,
        description, notebook_url).
        """
        (
            thumbnail_src,
            category,  # unpacked for positional completeness; not rendered below
            title,
            likes,
            forks,
            comments,
            license,
            description,  # unpacked for positional completeness; not rendered below
            notebook_url,
        ) = row
        return (
            f"""
### [{title}]({notebook_url})

<div style="display: grid; grid-template-columns: 1fr 1fr; gap: 12px; margin: 12px 0;">
<div>⭐ <strong>Likes:</strong> {likes}</div>
<div>↗️ <strong>Forks:</strong> {forks}</div>
<div>💬 <strong>Comments:</strong> {comments}</div>
<div>⚖️ <strong>License:</strong> {license}</div>
</div>

<a href="{notebook_url}" target="_blank">
<img src="{thumbnail_src}" style="height: 300px;" />
</a>
""".strip('\n')
        )
    return (create_notebook_summary,)
|
459 |
+
|
460 |
+
|
461 |
+
@app.cell(hide_code=True)
|
462 |
+
def _(create_notebook_summary, notebook_stats_df, pl):
|
463 |
+
notebook_summary_df = notebook_stats_df.map_rows(
|
464 |
+
create_notebook_summary,
|
465 |
+
return_dtype=pl.String,
|
466 |
+
).rename({"map": "summary"})
|
467 |
+
notebook_summary_df.head(1)
|
468 |
+
return (notebook_summary_df,)
|
469 |
+
|
470 |
+
|
471 |
+
@app.cell(hide_code=True)
|
472 |
+
def _(mo):
|
473 |
+
mo.callout("💡 You can explore individual notebook statistics through the carousel. Discover the visualization's source code by clicking the notebook title or the thumbnail.")
|
474 |
+
return
|
475 |
+
|
476 |
+
|
477 |
+
@app.cell(hide_code=True)
|
478 |
+
def _(mo, notebook_summary_df):
|
479 |
+
mo.carousel(
|
480 |
+
[
|
481 |
+
mo.lazy(mo.md(summary))
|
482 |
+
for summary in notebook_summary_df.get_column("summary")
|
483 |
+
]
|
484 |
+
)
|
485 |
+
return
|
486 |
+
|
487 |
+
|
488 |
+
@app.cell(hide_code=True)
|
489 |
+
def _(mo):
|
490 |
+
mo.md(
|
491 |
+
r"""
|
492 |
+
## 🚀 Higher-performance UDFs
|
493 |
+
|
494 |
+
> Leveraging Numba to Make Python Fast
|
495 |
+
|
496 |
+
Python code doesn't *always* mean slow code. While UDFs *often* introduce performance overhead, there are exceptions. NumPy's universal functions ([`ufuncs`](https://numpy.org/doc/stable/reference/ufuncs.html)) and generalized universal functions ([`gufuncs`](https://numpy.org/neps/nep-0005-generalized-ufuncs.html)) provide high-performance operations on NumPy arrays, thanks to low-level implementations.
|
497 |
+
|
498 |
+
But NumPy's built-in functions are predefined. We can't easily use them for *custom* logic. Enter [`numba`](https://numba.pydata.org/). Numba is a just-in-time (JIT) compiler that translates Python functions into optimized machine code *at runtime*. It provides decorators like [`numba.guvectorize`](https://numba.readthedocs.io/en/stable/user/vectorize.html#the-guvectorize-decorator) that let us create our *own* high-performance `gufuncs` – *without* writing low-level code!
|
499 |
+
"""
|
500 |
+
)
|
501 |
+
return
|
502 |
+
|
503 |
+
|
504 |
+
@app.cell(hide_code=True)
|
505 |
+
def _(mo):
|
506 |
+
mo.md(
|
507 |
+
r"""
|
508 |
+
Let's create a custom popularity metric to rank notebooks, considering likes, forks, *and* comments (not just likes). We'll define `weighted_popularity_numba`, decorated with `@numba.guvectorize`. The decorator arguments specify that we're taking three integer vectors of length `n` and returning a float vector of length `n`.
|
509 |
+
|
510 |
+
The weighted popularity score for each notebook is calculated using the following formula:
|
511 |
+
|
512 |
+
$$
|
513 |
+
\begin{equation}
|
514 |
+
\text{score}_i = w_l \cdot l_i^{f} + w_f \cdot f_i^{f} + w_c \cdot c_i^{f}
|
515 |
+
\end{equation}
|
516 |
+
$$
|
517 |
+
|
518 |
+
with:
|
519 |
+
"""
|
520 |
+
)
|
521 |
+
return
|
522 |
+
|
523 |
+
|
524 |
+
@app.cell(hide_code=True)
|
525 |
+
def _(mo, non_linear_factor, weight_comments, weight_forks, weight_likes):
|
526 |
+
mo.md(rf"""
|
527 |
+
| Symbol | Description |
|
528 |
+
|--------|-------------|
|
529 |
+
| $\text{{score}}_i$ | Popularity score for the *i*-th notebook |
|
530 |
+
| $w_l = {weight_likes.value}$ | Weight for likes |
|
531 |
+
| $l_i$ | Number of likes for the *i*-th notebook |
|
532 |
+
| $w_f = {weight_forks.value}$ | Weight for forks |
|
533 |
+
| $f_i$ | Number of forks for the *i*-th notebook |
|
534 |
+
| $w_c = {weight_comments.value}$ | Weight for comments |
|
535 |
+
| $c_i$ | Number of comments for the *i*-th notebook |
|
536 |
+
| $f = {non_linear_factor.value}$ | Non-linear factor (exponent) |
|
537 |
+
""")
|
538 |
+
return
|
539 |
+
|
540 |
+
|
541 |
+
@app.cell(hide_code=True)
|
542 |
+
def _(mo):
|
543 |
+
weight_likes = mo.ui.slider(
|
544 |
+
start=0.1,
|
545 |
+
stop=1,
|
546 |
+
value=0.5,
|
547 |
+
step=0.1,
|
548 |
+
show_value=True,
|
549 |
+
label="⭐ Weight for Likes",
|
550 |
+
)
|
551 |
+
weight_forks = mo.ui.slider(
|
552 |
+
start=0.1,
|
553 |
+
stop=1,
|
554 |
+
value=0.3,
|
555 |
+
step=0.1,
|
556 |
+
show_value=True,
|
557 |
+
label="↗️ Weight for Forks",
|
558 |
+
)
|
559 |
+
weight_comments = mo.ui.slider(
|
560 |
+
start=0.1,
|
561 |
+
stop=1,
|
562 |
+
value=0.5,
|
563 |
+
step=0.1,
|
564 |
+
show_value=True,
|
565 |
+
label="💬 Weight for Comments",
|
566 |
+
)
|
567 |
+
non_linear_factor = mo.ui.slider(
|
568 |
+
start=1,
|
569 |
+
stop=2,
|
570 |
+
value=1.2,
|
571 |
+
step=0.1,
|
572 |
+
show_value=True,
|
573 |
+
label="🎢 Non-Linear Factor",
|
574 |
+
)
|
575 |
+
return non_linear_factor, weight_comments, weight_forks, weight_likes
|
576 |
+
|
577 |
+
|
578 |
+
@app.cell(hide_code=True)
def _(
    non_linear_factor,
    np,
    numba,
    weight_comments,
    weight_forks,
    weight_likes,
):
    # Snapshot the slider values so the compiled kernel closes over plain
    # Python floats (the cell recompiles whenever a slider changes).
    w_l = weight_likes.value
    w_f = weight_forks.value
    w_c = weight_comments.value
    nlf = non_linear_factor.value


    @numba.guvectorize(
        # One accepted signature: three int64 vectors in, one float64 vector out.
        [(numba.int64[:], numba.int64[:], numba.int64[:], numba.float64[:])],
        "(n), (n), (n) -> (n)",
    )
    def weighted_popularity_numba(
        likes: np.ndarray,
        forks: np.ndarray,
        comments: np.ndarray,
        out: np.ndarray,
    ):
        # gufunc convention: the result is written into the preallocated
        # `out` array instead of being returned.
        for i in range(likes.shape[0]):
            out[i] = (
                w_l * (likes[i] ** nlf)
                + w_f * (forks[i] ** nlf)
                + w_c * (comments[i] ** nlf)
            )
    return nlf, w_c, w_f, w_l, weighted_popularity_numba
|
610 |
+
|
611 |
+
|
612 |
+
@app.cell(hide_code=True)
def _(mo):
    # Narrative cell: explains how the Numba gufunc is applied via map_batches.
    mo.md(r"""We apply our JIT-compiled UDF using `map_batches`, as before. The key is that we're passing entire columns directly to `weighted_popularity_numba`. Polars and Numba handle the conversion to NumPy arrays behind the scenes. This direct integration is a major benefit of using `guvectorize`.""")
    return
|
616 |
+
|
617 |
+
|
618 |
+
@app.cell(hide_code=True)
|
619 |
+
def _(notebook_stats_df, pl, weighted_popularity_numba):
|
620 |
+
notebook_popularity_df = (
|
621 |
+
notebook_stats_df.select(
|
622 |
+
pl.col("notebook_thumbnail_src").alias("thumbnail"),
|
623 |
+
"title",
|
624 |
+
"likes",
|
625 |
+
"forks",
|
626 |
+
"comments",
|
627 |
+
popularity=pl.struct(["likes", "forks", "comments"]).map_batches(
|
628 |
+
lambda obj: weighted_popularity_numba(
|
629 |
+
obj.struct.field("likes"),
|
630 |
+
obj.struct.field("forks"),
|
631 |
+
obj.struct.field("comments"),
|
632 |
+
),
|
633 |
+
return_dtype=pl.Float64,
|
634 |
+
),
|
635 |
+
url="notebook_url",
|
636 |
+
)
|
637 |
+
)
|
638 |
+
return (notebook_popularity_df,)
|
639 |
+
|
640 |
+
|
641 |
+
@app.cell(hide_code=True)
|
642 |
+
def _(mo):
|
643 |
+
mo.callout("💡 Adjust the hyperparameters of the popularity ranking UDF. How do the weights and non-linear factor affect the notebook rankings?")
|
644 |
+
return
|
645 |
+
|
646 |
+
|
647 |
+
@app.cell(hide_code=True)
|
648 |
+
def _(
|
649 |
+
mo,
|
650 |
+
non_linear_factor,
|
651 |
+
notebook_popularity_df,
|
652 |
+
weight_comments,
|
653 |
+
weight_forks,
|
654 |
+
weight_likes,
|
655 |
+
):
|
656 |
+
mo.vstack(
|
657 |
+
[
|
658 |
+
mo.hstack([weight_likes, weight_forks]),
|
659 |
+
mo.hstack([weight_comments, non_linear_factor]),
|
660 |
+
notebook_popularity_df,
|
661 |
+
]
|
662 |
+
)
|
663 |
+
return
|
664 |
+
|
665 |
+
|
666 |
+
@app.cell(hide_code=True)
|
667 |
+
def _(mo):
|
668 |
+
mo.md(r"""As the slope chart below demonstrates, this new ranking strategy significantly changes the notebook order, as it considers forks and comments, not just likes.""")
|
669 |
+
return
|
670 |
+
|
671 |
+
|
672 |
+
@app.cell(hide_code=True)
|
673 |
+
def _(alt, notebook_popularity_df, pl):
|
674 |
+
notebook_ranks_df = (
|
675 |
+
notebook_popularity_df.sort("likes", descending=True)
|
676 |
+
.with_row_index("rank_by_likes")
|
677 |
+
.with_columns(pl.col("rank_by_likes") + 1)
|
678 |
+
.sort("popularity", descending=True)
|
679 |
+
.with_row_index("rank_by_popularity")
|
680 |
+
.with_columns(pl.col("rank_by_popularity") + 1)
|
681 |
+
.select("thumbnail", "title", "rank_by_popularity", "rank_by_likes")
|
682 |
+
.unpivot(
|
683 |
+
["rank_by_popularity", "rank_by_likes"],
|
684 |
+
index="title",
|
685 |
+
variable_name="strategy",
|
686 |
+
value_name="rank",
|
687 |
+
)
|
688 |
+
)
|
689 |
+
|
690 |
+
# Slope chart to visualize rank differences by strategy
|
691 |
+
lines = notebook_ranks_df.plot.line(
|
692 |
+
x="strategy:O",
|
693 |
+
y="rank:Q",
|
694 |
+
color="title:N",
|
695 |
+
)
|
696 |
+
points = notebook_ranks_df.plot.point(
|
697 |
+
x="strategy:O",
|
698 |
+
y="rank:Q",
|
699 |
+
color=alt.Color("title:N", legend=None),
|
700 |
+
fill="title:N",
|
701 |
+
)
|
702 |
+
(points + lines).properties(width=400)
|
703 |
+
return lines, notebook_ranks_df, points
|
704 |
+
|
705 |
+
|
706 |
+
@app.cell(hide_code=True)
|
707 |
+
def _(mo):
|
708 |
+
mo.md(
|
709 |
+
r"""
|
710 |
+
## ⏱️ Quantifying the Overhead
|
711 |
+
|
712 |
+
> UDF Performance Comparison
|
713 |
+
|
714 |
+
To truly understand the performance implications of using UDFs, let's conduct a benchmark. We'll create a DataFrame with random numbers and perform the same numerical operation using four different methods:
|
715 |
+
|
716 |
+
1. **Native Polars:** Using Polars' built-in expressions.
|
717 |
+
2. **`map_elements`:** Applying a Python function element-wise.
|
718 |
+
3. **`map_batches`:** Applying a Python function to the entire Series.
|
719 |
+
4. **`map_batches` with Numba:** Applying a JIT-compiled function to batches, similar to a generalized universal function.
|
720 |
+
|
721 |
+
We'll use a simple, but non-trivial, calculation: `result = (x * 2.5 + 5) / (x + 1)`. This involves multiplication, addition, and division, giving us a realistic representation of a common numerical operation. We'll use the `timeit` module, to accurately measure execution times over multiple trials.
|
722 |
+
"""
|
723 |
+
)
|
724 |
+
return
|
725 |
+
|
726 |
+
|
727 |
+
@app.cell(hide_code=True)
|
728 |
+
def _(mo):
|
729 |
+
mo.callout("💡 Tweak the benchmark parameters to explore how execution times change with different sample sizes and trial counts. Do you notice anything surprising as you decrease the number of samples?")
|
730 |
+
return
|
731 |
+
|
732 |
+
|
733 |
+
@app.cell(hide_code=True)
|
734 |
+
def _(benchmark_plot, mo, num_samples, num_trials):
|
735 |
+
mo.vstack(
|
736 |
+
[
|
737 |
+
mo.hstack([num_samples, num_trials]),
|
738 |
+
mo.md(
|
739 |
+
f"""---
|
740 |
+
Performance comparison over **{num_trials.value:,} trials** with **{num_samples.value:,} samples**.
|
741 |
+
|
742 |
+
> Lower execution times are better.
|
743 |
+
"""
|
744 |
+
),
|
745 |
+
benchmark_plot,
|
746 |
+
]
|
747 |
+
)
|
748 |
+
return
|
749 |
+
|
750 |
+
|
751 |
+
@app.cell(hide_code=True)
|
752 |
+
def _(mo):
|
753 |
+
mo.md(
|
754 |
+
r"""
|
755 |
+
As anticipated, the `Batch-Wise UDF (Python)` and `Element-Wise UDF` exhibit significantly worse performance, essentially acting as pure-Python for-each loops.
|
756 |
+
|
757 |
+
However, when Python serves as an interface to lower-level, high-performance libraries, we observe substantial improvements. The `Batch-Wise UDF (NumPy)` lags behind both `Batch-Wise UDF (Numba)` and `Native Polars`, but it still represents a considerable improvement over pure-Python UDFs due to its vectorized computations.
|
758 |
+
|
759 |
+
Numba's Just-In-Time (JIT) compilation delivers a dramatic performance boost, achieving speeds comparable to native Polars expressions. This demonstrates that UDFs, particularly when combined with tools like Numba, don't inevitably lead to bottlenecks in numerical computations.
|
760 |
+
"""
|
761 |
+
)
|
762 |
+
return
|
763 |
+
|
764 |
+
|
765 |
+
@app.cell(hide_code=True)
|
766 |
+
def _(mo):
|
767 |
+
num_samples = mo.ui.slider(
|
768 |
+
start=1_000,
|
769 |
+
stop=1_000_000,
|
770 |
+
value=250_000,
|
771 |
+
step=1000,
|
772 |
+
show_value=True,
|
773 |
+
debounce=True,
|
774 |
+
label="Number of Samples",
|
775 |
+
)
|
776 |
+
num_trials = mo.ui.slider(
|
777 |
+
start=50,
|
778 |
+
stop=1_000,
|
779 |
+
value=100,
|
780 |
+
step=50,
|
781 |
+
show_value=True,
|
782 |
+
debounce=True,
|
783 |
+
label="Number of Trials",
|
784 |
+
)
|
785 |
+
return num_samples, num_trials
|
786 |
+
|
787 |
+
|
788 |
+
@app.cell(hide_code=True)
|
789 |
+
def _(np, num_samples, pl):
|
790 |
+
rng = np.random.default_rng(42)
|
791 |
+
sample_df = pl.from_dict({"x": rng.random(num_samples.value)})
|
792 |
+
return rng, sample_df
|
793 |
+
|
794 |
+
|
795 |
+
@app.cell(hide_code=True)
def _(np, num_trials, numba, pl, sample_df, timeit):
    # Benchmark candidates: each run_* function computes (x * 2.5 + 5) / (x + 1)
    # over sample_df["x"] using a different execution strategy.

    def run_native():
        # Baseline: pure Polars expressions, fully vectorized.
        sample_df.with_columns(
            result_native=(pl.col("x") * 2.5 + 5) / (pl.col("x") + 1)
        )


    def _calculate_elementwise(x: float) -> float:
        # The shared scalar formula used by the Python-level strategies.
        return (x * 2.5 + 5) / (x + 1)


    def run_map_elements():
        # Python UDF invoked once per row.
        sample_df.with_columns(
            result_map_elements=pl.col("x").map_elements(
                _calculate_elementwise,
                return_dtype=pl.Float64,
            )
        )


    def _calculate_batchwise_numpy(x_series: pl.Series) -> pl.Series:
        # Vectorized NumPy computation over the whole column at once.
        x_array = x_series.to_numpy()
        result_array = (x_array * 2.5 + 5) / (x_array + 1)
        return pl.Series(result_array)


    def run_map_batches_numpy():
        sample_df.with_columns(
            result_map_batches_numpy=pl.col("x").map_batches(
                _calculate_batchwise_numpy,
                return_dtype=pl.Float64,
            )
        )


    def _calculate_batchwise_python(x_series: pl.Series) -> pl.Series:
        # Pure-Python loop over the whole column — still one Python call per item.
        x_array = x_series.to_list()
        result_array = [_calculate_elementwise(x) for x in x_array]
        return pl.Series(result_array)


    def run_map_batches_python():
        sample_df.with_columns(
            result_map_batches_python=pl.col("x").map_batches(
                _calculate_batchwise_python,
                return_dtype=pl.Float64,
            )
        )


    @numba.guvectorize([(numba.float64[:], numba.float64[:])], "(n) -> (n)")
    def _calculate_batchwise_numba(x: np.ndarray, out: np.ndarray):
        # JIT-compiled loop; results go into `out` (gufunc convention).
        for i in range(x.shape[0]):
            out[i] = (x[i] * 2.5 + 5) / (x[i] + 1)


    def run_map_batches_numba():
        sample_df.with_columns(
            result_map_batches_numba=pl.col("x").map_batches(
                _calculate_batchwise_numba,
                return_dtype=pl.Float64,
            )
        )


    def time_method(callable_name: str, number=num_trials.value) -> float:
        # NOTE(review): resolves the runner by name via globals(); this relies
        # on marimo exposing the cell's definitions as module globals — confirm.
        fn = globals()[callable_name]
        return timeit.timeit(fn, number=number)
    return (
        run_map_batches_numba,
        run_map_batches_numpy,
        run_map_batches_python,
        run_map_elements,
        run_native,
        time_method,
    )
|
872 |
+
|
873 |
+
|
874 |
+
@app.cell(hide_code=True)
|
875 |
+
def _(alt, pl, time_method):
|
876 |
+
benchmark_df = pl.from_dicts(
|
877 |
+
[
|
878 |
+
{
|
879 |
+
"title": "Native Polars",
|
880 |
+
"callable_name": "run_native",
|
881 |
+
},
|
882 |
+
{
|
883 |
+
"title": "Element-Wise UDF",
|
884 |
+
"callable_name": "run_map_elements",
|
885 |
+
},
|
886 |
+
{
|
887 |
+
"title": "Batch-Wise UDF (NumPy)",
|
888 |
+
"callable_name": "run_map_batches_numpy",
|
889 |
+
},
|
890 |
+
{
|
891 |
+
"title": "Batch-Wise UDF (Python)",
|
892 |
+
"callable_name": "run_map_batches_python",
|
893 |
+
},
|
894 |
+
{
|
895 |
+
"title": "Batch-Wise UDF (Numba)",
|
896 |
+
"callable_name": "run_map_batches_numba",
|
897 |
+
},
|
898 |
+
]
|
899 |
+
).with_columns(
|
900 |
+
time=pl.col("callable_name").map_elements(
|
901 |
+
time_method, return_dtype=pl.Float64
|
902 |
+
)
|
903 |
+
)
|
904 |
+
|
905 |
+
benchmark_plot = benchmark_df.plot.bar(
|
906 |
+
x=alt.X("title:N", title="Method", sort="-y"),
|
907 |
+
y=alt.Y("time:Q", title="Execution Time (s)", axis=alt.Axis(format=".3f")),
|
908 |
+
).properties(width=400)
|
909 |
+
return benchmark_df, benchmark_plot
|
910 |
+
|
911 |
+
|
912 |
+
@app.cell(hide_code=True)
def _():
    # Standard library
    import asyncio
    import timeit
    from typing import Iterable

    # Third-party
    import altair as alt
    import httpx
    import marimo as mo
    import nest_asyncio
    import numba
    import numpy as np
    from bs4 import BeautifulSoup

    import polars as pl

    # Fixes RuntimeError: asyncio.run() cannot be called from a running event loop
    nest_asyncio.apply()
    return (
        BeautifulSoup,
        Iterable,
        alt,
        asyncio,
        httpx,
        mo,
        nest_asyncio,
        np,
        numba,
        pl,
        timeit,
    )
|
943 |
+
|
944 |
+
|
945 |
+
if __name__ == "__main__":
|
946 |
+
app.run()
|
polars/README.md
CHANGED
@@ -23,3 +23,4 @@ You can also open notebooks in our online playground by appending marimo.app/ to
|
|
23 |
Thanks to all our notebook authors!
|
24 |
|
25 |
* [Koushik Khan](https://github.com/koushikkhan)
|
|
|
|
23 |
Thanks to all our notebook authors!
|
24 |
|
25 |
* [Koushik Khan](https://github.com/koushikkhan)
|
26 |
+
* [Péter Gyarmati](https://github.com/peter-gy)
|
probability/08_bayes_theorem.py
CHANGED
@@ -307,7 +307,7 @@ def _(mo):
|
|
307 |
mo.md(
|
308 |
r"""
|
309 |
|
310 |
-
_This interactive
|
311 |
|
312 |
Bayes theorem provides a convenient way to calculate the probability
|
313 |
of a hypothesis event $H$ given evidence $E$:
|
|
|
307 |
mo.md(
|
308 |
r"""
|
309 |
|
310 |
+
_This interactive example was made with [marimo](https://github.com/marimo-team/marimo/blob/main/examples/misc/bayes_theorem.py), and is [based on an explanation of Bayes' Theorem by Grant Sanderson](https://www.youtube.com/watch?v=HZGCoVF3YvM&list=PLzq7odmtfKQw2KIbQq0rzWrqgifHKkPG1&index=1&t=3s)_.
|
311 |
|
312 |
Bayes theorem provides a convenient way to calculate the probability
|
313 |
of a hypothesis event $H$ given evidence $E$:
|