# marimo-learn / polars / 04_basic_operations.py
# Joram Mutenge
# run notebook in sandbox
# f57b8b5
# raw
# history blame
# 13.8 kB
# /// script
# requires-python = ">=3.13"
# dependencies = [
# "marimo",
# "polars==1.23.0",
# ]
# ///
import marimo
# Presumably the marimo version that generated this notebook file - TODO confirm.
__generated_with = "0.11.13"
# Notebook application object; every cell below registers itself through `@app.cell`.
app = marimo.App(width="medium")
@app.cell
def _():
    # Imported inside a cell so that `mo` can be handed to the cells that
    # declare it as a parameter.
    import marimo as mo

    return (mo,)
# Markdown cell: notebook title, byline, and introduction.
# Blank lines separate the title, byline, and intro so they render as
# distinct paragraphs instead of running together.
@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
        # Basic operations on data

        _By [Joram Mutenge](https://www.udemy.com/user/joram-mutenge/)._

        In this notebook, you'll learn how to perform arithmetic operations, comparisons, and conditionals on a Polars dataframe. We'll work with a DataFrame that tracks software usage by year, categorized as either Vintage (old) or Modern (new).
        """
    )
    return
@app.cell
def _():
    import polars as pl

    # One list per column; position i in every list describes the same product.
    # The underscore-prefixed names stay private to this cell.
    _software = [
        "Lotus-123", "WordStar", "dBase III", "VisiCalc", "WinZip", "MS-DOS",
        "HyperCard", "WordPerfect", "Excel", "Photoshop", "Visual Studio",
        "Slack", "Zoom", "Notion", "Figma", "Spotify", "VSCode", "Docker",
    ]
    _users = [
        10_000, 4_500, 2_500, 3_000, 1_800, 17_000,
        2_200, 1_900, 500_000, 12_000_000, 1_500_000,
        3_000_000, 4_000_000, 2_000_000, 2_500_000, 4_500_000, 6_000_000, 3_500_000,
    ]
    # The first 8 products are labelled "Vintage", the remaining 10 "Modern".
    _category = ["Vintage"] * 8 + ["Modern"] * 10
    _year = [
        1985, 1980, 1984, 1979, 1991, 1981,
        1987, 1982, 1987, 1990, 1997,
        2013, 2011, 2016, 2016, 2008, 2015, 2013,
    ]

    df = pl.DataFrame(
        {
            "software": _software,
            "users": _users,
            "category": _category,
            "year": _year,
        }
    )
    df
    return df, pl
# Markdown cell: opens the "Arithmetic" section and introduces addition.
@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
        ## Arithmetic
        ### Addition
        Let's add 42 users to each piece of software. This means adding 42 to each value under **users**.
        """
    )
    return
@app.cell
def _(df, pl):
    # Operator form: `+` adds 42 to every value in the "users" column.
    _users_plus_42 = pl.col("users") + 42
    df.with_columns(_users_plus_42)
    return
# Markdown cell: introduces the method-call alternative to `+`.
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""Another way to perform the above operation is using the built-in function.""")
    return
@app.cell
def _(df, pl):
    # Method form of the addition above: `.add(42)`.
    _users_plus_42 = pl.col("users").add(42)
    df.with_columns(_users_plus_42)
    return
# Markdown cell: introduces subtraction.
# Wording fixed: one subtracts *from* a value, not *to* it.
@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
        ### Subtraction
        Let's subtract 42 users from each piece of software.
        """
    )
    return
@app.cell
def _(df, pl):
    # Operator form: `-` subtracts 42 from every value in the "users" column.
    _users_minus_42 = pl.col("users") - 42
    df.with_columns(_users_minus_42)
    return
# Markdown cell: introduces the method-call alternative to `-`.
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""Alternatively, you could subtract like this:""")
    return
@app.cell
def _(df, pl):
    # Method form of the subtraction above: `.sub(42)`.
    _users_minus_42 = pl.col("users").sub(42)
    df.with_columns(_users_minus_42)
    return
# Markdown cell: introduces division.
# The original sentence was a comma splice; a semicolon now joins the clauses.
@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
        ### Division
        Suppose the **users** values are inflated; we can reduce them by dividing by 1000. Here's how to do it.
        """
    )
    return
@app.cell
def _(df, pl):
    # Operator form: `/` divides every "users" value by 1000.
    _users_in_thousands = pl.col("users") / 1000
    df.with_columns(_users_in_thousands)
    return
# Markdown cell: introduces the method-call alternative to `/`.
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""Or we could do it with a built-in expression.""")
    return
@app.cell
def _(df, pl):
    # Method form of the division above: `.truediv(1000)`.
    _users_in_thousands = pl.col("users").truediv(1000)
    df.with_columns(_users_in_thousands)
    return
# Markdown cell: introduces floor division.
# Punctuation fixed: "i.e" -> "i.e.," plus the comma after the parenthetical.
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""If we didn't care about the remainder after division (i.e., remove the numbers after the decimal point), we could do it like this.""")
    return
@app.cell
def _(df, pl):
    # Floor division: `.floordiv(1000)` divides "users" by 1000 and drops the remainder.
    _users_whole_thousands = pl.col("users").floordiv(1000)
    df.with_columns(_users_whole_thousands)
    return
# Markdown cell: introduces multiplication.
# The column is named "users" and is written **users** in the other sections,
# so the text now refers to it the same way.
@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
        ### Multiplication
        Let's pretend the **users** values are deflated and increase them by multiplying by 100.
        """
    )
    return
@app.cell
def _(df, pl):
    # Operator form: `*` multiplies every "users" value by 100.
    _users_times_100 = pl.col("users") * 100
    df.with_columns(_users_times_100)
    return
# Markdown cell: introduces the method-call alternative to `*`.
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""Polars also has a built-in function for multiplication.""")
    return
@app.cell
def _(df, pl):
    # Method form of the multiplication above: `.mul(100)`.
    _users_times_100 = pl.col("users").mul(100)
    df.with_columns(_users_times_100)
    return
# Markdown cell: motivates deriving a new "decade" column from "year".
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""So far, we've only modified the values in an existing column. Let's create a column **decade** that will represent the years as decades. Thus 1985 will be 1980 and 2008 will be 2000.""")
    return
@app.cell
def _(df, pl):
    # Floor-divide the year by 10, then multiply by 10, to get the decade
    # (1985 -> 1980). The keyword argument names the new column.
    _decade = pl.col("year").floordiv(10).mul(10)
    df.with_columns(decade=_decade)
    return
# Markdown cell: introduces `.alias(...)` as another way to name a new column.
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""We could create a new column another way as follows:""")
    return
@app.cell
def _(df, pl):
    # Same decade computation, but the column name comes from `.alias("decade")`
    # instead of a keyword argument.
    _decade = pl.col("year").floordiv(10).mul(10).alias("decade")
    df.with_columns(_decade)
    return
# Markdown cell: chaining tip, then opens the "Comparison" section.
# Fixes the typo "encounrages" and adds blank lines so the tip label,
# the tip text, and the headings render as separate blocks.
@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
        **Tip**

        Polars encourages you to perform your operations as a chain. This enables you to take advantage of the query optimizer. We'll build upon the above code as a chain.

        ## Comparison

        ### Equal

        Let's get all the software categorized as Vintage.
        """
    )
    return
@app.cell
def _(df, pl):
    # Add the decade column, then keep only the rows whose category is "Vintage".
    _decade = pl.col("year").floordiv(10).mul(10)
    _is_vintage = pl.col("category") == "Vintage"
    df.with_columns(decade=_decade).filter(_is_vintage)
    return
# Markdown cell: introduces filtering on two conditions.
# Product name corrected to "VisiCalc", matching the spelling in the data.
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""We could also do a double comparison. VisiCalc is the only software that's vintage and in the decade 1970s. Let's perform this comparison operation.""")
    return
@app.cell
def _(df, pl):
    # Two chained filters: a row must pass both to survive.
    _decade = pl.col("year").floordiv(10).mul(10)
    _is_vintage = pl.col("category") == "Vintage"
    _in_1970s = pl.col("decade") == 1970
    df.with_columns(decade=_decade).filter(_is_vintage).filter(_in_1970s)
    return
# Markdown cell: introduces combining two conditions with `&`.
# Adds the missing period, separates the two paragraphs, and uses the
# plural "parentheses".
@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
        We could also do this comparison in one line, if readability is not a concern.

        **Notice** that we must enclose each of the two expressions on either side of the `&` in parentheses.
        """
    )
    return
@app.cell
def _(df, pl):
    _decade = pl.col("year").floordiv(10).mul(10)
    # Both conditions in a single filter; each comparison is wrapped in its
    # own parentheses before being combined with `&`.
    df.with_columns(decade=_decade).filter(
        (pl.col("category") == "Vintage") & (pl.col("decade") == 1970)
    )
    return
# Markdown cell: introduces the method-call alternative to `==`.
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""We can also use the built-in function for equal to comparisons.""")
    return
@app.cell
def _(df, pl):
    # Method form of the equality filter: `.eq("Vintage")`.
    _decade = pl.col("year").floordiv(10).mul(10)
    _is_vintage = pl.col("category").eq("Vintage")
    df.with_columns(decade=_decade).filter(_is_vintage)
    return
# Markdown cell: opens the "Not equal" subsection.
@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
        ### Not equal
        We can also compare if something is `not` equal to something. In this case, category is not vintage.
        """
    )
    return
@app.cell
def _(df, pl):
    # Operator form: `!=` keeps the rows whose category is not "Vintage".
    _decade = pl.col("year").floordiv(10).mul(10)
    _not_vintage = pl.col("category") != "Vintage"
    df.with_columns(decade=_decade).filter(_not_vintage)
    return
# Markdown cell: introduces the method-call alternative to `!=`.
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""Or with the built-in function.""")
    return
@app.cell
def _(df, pl):
    # Method form of the inequality filter: `.ne("Vintage")`.
    _decade = pl.col("year").floordiv(10).mul(10)
    _not_vintage = pl.col("category").ne("Vintage")
    df.with_columns(decade=_decade).filter(_not_vintage)
    return
# Markdown cell: introduces negating a condition with `~`.
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""Or if you want to be extra clever, you can use the negation symbol `~` used in logic.""")
    return
@app.cell
def _(df, pl):
    # `~` negates the whole `.eq("Vintage")` expression.
    _decade = pl.col("year").floordiv(10).mul(10)
    _not_vintage = ~pl.col("category").eq("Vintage")
    df.with_columns(decade=_decade).filter(_not_vintage)
    return
# Markdown cell: opens the "Greater than" subsection.
@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
        ### Greater than
        Let's get the software where the year is greater than 2008 from the above dataframe.
        """
    )
    return
@app.cell
def _(df, pl):
    # Drop the vintage rows, then keep years strictly after 2008 (operator form `>`).
    _decade = pl.col("year").floordiv(10).mul(10)
    _not_vintage = ~pl.col("category").eq("Vintage")
    _after_2008 = pl.col("year") > 2008
    df.with_columns(decade=_decade).filter(_not_vintage).filter(_after_2008)
    return
# Markdown cell: introduces `>=`.
# Wording fixed: "great or equal to" -> "greater than or equal to".
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""Or if we wanted the year 2008 to be included, we could use greater than or equal to.""")
    return
@app.cell
def _(df, pl):
    # Same as the previous filter, but `>=` also keeps the year 2008 itself.
    _decade = pl.col("year").floordiv(10).mul(10)
    _not_vintage = ~pl.col("category").eq("Vintage")
    _from_2008 = pl.col("year") >= 2008
    df.with_columns(decade=_decade).filter(_not_vintage).filter(_from_2008)
    return
# Markdown cell: introduces the method-call alternatives to `>` and `>=`.
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""We could do the previous two operations with built-in functions. Here's with greater than.""")
    return
@app.cell
def _(df, pl):
    # Method form of the strict comparison: `.gt(2008)`.
    _decade = pl.col("year").floordiv(10).mul(10)
    _not_vintage = ~pl.col("category").eq("Vintage")
    _after_2008 = pl.col("year").gt(2008)
    df.with_columns(decade=_decade).filter(_not_vintage).filter(_after_2008)
    return
# Markdown cell: introduces `.ge(...)`.
# Adds the missing final period and completes the phrase
# "greater than or equal to".
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""And here's with greater than or equal to.""")
    return
@app.cell
def _(df, pl):
    # Method form of the inclusive comparison: `.ge(2008)`.
    _decade = pl.col("year").floordiv(10).mul(10)
    _not_vintage = ~pl.col("category").eq("Vintage")
    _from_2008 = pl.col("year").ge(2008)
    df.with_columns(decade=_decade).filter(_not_vintage).filter(_from_2008)
    return
# Markdown cell: note on `<` / `<=`, then opens the "Is between" subsection.
# Fixes the typo "were" -> "where" and adds blank lines so the note,
# the heading, and the paragraph render as separate blocks.
@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
        **Note**: For "less than", and "less or equal to" you can use the operators `<` or `<=`. Alternatively, you can use built-in functions `lt` or `le` respectively.

        ### Is between

        Polars also allows us to filter between a range of values. Let's get the modern software where the year is between 2013 and 2016. This is inclusive on both ends (i.e. both years are part of the result).
        """
    )
    return
@app.cell
def _(df, pl):
    # Keep the modern rows, then keep years in the 2013-2016 range.
    _decade = pl.col("year").floordiv(10).mul(10)
    _is_modern = pl.col("category").eq("Modern")
    _from_2013_to_2016 = pl.col("year").is_between(2013, 2016)
    df.with_columns(decade=_decade).filter(_is_modern).filter(_from_2013_to_2016)
    return
# Markdown cell: opens the "Or operator" subsection.
@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
        ### Or operator
        If we only want either one of the conditions in the comparison to be met, we could use `|`, which is the `or` operator.
        Let's get software that is either modern or used in the decade 1980s.
        """
    )
    return
@app.cell
def _(df, pl):
    _decade = pl.col("year").floordiv(10).mul(10)
    # A row is kept when either parenthesised condition holds.
    df.with_columns(decade=_decade).filter(
        (pl.col("category") == "Modern") | (pl.col("decade") == 1980)
    )
    return
# Markdown cell: opens the "Conditionals" section.
# Grammar fixed ("allows you create" -> "allows you to create") and the
# lead-in sentence for the list is split into its own paragraph.
@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
        ## Conditionals

        Polars also allows you to create new columns based on a condition. Let's create a column *status* that will indicate if the software is "discontinued" or "in use".

        Here's a list of products that are no longer in use.
        """
    )
    return
@app.cell
def _():
    # Product names that the later cells treat as discontinued.
    discontinued_list = [
        "Lotus-123",
        "WordStar",
        "dBase III",
        "VisiCalc",
        "MS-DOS",
        "HyperCard",
    ]
    return (discontinued_list,)
# Markdown cell: introduces filtering by membership in a list.
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""Here's how we can get a dataframe of the products that are discontinued.""")
    return
@app.cell
def _(df, discontinued_list, pl):
    # Keep only the rows whose software name appears in `discontinued_list`.
    _decade = pl.col("year").floordiv(10).mul(10)
    _is_discontinued = pl.col("software").is_in(discontinued_list)
    df.with_columns(decade=_decade).filter(_is_discontinued)
    return
# Markdown cell: introduces the conditional "status" column.
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""Now, let's create the **status** column.""")
    return
@app.cell
def _(df, discontinued_list, pl):
    _decade = pl.col("year").floordiv(10).mul(10)
    # when/then/otherwise: "Discontinued" for listed products, "In use" for the rest.
    _status = (
        pl.when(pl.col("software").is_in(discontinued_list))
        .then(pl.lit("Discontinued"))
        .otherwise(pl.lit("In use"))
        .alias("status")
    )
    df.with_columns(decade=_decade).with_columns(_status)
    return
# Markdown cell: opens the "Unique counts" section.
@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
        ## Unique counts
        Sometimes you may want to see only the unique values in a column. Let's check the unique decades we have in our DataFrame.
        """
    )
    return
@app.cell
def _(df, discontinued_list, pl):
    _decade = pl.col("year").floordiv(10).mul(10)
    _status = (
        pl.when(pl.col("software").is_in(discontinued_list))
        .then(pl.lit("Discontinued"))
        .otherwise(pl.lit("In use"))
        .alias("status")
    )
    # Extend the chain built so far: select "decade" and keep its distinct values.
    (
        df.with_columns(decade=_decade)
        .with_columns(_status)
        .select("decade")
        .unique()
    )
    return
# Markdown cell: introduces counting rows per decade.
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""Finally, let's find out the number of software used in each decade.""")
    return
@app.cell
def _(df, discontinued_list, pl):
    _decade = pl.col("year").floordiv(10).mul(10)
    _status = (
        pl.when(pl.col("software").is_in(discontinued_list))
        .then(pl.lit("Discontinued"))
        .otherwise(pl.lit("In use"))
        .alias("status")
    )
    _with_status = df.with_columns(decade=_decade).with_columns(_status)
    # Indexing with ["decade"] pulls out that column, which is then counted
    # with `.value_counts()`.
    _with_status["decade"].value_counts()
    return
# Markdown cell: introduces the `select(...).to_series()` spelling of the count.
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""We could also rewrite the above code as follows:""")
    return
@app.cell
def _(df, discontinued_list, pl):
    _decade = pl.col("year").floordiv(10).mul(10)
    _status = (
        pl.when(pl.col("software").is_in(discontinued_list))
        .then(pl.lit("Discontinued"))
        .otherwise(pl.lit("In use"))
        .alias("status")
    )
    # Same count as the previous cell, reached via select(...) and to_series()
    # instead of indexing with ["decade"].
    (
        df.with_columns(decade=_decade)
        .with_columns(_status)
        .select("decade")
        .to_series()
        .value_counts()
    )
    return
# Markdown cell: closing remark.
# Idiom fixed: interest is "piqued", not "picked".
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""Hopefully, we've piqued your interest to try out Polars the next time you analyze your data.""")
    return
# Trailing empty cell: no inputs, no body, nothing exported.
@app.cell
def _():
    return
# Call `app.run()` only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    app.run()