Spaces:
Sleeping
Sleeping
# /// script | |
# requires-python = ">=3.13" | |
# dependencies = [ | |
# "marimo", | |
# "polars==1.23.0", | |
# ] | |
# /// | |
import marimo | |
__generated_with = "0.11.13" | |
app = marimo.App(width="medium") | |
def _(): | |
import marimo as mo | |
return (mo,) | |
def _(mo): | |
mo.md( | |
r""" | |
# Basic operations on data | |
_By [Joram Mutenge](https://www.udemy.com/user/joram-mutenge/)._ | |
In this notebook, you'll learn how to perform arithmetic operations, comparisons, and conditionals on a Polars dataframe. We'll work with a DataFrame that tracks software usage by year, categorized as either Vintage (old) or Modern (new). | |
""" | |
) | |
return | |
def _(): | |
import polars as pl | |
df = pl.DataFrame( | |
{ | |
"software": [ | |
"Lotus-123", | |
"WordStar", | |
"dBase III", | |
"VisiCalc", | |
"WinZip", | |
"MS-DOS", | |
"HyperCard", | |
"WordPerfect", | |
"Excel", | |
"Photoshop", | |
"Visual Studio", | |
"Slack", | |
"Zoom", | |
"Notion", | |
"Figma", | |
"Spotify", | |
"VSCode", | |
"Docker", | |
], | |
"users": [ | |
10000, | |
4500, | |
2500, | |
3000, | |
1800, | |
17000, | |
2200, | |
1900, | |
500000, | |
12000000, | |
1500000, | |
3000000, | |
4000000, | |
2000000, | |
2500000, | |
4500000, | |
6000000, | |
3500000, | |
], | |
"category": ["Vintage"] * 8 + ["Modern"] * 10, | |
"year": [ | |
1985, | |
1980, | |
1984, | |
1979, | |
1991, | |
1981, | |
1987, | |
1982, | |
1987, | |
1990, | |
1997, | |
2013, | |
2011, | |
2016, | |
2016, | |
2008, | |
2015, | |
2013, | |
], | |
} | |
) | |
df | |
return df, pl | |
def _(mo): | |
mo.md( | |
r""" | |
## Arithmetic | |
### Addition | |
Let's add 42 users to each piece of software. This means adding 42 to each value under **users**. | |
""" | |
) | |
return | |
def _(df, pl): | |
df.with_columns(pl.col("users") + 42) | |
return | |
def _(mo): | |
mo.md(r"""Another way to perform the above operation is using the built-in function.""") | |
return | |
def _(df, pl): | |
df.with_columns(pl.col("users").add(42)) | |
return | |
def _(mo): | |
mo.md( | |
r""" | |
### Subtraction | |
Let's subtract 42 users to each piece of software. | |
""" | |
) | |
return | |
def _(df, pl): | |
df.with_columns(pl.col("users") - 42) | |
return | |
def _(mo): | |
mo.md(r"""Alternatively, you could subtract like this:""") | |
return | |
def _(df, pl): | |
df.with_columns(pl.col("users").sub(42)) | |
return | |
def _(mo): | |
mo.md( | |
r""" | |
### Division | |
Suppose the **users** values are inflated, we can reduce them by dividing by 1000. Here's how to do it. | |
""" | |
) | |
return | |
def _(df, pl): | |
df.with_columns(pl.col("users") / 1000) | |
return | |
def _(mo): | |
mo.md(r"""Or we could do it with a built-in expression.""") | |
return | |
def _(df, pl): | |
df.with_columns(pl.col("users").truediv(1000)) | |
return | |
def _(mo): | |
mo.md(r"""If we didn't care about the remainder after division (i.e remove numbers after decimal point) we could do it like this.""") | |
return | |
def _(df, pl): | |
df.with_columns(pl.col("users").floordiv(1000)) | |
return | |
def _(mo): | |
mo.md( | |
r""" | |
### Multiplication | |
Let's pretend the *user* values are deflated and increase them by multiplying by 100. | |
""" | |
) | |
return | |
def _(df, pl): | |
(df.with_columns(pl.col("users") * 100)) | |
return | |
def _(mo): | |
mo.md(r"""Polars also has a built-in function for multiplication.""") | |
return | |
def _(df, pl): | |
df.with_columns(pl.col("users").mul(100)) | |
return | |
def _(mo): | |
mo.md(r"""So far, we've only modified the values in an existing column. Let's create a column **decade** that will represent the years as decades. Thus 1985 will be 1980 and 2008 will be 2000.""") | |
return | |
def _(df, pl): | |
(df.with_columns(decade=pl.col("year").floordiv(10).mul(10))) | |
return | |
def _(mo): | |
mo.md(r"""We could create a new column another way as follows:""") | |
return | |
def _(df, pl): | |
df.with_columns((pl.col("year").floordiv(10).mul(10)).alias("decade")) | |
return | |
def _(mo): | |
mo.md( | |
r""" | |
**Tip** | |
Polars encounrages you to perform your operations as a chain. This enables you to take advantage of the query optimizer. We'll build upon the above code as a chain. | |
## Comparison | |
### Equal | |
Let's get all the software categorized as Vintage. | |
""" | |
) | |
return | |
def _(df, pl): | |
( | |
df.with_columns(decade=pl.col("year").floordiv(10).mul(10)) | |
.filter(pl.col("category") == "Vintage") | |
) | |
return | |
def _(mo): | |
mo.md(r"""We could also do a double comparison. VisiCal is the only software that's vintage and in the decade 1970s. Let's perform this comparison operation.""") | |
return | |
def _(df, pl): | |
( | |
df.with_columns(decade=pl.col("year").floordiv(10).mul(10)) | |
.filter(pl.col("category") == "Vintage") | |
.filter(pl.col("decade") == 1970) | |
) | |
return | |
def _(mo): | |
mo.md( | |
r""" | |
We could also do this comparison in one line, if readability is not a concern | |
**Notice** that we must enclose the two expressions between the `&` with parenthesis. | |
""" | |
) | |
return | |
def _(df, pl): | |
( | |
df.with_columns(decade=pl.col("year").floordiv(10).mul(10)) | |
.filter((pl.col("category") == "Vintage") & (pl.col("decade") == 1970)) | |
) | |
return | |
def _(mo): | |
mo.md(r"""We can also use the built-in function for equal to comparisons.""") | |
return | |
def _(df, pl): | |
(df | |
.with_columns(decade=pl.col('year').floordiv(10).mul(10)) | |
.filter(pl.col('category').eq('Vintage')) | |
) | |
return | |
def _(mo): | |
mo.md( | |
r""" | |
### Not equal | |
We can also compare if something is `not` equal to something. In this case, category is not vintage. | |
""" | |
) | |
return | |
def _(df, pl): | |
(df | |
.with_columns(decade=pl.col('year').floordiv(10).mul(10)) | |
.filter(pl.col('category') != 'Vintage') | |
) | |
return | |
def _(mo): | |
mo.md(r"""Or with the built-in function.""") | |
return | |
def _(df, pl): | |
(df | |
.with_columns(decade=pl.col('year').floordiv(10).mul(10)) | |
.filter(pl.col('category').ne('Vintage')) | |
) | |
return | |
def _(mo): | |
mo.md(r"""Or if you want to be extra clever, you can use the negation symbol `~` used in logic.""") | |
return | |
def _(df, pl): | |
(df | |
.with_columns(decade=pl.col('year').floordiv(10).mul(10)) | |
.filter(~pl.col('category').eq('Vintage')) | |
) | |
return | |
def _(mo): | |
mo.md( | |
r""" | |
### Greater than | |
Let's get the software where the year is greater than 2008 from the above dataframe. | |
""" | |
) | |
return | |
def _(df, pl): | |
(df | |
.with_columns(decade=pl.col('year').floordiv(10).mul(10)) | |
.filter(~pl.col('category').eq('Vintage')) | |
.filter(pl.col('year') > 2008) | |
) | |
return | |
def _(mo): | |
mo.md(r"""Or if we wanted the year 2008 to be included, we could use great or equal to.""") | |
return | |
def _(df, pl): | |
(df | |
.with_columns(decade=pl.col('year').floordiv(10).mul(10)) | |
.filter(~pl.col('category').eq('Vintage')) | |
.filter(pl.col('year') >= 2008) | |
) | |
return | |
def _(mo): | |
mo.md(r"""We could do the previous two operations with built-in functions. Here's with greater than.""") | |
return | |
def _(df, pl): | |
(df | |
.with_columns(decade=pl.col('year').floordiv(10).mul(10)) | |
.filter(~pl.col('category').eq('Vintage')) | |
.filter(pl.col('year').gt(2008)) | |
) | |
return | |
def _(mo): | |
mo.md(r"""And here's with greater or equal to""") | |
return | |
def _(df, pl): | |
(df | |
.with_columns(decade=pl.col('year').floordiv(10).mul(10)) | |
.filter(~pl.col('category').eq('Vintage')) | |
.filter(pl.col('year').ge(2008)) | |
) | |
return | |
def _(mo): | |
mo.md( | |
r""" | |
**Note**: For "less than", and "less or equal to" you can use the operators `<` or `<=`. Alternatively, you can use built-in functions `lt` or `le` respectively. | |
### Is between | |
Polars also allows us to filter between a range of values. Let's get the modern software were the year is between 2013 and 2016. This is inclusive on both ends (i.e. both years are part of the result). | |
""" | |
) | |
return | |
def _(df, pl): | |
(df | |
.with_columns(decade=pl.col('year').floordiv(10).mul(10)) | |
.filter(pl.col('category').eq('Modern')) | |
.filter(pl.col('year').is_between(2013, 2016)) | |
) | |
return | |
def _(mo): | |
mo.md( | |
r""" | |
### Or operator | |
If we only want either one of the conditions in the comparison to be met, we could use `|`, which is the `or` operator. | |
Let's get software that is either modern or used in the decade 1980s. | |
""" | |
) | |
return | |
def _(df, pl): | |
(df | |
.with_columns(decade=pl.col('year').floordiv(10).mul(10)) | |
.filter((pl.col('category') == 'Modern') | (pl.col('decade') == 1980)) | |
) | |
return | |
def _(mo): | |
mo.md( | |
r""" | |
## Conditionals | |
Polars also allows you create new columns based on a condition. Let's create a column *status* that will indicate if the software is "discontinued" or "in use". | |
Here's a list of products that are no longer in use. | |
""" | |
) | |
return | |
def _(): | |
discontinued_list = ['Lotus-123', 'WordStar', 'dBase III', 'VisiCalc', 'MS-DOS', 'HyperCard'] | |
return (discontinued_list,) | |
def _(mo): | |
mo.md(r"""Here's how we can get a dataframe of the products that are discontinued.""") | |
return | |
def _(df, discontinued_list, pl): | |
(df | |
.with_columns(decade=pl.col('year').floordiv(10).mul(10)) | |
.filter(pl.col('software').is_in(discontinued_list)) | |
) | |
return | |
def _(mo): | |
mo.md(r"""Now, let's create the **status** column.""") | |
return | |
def _(df, discontinued_list, pl): | |
(df | |
.with_columns(decade=pl.col('year').floordiv(10).mul(10)) | |
.with_columns(pl.when(pl.col('software').is_in(discontinued_list)) | |
.then(pl.lit('Discontinued')) | |
.otherwise(pl.lit('In use')) | |
.alias('status') | |
) | |
) | |
return | |
def _(mo): | |
mo.md( | |
r""" | |
## Unique counts | |
Sometimes you may want to see only the unique values in a column. Let's check the unique decades we have in our DataFrame. | |
""" | |
) | |
return | |
def _(df, discontinued_list, pl): | |
(df | |
.with_columns(decade=pl.col('year').floordiv(10).mul(10)) | |
.with_columns(pl.when(pl.col('software').is_in(discontinued_list)) | |
.then(pl.lit('Discontinued')) | |
.otherwise(pl.lit('In use')) | |
.alias('status') | |
) | |
.select('decade').unique() | |
) | |
return | |
def _(mo): | |
mo.md(r"""Finally, let's find out the number of software used in each decade.""") | |
return | |
def _(df, discontinued_list, pl): | |
(df | |
.with_columns(decade=pl.col('year').floordiv(10).mul(10)) | |
.with_columns(pl.when(pl.col('software').is_in(discontinued_list)) | |
.then(pl.lit('Discontinued')) | |
.otherwise(pl.lit('In use')) | |
.alias('status') | |
) | |
['decade'].value_counts() | |
) | |
return | |
def _(mo): | |
mo.md(r"""We could also rewrite the above code as follows:""") | |
return | |
def _(df, discontinued_list, pl): | |
(df | |
.with_columns(decade=pl.col('year').floordiv(10).mul(10)) | |
.with_columns(pl.when(pl.col('software').is_in(discontinued_list)) | |
.then(pl.lit('Discontinued')) | |
.otherwise(pl.lit('In use')) | |
.alias('status') | |
) | |
.select('decade').to_series().value_counts() | |
) | |
return | |
def _(mo): | |
mo.md(r"""Hopefully, we've picked your interest to try out Polars the next time you analyze your data.""") | |
return | |
def _(): | |
return | |
if __name__ == "__main__": | |
app.run() | |