Spaces:

marimo-team
/

marimo-learn

Running

App Files Files Community

Joram Mutenge committed on Mar 1

Commit

4c17152

1 Parent(s): cbef791

notebook on basic operations in polars

Browse files

Files changed (1) hide show

polars/04_basic_operations.py +623 -0

polars/04_basic_operations.py ADDED Viewed

	@@ -0,0 +1,623 @@

+import marimo
+# Version string stored with the notebook (presumably the marimo release that wrote this file — confirm).
+__generated_with = "0.11.13"
+# App object that every `@app.cell` function below is registered on; created with width="medium".
+app = marimo.App(width="medium")
+@app.cell
+def _():
+    import marimo as mo
+    return (mo,)
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(
+        r"""
+        # Basic operations on data
+        _By [Joram Mutenge](https://www.udemy.com/user/joram-mutenge/)._
+        In this notebook, you'll learn how to perform arithmetic operations, comparisons, and conditionals on a Polars dataframe. We'll work with a DataFrame that tracks software usage by year, categorized as either Vintage (old) or Modern (new).
+        """
+    )
+    return
+@app.cell
+def _():
+    import polars as pl
+    df = pl.DataFrame(
+        {
+            "software": [
+                "Lotus-123",
+                "WordStar",
+                "dBase III",
+                "VisiCalc",
+                "WinZip",
+                "MS-DOS",
+                "HyperCard",
+                "WordPerfect",
+                "Excel",
+                "Photoshop",
+                "Visual Studio",
+                "Slack",
+                "Zoom",
+                "Notion",
+                "Figma",
+                "Spotify",
+                "VSCode",
+                "Docker",
+            ],
+            "users": [
+                10000,
+                4500,
+                2500,
+                3000,
+                1800,
+                17000,
+                2200,
+                1900,
+                500000,
+                12000000,
+                1500000,
+                3000000,
+                4000000,
+                2000000,
+                2500000,
+                4500000,
+                6000000,
+                3500000,
+            ],
+            "category": ["Vintage"] * 8 + ["Modern"] * 10,
+            "year": [
+                1985,
+                1980,
+                1984,
+                1979,
+                1991,
+                1981,
+                1987,
+                1982,
+                1987,
+                1990,
+                1997,
+                2013,
+                2011,
+                2016,
+                2016,
+                2008,
+                2015,
+                2013,
+            ],
+        }
+    )
+    df
+    return df, pl
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(
+        r"""
+        ## Arithmetic
+        ### Addition
+        Let's add 42 users to each piece of software. This means adding 42 to each value under **users**.
+        """
+    )
+    return
+@app.cell
+def _(df, pl):
+    df.with_columns(pl.col("users") + 42)
+    return
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""Another way to perform the above operation is using the built-in function.""")
+    return
+@app.cell
+def _(df, pl):
+    df.with_columns(pl.col("users").add(42))
+    return
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(
+        r"""
+        ### Subtraction
+        Let's subtract 42 users to each piece of software.
+        """
+    )
+    return
+@app.cell
+def _(df, pl):
+    df.with_columns(pl.col("users") - 42)
+    return
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""Alternatively, you could subtract like this:""")
+    return
+@app.cell
+def _(df, pl):
+    df.with_columns(pl.col("users").sub(42))
+    return
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(
+        r"""
+        ### Division
+        Suppose the **users** values are inflated, we can reduce them by dividing by 1000. Here's how to do it.
+        """
+    )
+    return
+@app.cell
+def _(df, pl):
+    df.with_columns(pl.col("users") / 1000)
+    return
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""Or we could do it with a built-in expression.""")
+    return
+@app.cell
+def _(df, pl):
+    df.with_columns(pl.col("users").truediv(1000))
+    return
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""If we didn't care about the remainder after division (i.e remove numbers after decimal point) we could do it like this.""")
+    return
+@app.cell
+def _(df, pl):
+    df.with_columns(pl.col("users").floordiv(1000))
+    return
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(
+        r"""
+        ### Multiplication
+        Let's pretend the *user* values are deflated and increase them by multiplying by 100.
+        """
+    )
+    return
+@app.cell
+def _(df, pl):
+    (df.with_columns(pl.col("users") * 100))
+    return
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""Polars also has a built-in function for multiplication.""")
+    return
+@app.cell
+def _(df, pl):
+    df.with_columns(pl.col("users").mul(100))
+    return
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""So far, we've only modified the values in an existing column. Let's create a column **decade** that will represent the years as decades. Thus 1985 will be 1980 and 2008 will be 2000.""")
+    return
+@app.cell
+def _(df, pl):
+    (df.with_columns(decade=pl.col("year").floordiv(10).mul(10)))
+    return
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""We could create a new column another way as follows:""")
+    return
+@app.cell
+def _(df, pl):
+    df.with_columns((pl.col("year").floordiv(10).mul(10)).alias("decade"))
+    return
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(
+        r"""
+        **Tip**
+        Polars encounrages you to perform your operations as a chain. This enables you to take advantage of the query optimizer. We'll build upon the above code as a chain.
+        ## Comparison
+        ### Equal
+        Let's get all the software categorized as Vintage.
+        """
+    )
+    return
+@app.cell
+def _(df, pl):
+    (
+        df.with_columns(decade=pl.col("year").floordiv(10).mul(10))
+        .filter(pl.col("category") == "Vintage")
+    )
+    return
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""We could also do a double comparison. VisiCal is the only software that's vintage and in the decade 1970s. Let's perform this comparison operation.""")
+    return
+@app.cell
+def _(df, pl):
+    (
+        df.with_columns(decade=pl.col("year").floordiv(10).mul(10))
+        .filter(pl.col("category") == "Vintage")
+        .filter(pl.col("decade") == 1970)
+    )
+    return
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(
+        r"""
+        We could also do this comparison in one line, if readability is not a concern
+        **Notice** that we must enclose the two expressions between the `&` with parenthesis.
+        """
+    )
+    return
+@app.cell
+def _(df, pl):
+    (
+        df.with_columns(decade=pl.col("year").floordiv(10).mul(10))
+        .filter((pl.col("category") == "Vintage") & (pl.col("decade") == 1970))
+    )
+    return
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""We can also use the built-in function for equal to comparisons.""")
+    return
+@app.cell
+def _(df, pl):
+    (df
+     .with_columns(decade=pl.col('year').floordiv(10).mul(10))
+     .filter(pl.col('category').eq('Vintage'))
+     )
+    return
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(
+        r"""
+        ### Not equal
+        We can also compare if something is `not` equal to something. In this case, category is not vintage.
+        """
+    )
+    return
+@app.cell
+def _(df, pl):
+    (df
+     .with_columns(decade=pl.col('year').floordiv(10).mul(10))
+     .filter(pl.col('category') != 'Vintage')
+     )
+    return
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""Or with the built-in function.""")
+    return
+@app.cell
+def _(df, pl):
+    (df
+     .with_columns(decade=pl.col('year').floordiv(10).mul(10))
+     .filter(pl.col('category').ne('Vintage'))
+     )
+    return
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""Or if you want to be extra clever, you can use the negation symbol `~` used in logic.""")
+    return
+@app.cell
+def _(df, pl):
+    (df
+     .with_columns(decade=pl.col('year').floordiv(10).mul(10))
+     .filter(~pl.col('category').eq('Vintage'))
+     )
+    return
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(
+        r"""
+        ### Greater than
+        Let's get the software where the year is greater than 2008 from the above dataframe.
+        """
+    )
+    return
+@app.cell
+def _(df, pl):
+    (df
+     .with_columns(decade=pl.col('year').floordiv(10).mul(10))
+     .filter(~pl.col('category').eq('Vintage'))
+     .filter(pl.col('year') > 2008)
+     )
+    return
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""Or if we wanted the year 2008 to be included, we could use great or equal to.""")
+    return
+@app.cell
+def _(df, pl):
+    (df
+     .with_columns(decade=pl.col('year').floordiv(10).mul(10))
+     .filter(~pl.col('category').eq('Vintage'))
+     .filter(pl.col('year') >= 2008)
+     )
+    return
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""We could do the previous two operations with built-in functions. Here's with greater than.""")
+    return
+@app.cell
+def _(df, pl):
+    (df
+     .with_columns(decade=pl.col('year').floordiv(10).mul(10))
+     .filter(~pl.col('category').eq('Vintage'))
+     .filter(pl.col('year').gt(2008))
+     )
+    return
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""And here's with greater or equal to""")
+    return
+@app.cell
+def _(df, pl):
+    (df
+     .with_columns(decade=pl.col('year').floordiv(10).mul(10))
+     .filter(~pl.col('category').eq('Vintage'))
+     .filter(pl.col('year').ge(2008))
+     )
+    return
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(
+        r"""
+        **Note**: For "less than", and "less or equal to" you can use the operators `<` or `<=`. Alternatively, you can use built-in functions `lt` or `le` respectively.
+        ### Is between
+        Polars also allows us to filter between a range of values. Let's get the modern software were the year is between 2013 and 2016. This is inclusive on both ends (i.e. both years are part of the result).
+        """
+    )
+    return
+@app.cell
+def _(df, pl):
+    (df
+     .with_columns(decade=pl.col('year').floordiv(10).mul(10))
+     .filter(pl.col('category').eq('Modern'))
+     .filter(pl.col('year').is_between(2013, 2016))
+     )
+    return
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(
+        r"""
+        ### Or operator
+        If we only want either one of the conditions in the comparison to be met, we could use `|`, which is the `or` operator.
+        Let's get software that is either modern or used in the decade 1980s.
+        """
+    )
+    return
+@app.cell
+def _(df, pl):
+    (df
+     .with_columns(decade=pl.col('year').floordiv(10).mul(10))
+     .filter((pl.col('category') == 'Modern') | (pl.col('decade') == 1980))
+     )
+    return
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(
+        r"""
+        ## Conditionals
+        Polars also allows you create new columns based on a condition. Let's create a column *status* that will indicate if the software is "discontinued" or "in use".
+        Here's a list of products that are no longer in use.
+        """
+    )
+    return
+@app.cell
+def _():
+    discontinued_list = ['Lotus-123', 'WordStar', 'dBase III', 'VisiCalc', 'MS-DOS', 'HyperCard']
+    return (discontinued_list,)
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""Here's how we can get a dataframe of the products that are discontinued.""")
+    return
+@app.cell
+def _(df, discontinued_list, pl):
+    (df
+     .with_columns(decade=pl.col('year').floordiv(10).mul(10))
+     .filter(pl.col('software').is_in(discontinued_list))
+     )
+    return
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""Now, let's create the *status* column.""")
+    return
+@app.cell
+def _(df, discontinued_list, pl):
+    (df
+     .with_columns(decade=pl.col('year').floordiv(10).mul(10))
+     .with_columns(pl.when(pl.col('software').is_in(discontinued_list))
+                   .then(pl.lit('Discontinued'))
+                   .otherwise(pl.lit('In use'))
+                   .alias('status')
+                   )
+     )
+    return
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(
+        r"""
+        ## Unique counts
+        Sometimes you may want to see only the unique values in a column. Let's check the unique decades we have in our DataFrame.
+        """
+    )
+    return
+@app.cell
+def _(df, discontinued_list, pl):
+    (df
+     .with_columns(decade=pl.col('year').floordiv(10).mul(10))
+     .with_columns(pl.when(pl.col('software').is_in(discontinued_list))
+                   .then(pl.lit('Discontinued'))
+                   .otherwise(pl.lit('In use'))
+                   .alias('status')
+                   )
+     .select('decade').unique()
+     )
+    return
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""Finally, let's find out the number of software used in each decade.""")
+    return
+@app.cell
+def _(df, discontinued_list, pl):
+    (df
+     .with_columns(decade=pl.col('year').floordiv(10).mul(10))
+     .with_columns(pl.when(pl.col('software').is_in(discontinued_list))
+                   .then(pl.lit('Discontinued'))
+                   .otherwise(pl.lit('In use'))
+                   .alias('status')
+                   )
+     ['decade'].value_counts()
+     )
+    return
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""We could also rewrite the above code as follows:""")
+    return
+@app.cell
+def _(df, discontinued_list, pl):
+    (df
+     .with_columns(decade=pl.col('year').floordiv(10).mul(10))
+     .with_columns(pl.when(pl.col('software').is_in(discontinued_list))
+                   .then(pl.lit('Discontinued'))
+                   .otherwise(pl.lit('In use'))
+                   .alias('status')
+                   )
+     .select('decade').to_series().value_counts()
+     )
+    return
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""Hopefully, we've picked your interest to try out Polars the next time you analyze your data.""")
+    return
+# Empty trailing cell: takes no inputs and returns nothing.
+@app.cell
+def _():
+    return
+# Run the app only when this file is executed directly as a script.
+if __name__ == "__main__":
+    app.run()