Spaces:
Sleeping
Sleeping
Srihari Thyagarajan
committed on
fixes
Browse files- polars/01_why_polars.py +49 -35
polars/01_why_polars.py
CHANGED
@@ -9,7 +9,7 @@
|
|
9 |
|
10 |
import marimo
|
11 |
|
12 |
-
__generated_with = "0.11.
|
13 |
app = marimo.App(width="medium")
|
14 |
|
15 |
|
@@ -19,19 +19,41 @@ def _():
|
|
19 |
return (mo,)
|
20 |
|
21 |
|
22 |
-
@app.cell
|
23 |
def _(mo):
|
24 |
mo.md(
|
25 |
"""
|
26 |
# An introduction to Polars
|
27 |
|
28 |
This notebook provides a bird's-eye overview of [Polars](https://pola.rs/), a fast and user-friendly data manipulation library for Python, and compares it to alternatives like Pandas and PySpark.
|
29 |
-
|
30 |
-
Like Pandas and PySpark, the central data structure in Polars is **the DataFrame**, a tabular data structure consisting of named columns. For example, the next cell constructs a DataFrame that records the gender, age, and height in centimeters for a number of individuals.
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
|
36 |
Polars' performance is due to a number of factors, including its implementation in Rust and its ability to perform operations in a parallelized and vectorized manner. It supports a wide range of data types, advanced query optimizations, and seamless integration with other Python libraries, making it a versatile tool for data scientists, engineers, and analysts. Additionally, Polars provides a lazy API for deferred execution, allowing users to optimize their workflows by chaining operations and executing them in a single pass.
|
37 |
|
@@ -41,27 +63,26 @@ def _(mo):
|
|
41 |
return
|
42 |
|
43 |
|
44 |
-
@app.cell
|
45 |
def _(mo):
|
46 |
mo.md(
|
47 |
"""
|
48 |
## Choosing Polars over Pandas
|
49 |
|
50 |
-
|
51 |
In this section we'll give a few reasons why Polars is a better choice than Pandas, along with examples.
|
52 |
"""
|
53 |
)
|
54 |
return
|
55 |
|
56 |
|
57 |
-
@app.cell
|
58 |
def _(mo):
|
59 |
mo.md(
|
60 |
"""
|
61 |
### Intuitive syntax
|
62 |
|
63 |
Polars' syntax is similar to PySpark and intuitive like SQL, making heavy use of **method chaining**. This makes it easy for data professionals to transition to Polars, and leads to an API that is more concise and readable than Pandas.
|
64 |
-
|
65 |
**Example.** In the next few cells, we contrast the Pandas code for a basic filter and aggregation of data with the code required to accomplish the same task with `Polars`.
|
66 |
"""
|
67 |
)
|
@@ -92,21 +113,15 @@ def _():
|
|
92 |
return df_pd, filtered_df_pd, pd, result_pd
|
93 |
|
94 |
|
95 |
-
@app.cell
|
96 |
def _(mo):
|
97 |
-
mo.md(
|
98 |
-
r"""
|
99 |
-
The same example can be worked out in Polars more concisely, using method chaining. Notice how the Polars code is essentially as readable as English.
|
100 |
-
"""
|
101 |
-
)
|
102 |
return
|
103 |
|
104 |
|
105 |
@app.cell
|
106 |
-
def _():
|
107 |
-
|
108 |
-
|
109 |
-
df_pl = pl.DataFrame(
|
110 |
{
|
111 |
"Gender": ["Male", "Female", "Male", "Female", "Male", "Female",
|
112 |
"Male", "Female", "Male", "Female"],
|
@@ -118,17 +133,16 @@ def _():
|
|
118 |
# query: average height of male and female after the age of 15 years
|
119 |
|
120 |
# filter, groupby and aggregation using method chaining
|
121 |
-
result_pl =
|
122 |
result_pl
|
123 |
-
return
|
124 |
|
125 |
|
126 |
-
@app.cell
|
127 |
def _(mo):
|
128 |
mo.md(
|
129 |
"""
|
130 |
Notice how Polars uses a *method-chaining* approach, similar to PySpark, which makes the code more readable and expressive while using a *single line* to design the query.
|
131 |
-
|
132 |
Additionally, Polars supports SQL-like operations *natively*, which allows you to write SQL queries directly on a Polars DataFrame:
|
133 |
"""
|
134 |
)
|
@@ -136,13 +150,13 @@ def _(mo):
|
|
136 |
|
137 |
|
138 |
@app.cell
|
139 |
-
def _(
|
140 |
-
result =
|
141 |
result
|
142 |
return (result,)
|
143 |
|
144 |
|
145 |
-
@app.cell
|
146 |
def _(mo):
|
147 |
mo.md(
|
148 |
"""
|
@@ -154,7 +168,7 @@ def _(mo):
|
|
154 |
return
|
155 |
|
156 |
|
157 |
-
@app.cell
|
158 |
def _(mo):
|
159 |
mo.md(
|
160 |
"""
|
@@ -178,7 +192,7 @@ def _(mo):
|
|
178 |
return
|
179 |
|
180 |
|
181 |
-
@app.cell
|
182 |
def _(mo):
|
183 |
mo.md(
|
184 |
"""
|
@@ -211,7 +225,7 @@ def _(mo):
|
|
211 |
return
|
212 |
|
213 |
|
214 |
-
@app.cell
|
215 |
def _(mo):
|
216 |
mo.md(
|
217 |
"""
|
@@ -249,7 +263,7 @@ def _(mo):
|
|
249 |
return
|
250 |
|
251 |
|
252 |
-
@app.cell
|
253 |
def _(mo):
|
254 |
mo.md(
|
255 |
"""
|
@@ -268,7 +282,7 @@ def _(mo):
|
|
268 |
return
|
269 |
|
270 |
|
271 |
-
@app.cell
|
272 |
def _(mo):
|
273 |
mo.md(
|
274 |
"""
|
@@ -282,7 +296,7 @@ def _(mo):
|
|
282 |
return
|
283 |
|
284 |
|
285 |
-
@app.cell
|
286 |
def _(mo):
|
287 |
mo.md(
|
288 |
"""
|
|
|
9 |
|
10 |
import marimo
|
11 |
|
12 |
+
__generated_with = "0.11.8"
|
13 |
app = marimo.App(width="medium")
|
14 |
|
15 |
|
|
|
19 |
return (mo,)
|
20 |
|
21 |
|
22 |
+
@app.cell(hide_code=True)
|
23 |
def _(mo):
|
24 |
mo.md(
|
25 |
"""
|
26 |
# An introduction to Polars
|
27 |
|
28 |
This notebook provides a bird's-eye overview of [Polars](https://pola.rs/), a fast and user-friendly data manipulation library for Python, and compares it to alternatives like Pandas and PySpark.
|
29 |
+
|
30 |
+
Like Pandas and PySpark, the central data structure in Polars is **the DataFrame**, a tabular data structure consisting of named columns. For example, the next cell constructs a DataFrame that records the gender, age, and height in centimeters for a number of individuals.
|
31 |
+
"""
|
32 |
+
)
|
33 |
+
return
|
34 |
+
|
35 |
+
|
36 |
+
@app.cell
|
37 |
+
def _():
|
38 |
+
import polars as pl
|
39 |
+
|
40 |
+
df_pl = pl.DataFrame(
|
41 |
+
{
|
42 |
+
"gender": ["Male", "Female", "Male", "Female", "Male", "Female",
|
43 |
+
"Male", "Female", "Male", "Female"],
|
44 |
+
"age": [13, 15, 17, 19, 21, 23, 25, 27, 29, 31],
|
45 |
+
"height_cm": [150.0, 170.0, 146.5, 142.0, 155.0, 165.0, 170.8, 130.0, 132.5, 162.0]
|
46 |
+
}
|
47 |
+
)
|
48 |
+
df_pl
|
49 |
+
return df_pl, pl
|
50 |
+
|
51 |
+
|
52 |
+
@app.cell(hide_code=True)
|
53 |
+
def _(mo):
|
54 |
+
mo.md(
|
55 |
+
"""
|
56 |
+
Unlike Python's earliest DataFrame library Pandas, Polars was designed with performance and usability in mind — Polars can scale to large datasets with ease while maintaining a simple and intuitive API.
|
57 |
|
58 |
Polars' performance is due to a number of factors, including its implementation in Rust and its ability to perform operations in a parallelized and vectorized manner. It supports a wide range of data types, advanced query optimizations, and seamless integration with other Python libraries, making it a versatile tool for data scientists, engineers, and analysts. Additionally, Polars provides a lazy API for deferred execution, allowing users to optimize their workflows by chaining operations and executing them in a single pass.
|
59 |
|
|
|
63 |
return
|
64 |
|
65 |
|
66 |
+
@app.cell(hide_code=True)
|
67 |
def _(mo):
|
68 |
mo.md(
|
69 |
"""
|
70 |
## Choosing Polars over Pandas
|
71 |
|
|
|
72 |
In this section we'll give a few reasons why Polars is a better choice than Pandas, along with examples.
|
73 |
"""
|
74 |
)
|
75 |
return
|
76 |
|
77 |
|
78 |
+
@app.cell(hide_code=True)
|
79 |
def _(mo):
|
80 |
mo.md(
|
81 |
"""
|
82 |
### Intuitive syntax
|
83 |
|
84 |
Polars' syntax is similar to PySpark and intuitive like SQL, making heavy use of **method chaining**. This makes it easy for data professionals to transition to Polars, and leads to an API that is more concise and readable than Pandas.
|
85 |
+
|
86 |
**Example.** In the next few cells, we contrast the Pandas code for a basic filter and aggregation of data with the code required to accomplish the same task with `Polars`.
|
87 |
"""
|
88 |
)
|
|
|
113 |
return df_pd, filtered_df_pd, pd, result_pd
|
114 |
|
115 |
|
116 |
+
@app.cell(hide_code=True)
|
117 |
def _(mo):
|
118 |
+
mo.md(r"""The same example can be worked out in Polars more concisely, using method chaining. Notice how the Polars code is essentially as readable as English.""")
|
|
|
|
|
|
|
|
|
119 |
return
|
120 |
|
121 |
|
122 |
@app.cell
|
123 |
+
def _(pl):
|
124 |
+
data_pl = pl.DataFrame(
|
|
|
|
|
125 |
{
|
126 |
"Gender": ["Male", "Female", "Male", "Female", "Male", "Female",
|
127 |
"Male", "Female", "Male", "Female"],
|
|
|
133 |
# query: average height of male and female after the age of 15 years
|
134 |
|
135 |
# filter, groupby and aggregation using method chaining
|
136 |
+
result_pl = data_pl.filter(pl.col("Age") > 15).group_by("Gender").agg(pl.mean("Height_CM"))
|
137 |
result_pl
|
138 |
+
return data_pl, result_pl
|
139 |
|
140 |
|
141 |
+
@app.cell(hide_code=True)
|
142 |
def _(mo):
|
143 |
mo.md(
|
144 |
"""
|
145 |
Notice how Polars uses a *method-chaining* approach, similar to PySpark, which makes the code more readable and expressive while using a *single line* to design the query.
|
|
|
146 |
Additionally, Polars supports SQL-like operations *natively*, which allows you to write SQL queries directly on a Polars DataFrame:
|
147 |
"""
|
148 |
)
|
|
|
150 |
|
151 |
|
152 |
@app.cell
|
153 |
+
def _(data_pl):
|
154 |
+
result = data_pl.sql("SELECT Gender, AVG(Height_CM) FROM self WHERE Age > 15 GROUP BY Gender")
|
155 |
result
|
156 |
return (result,)
|
157 |
|
158 |
|
159 |
+
@app.cell(hide_code=True)
|
160 |
def _(mo):
|
161 |
mo.md(
|
162 |
"""
|
|
|
168 |
return
|
169 |
|
170 |
|
171 |
+
@app.cell(hide_code=True)
|
172 |
def _(mo):
|
173 |
mo.md(
|
174 |
"""
|
|
|
192 |
return
|
193 |
|
194 |
|
195 |
+
@app.cell(hide_code=True)
|
196 |
def _(mo):
|
197 |
mo.md(
|
198 |
"""
|
|
|
225 |
return
|
226 |
|
227 |
|
228 |
+
@app.cell(hide_code=True)
|
229 |
def _(mo):
|
230 |
mo.md(
|
231 |
"""
|
|
|
263 |
return
|
264 |
|
265 |
|
266 |
+
@app.cell(hide_code=True)
|
267 |
def _(mo):
|
268 |
mo.md(
|
269 |
"""
|
|
|
282 |
return
|
283 |
|
284 |
|
285 |
+
@app.cell(hide_code=True)
|
286 |
def _(mo):
|
287 |
mo.md(
|
288 |
"""
|
|
|
296 |
return
|
297 |
|
298 |
|
299 |
+
@app.cell(hide_code=True)
|
300 |
def _(mo):
|
301 |
mo.md(
|
302 |
"""
|