Spaces:
Running
Running
"""Contains helper functions to tidy/format data and refactor bigger components on screen.""" | |
import pandas as pd | |
import vizro.models as vm | |
from vizro.figures import kpi_card_reference | |
from .config import NULL_VALUE, REGION_MAPPING, SELECTED_CUSTOMER_COLUMNS | |
def shorten_product_name(name, n_words=4):
    """Return a product label truncated to its first ``n_words`` words.

    The value is converted with ``str`` before splitting on whitespace, so
    non-string inputs are accepted. The kept words are re-joined with single spaces.
    """
    return " ".join(str(name).split()[:n_words])
def tidy_orders_data(orders: pd.DataFrame):
    """Tidies and filters the data frame and creates additional columns.

    Rows without a "Category" and rows whose "Order Date" is missing or cannot be
    parsed with the ``%Y-%m-%d`` format are removed. The input frame is not modified.

    Args:
        orders: Raw orders data. Must contain the columns "Category", "Order Date",
            "Shipping Address State", "Purchase Price Per Unit", "Quantity" and "Title".

    Returns:
        A new data frame with "Order Date" converted to datetime, nulls in
        "Shipping Address State" replaced by ``NULL_VALUE``, and the added columns
        "Region", "Order_Value", "Short_Title", "Year", "Month_Day" and "Month".
    """
    # Remove rows where 'Category' is missing
    orders = orders.dropna(subset=["Category"])

    # Convert to correct data types. `assign` returns a new frame, which avoids writing
    # into the filtered slice above (chained assignment / SettingWithCopyWarning).
    orders = orders.assign(
        **{"Order Date": pd.to_datetime(orders["Order Date"], format="%Y-%m-%d", errors="coerce")}
    )

    # Drop missing dates only AFTER parsing: `errors="coerce"` turns unparseable values
    # into NaT, and a single NaT would make `.dt.year` float, so "Year" would become
    # "2021.0"/"nan" instead of "2021".
    orders = orders.dropna(subset=["Order Date"])

    # Create new columns
    order_date = orders["Order Date"]
    orders = orders.assign(
        Region=orders["Shipping Address State"].map(REGION_MAPPING).fillna(NULL_VALUE),
        Order_Value=orders["Purchase Price Per Unit"] * orders["Quantity"],
        Short_Title=orders["Title"].apply(shorten_product_name),
        Year=order_date.dt.year.astype(str),
        Month_Day=order_date.dt.strftime("%b-%d"),
        Month=order_date.dt.month,
    )

    # Fill null values (done after "Region" is derived so the mapping sees the raw states)
    orders["Shipping Address State"] = orders["Shipping Address State"].fillna(NULL_VALUE)
    return orders
def create_kpi_data(orders):
    """Aggregate the orders per year and reshape the KPIs into a single wide row.

    Per "Year" the function computes the summed "Order_Value", the number of unique
    "Survey ResponseID" values and the summed "Quantity", then derives the average
    unit price and the order value in millions.

    Returns:
        A one-row data frame (index value 0) with one column per KPI and year,
        named ``"<KPI name>_<Year>"``, e.g. ``"Total order value mil_2021"``.
    """
    yearly = (
        orders.groupby("Year")
        .agg(
            **{
                "Total order value": ("Order_Value", "sum"),
                "Number of customers": ("Survey ResponseID", pd.Series.nunique),
                "Total units ordered": ("Quantity", "sum"),
            }
        )
        .reset_index()
    )

    yearly = yearly.assign(
        **{
            "Avg product unit price": yearly["Total order value"] / yearly["Total units ordered"],
            "Total order value mil": yearly["Total order value"] / 1000000,
        },
        # Constant helper key so the pivot collapses all years into one row
        index=0,
    )

    kpi_names = [
        "Total order value mil",
        "Number of customers",
        "Total units ordered",
        "Avg product unit price",
    ]
    wide = yearly.pivot(index="index", columns="Year", values=kpi_names)

    # Flatten the (KPI, year) column MultiIndex into plain "<KPI>_<year>" labels
    wide.columns = [f"{metric}_{period}" for metric, period in wide.columns.to_flat_index()]
    return wide
def create_kpi_container(data_frame: pd.DataFrame, id: str, layout: vm.Layout):
    """Build a container holding four KPI reference cards.

    Each card reads its value from the ``"<KPI>_2021"`` column of ``data_frame`` and
    its reference from the matching ``"<KPI>_2020"`` column.

    Args:
        data_frame: Frame passed to every ``kpi_card_reference`` call; it must contain
            the ``_2021`` and ``_2020`` columns for the four KPIs listed below.
        id: Suffix for the container id, which becomes ``"kpi-container-<id>"``.
        layout: Layout for the container; any falsy value is passed on as ``None``.

    Returns:
        The configured ``vm.Container``.
    """
    # (column stem, card title, value format, reference format, icon)
    card_specs = [
        (
            "Total order value mil",
            "Total order value",
            "${value:.2f}M",
            "{delta_relative:+.1%} vs. LY (${reference:.2f}M)",
            "payments",
        ),
        (
            "Number of customers",
            "No. of customers",
            "{value:,.0f}",
            "{delta_relative:+.1%} vs. LY ({reference:,.0f})",
            "groups",
        ),
        (
            "Total units ordered",
            "Total units ordered",
            "{value:,.0f}",
            "{delta_relative:+.1%} vs. LY ({reference:,.0f})",
            "production_quantity_limits",
        ),
        (
            "Avg product unit price",
            "Avg. unit price",
            "${value:.2f}",
            "{delta_relative:+.1%} vs. LY (${reference:.2f})",
            "price_change",
        ),
    ]

    kpi_cards = [
        vm.Figure(
            figure=kpi_card_reference(
                data_frame,
                value_column=f"{stem}_2021",
                reference_column=f"{stem}_2020",
                title=card_title,
                value_format=value_fmt,
                reference_format=reference_fmt,
                icon=icon_name,
            )
        )
        for stem, card_title, value_fmt, reference_fmt, icon_name in card_specs
    ]

    return vm.Container(
        id=f"kpi-container-{id}",
        title="",
        layout=layout or None,
        components=kpi_cards,
    )
def create_customer_df(orders_cy: pd.DataFrame, survey: pd.DataFrame):
    """Aggregate orders per customer and enrich them with the survey columns.

    Orders are grouped by "Year", "Survey ResponseID", "Region" and
    "Shipping Address State". The survey frame is left-joined on "Survey ResponseID",
    the result is reduced to ``SELECTED_CUSTOMER_COLUMNS`` and the aggregated columns
    are renamed to descriptive labels.

    Returns:
        The customer data frame with the extra columns "Avg unit price",
        "Avg order value" and "Quintiles" (0-4 bucket of "Total order value" from
        ``pd.qcut``).
    """
    group_keys = ["Year", "Survey ResponseID", "Region", "Shipping Address State"]
    aggregations = {
        "Order_Value": "sum",
        "Category": pd.Series.nunique,
        "ASIN/ISBN (Product Code)": pd.Series.nunique,
        "Order Date": pd.Series.nunique,
        "Quantity": "sum",
    }
    per_customer = orders_cy.groupby(group_keys).agg(aggregations).reset_index()

    # Left join keeps every aggregated customer row, even without a survey match
    enriched = pd.merge(per_customer, survey, on="Survey ResponseID", how="left")

    # Filter and rename columns for better understanding
    readable_names = {
        "Order_Value": "Total order value",
        "Category": "Number of unique categories",
        "ASIN/ISBN (Product Code)": "Number of unique products",
        "Order Date": "Number of unique order dates",
        "Quantity": "Total units ordered",
    }
    customers = enriched[SELECTED_CUSTOMER_COLUMNS].rename(columns=readable_names)

    # Create new metrics
    total_value = customers["Total order value"]
    return customers.assign(
        **{
            "Avg unit price": total_value / customers["Total units ordered"],
            "Avg order value": total_value / customers["Number of unique order dates"],
            "Quintiles": pd.qcut(total_value, 5, labels=False),
        }
    )