# Repository path: CognitiveEDA/modules/profiling.py
# Commit 58f2491 (verified), mgbam: "Create profiling.py"
# modules/profiling.py
# -*- coding: utf-8 -*-
#
# PROJECT: CognitiveEDA v5.7 - The QuantumLeap Intelligence Platform
#
# DESCRIPTION: A dedicated module for profiling and characterizing customer
# segments identified through clustering.
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import logging
def profile_clusters(df: pd.DataFrame, cluster_labels: pd.Series, numeric_cols: list, cat_cols: list) -> tuple:
    """
    Analyzes and profiles clusters to create meaningful business personas.

    Rows are grouped by cluster label. Each cluster is described by the mean
    (rounded to 2 decimals) of every numeric column and by the most frequent
    value (mode) of every categorical column. The result is rendered as a
    markdown report, accompanied by a grouped bar chart of the average
    'Total_Revenue' per 'City' within each cluster.

    Args:
        df (pd.DataFrame): The feature-engineered DataFrame. Only the rows
            whose index appears in ``cluster_labels`` are profiled.
        cluster_labels (pd.Series): Cluster label per row, indexed like the
            rows of ``df`` it refers to. Rows with a missing label are ignored.
        numeric_cols (list): Numeric columns to average (e.g., ['Total_Revenue']).
        cat_cols (list): Categorical columns to summarise by their mode
            (e.g., ['City', 'Product']).

    Returns:
        A tuple containing:
        - A markdown string with the detailed profile of each cluster.
        - A Plotly Figure visualizing the differences between clusters. If the
          chart cannot be built (for example 'City' or 'Total_Revenue' is not
          a column of ``df``), an empty figure with an explanatory title is
          returned instead and the error is logged.

    Raises:
        KeyError: If ``cluster_labels`` contains index values that are absent
            from ``df``, or a requested column is not present in ``df``.
    """
    # Columns the persona naming and the chart are built around.
    persona_col = 'City'
    revenue_col = 'Total_Revenue'

    # Restrict the data to the labelled rows and attach the labels.
    profile_df = df.loc[cluster_labels.index].copy()
    profile_df['Cluster'] = cluster_labels

    # Rows without a label belong to no cluster. groupby() silently drops
    # them while unique() would still report NaN, which previously led to a
    # KeyError when looking that "cluster" up in the profile table.
    profile_df = profile_df.dropna(subset=['Cluster'])

    if profile_df.empty:
        return "No data available to profile clusters.", go.Figure()

    logging.info("Profiling %s clusters...", profile_df['Cluster'].nunique())

    # --- Generate Markdown Report ---
    by_cluster = profile_df.groupby('Cluster')

    # Numeric features: per-cluster mean.
    numeric_profile = by_cluster[numeric_cols].mean().round(2)

    # Categorical features: per-cluster most frequent value.
    cat_profiles = [
        by_cluster[col].apply(_most_frequent).to_frame() for col in cat_cols
    ]

    # One row per cluster, one column per profiled feature.
    full_profile = pd.concat([numeric_profile, *cat_profiles], axis=1)
    has_persona_col = persona_col in full_profile.columns

    # Collect the report pieces and join once instead of repeated `+=`.
    report_parts = ["### Cluster Persona Analysis\n\n"]
    for cluster_id in sorted(profile_df['Cluster'].unique()):
        # Name the persona by the dominant city, fall back to a generic name
        # when 'City' was not among the profiled columns.
        if has_persona_col:
            persona_name = full_profile.loc[cluster_id, persona_col]
        else:
            persona_name = f"Segment {cluster_id}"
        report_parts.append(f"#### Cluster {cluster_id}: The '{persona_name}' Persona\n")

        # Numeric Summary
        for col in numeric_cols:
            val = full_profile.loc[cluster_id, col]
            report_parts.append(f"- **Avg. {col.replace('_', ' ')}:** `{val:,.2f}`\n")

        # Categorical Summary
        for col in cat_cols:
            val = full_profile.loc[cluster_id, col]
            report_parts.append(f"- **Dominant {col.replace('_', ' ')}:** `{val}`\n")

        report_parts.append("\n")
    report_md = "".join(report_parts)

    # --- Generate Visualization ---
    # Average 'Total_Revenue' by 'City' for each cluster. This directly tests
    # the hypothesis that 'City' is the dominant feature.
    try:
        vis_df = (
            profile_df.groupby(['Cluster', persona_col])[revenue_col]
            .mean()
            .reset_index()
        )
        fig = px.bar(
            vis_df,
            x='Cluster',
            y=revenue_col,
            color=persona_col,
            barmode='group',
            title='<b>Cluster Profile: Avg. Total Revenue by Dominant City</b>',
            labels={revenue_col: 'Average Total Revenue ($)', 'Cluster': 'Customer Segment'},
        )
    except Exception as e:
        # Deliberately broad: the chart is best-effort and a failure here
        # (missing columns, non-numeric revenue, plotting error) must not
        # discard the report that was already built.
        logging.error("Could not generate profile visualization: %s", e)
        fig = go.Figure().update_layout(title="Could not generate profile visualization.")

    return report_md, fig


def _most_frequent(values: pd.Series):
    """Return the first mode of ``values``, or "N/A" when it has no mode (e.g. all values missing)."""
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else "N/A"