# modules/profiling.py
# -*- coding: utf-8 -*-
#
# PROJECT:      CognitiveEDA v5.7 - The QuantumLeap Intelligence Platform
#
# DESCRIPTION:  A dedicated module for profiling and characterizing customer
#               segments identified through clustering.

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import logging


def _dominant_value(values: pd.Series):
    """Return the most frequent value of *values*, or "N/A" if there is none.

    ``Series.mode()`` is empty when the group holds only missing values, so
    the placeholder keeps the report readable instead of raising.
    """
    modes = values.mode()
    return "N/A" if modes.empty else modes.iloc[0]


def _build_revenue_by_city_figure(profile_df: pd.DataFrame) -> go.Figure:
    """Build a grouped bar chart of mean 'Total_Revenue' per 'Cluster' and 'City'.

    The chart is best-effort: if the required columns are missing or plotting
    fails for any reason, the error is logged and a placeholder figure with an
    explanatory title is returned so the caller always receives a Figure.
    """
    try:
        vis_df = (
            profile_df.groupby(['Cluster', 'City'])['Total_Revenue']
            .mean()
            .reset_index()
        )
        return px.bar(
            vis_df,
            x='Cluster',
            y='Total_Revenue',
            color='City',
            barmode='group',
            title='Cluster Profile: Avg. Total Revenue by Dominant City',
            labels={
                'Total_Revenue': 'Average Total Revenue ($)',
                'Cluster': 'Customer Segment',
            },
        )
    except Exception as e:
        # Deliberately broad: the visualization must never break the report.
        logging.error("Could not generate profile visualization: %s", e)
        return go.Figure().update_layout(
            title="Could not generate profile visualization."
        )


def profile_clusters(df: pd.DataFrame, cluster_labels: pd.Series,
                     numeric_cols: list, cat_cols: list) -> tuple:
    """
    Analyzes and profiles clusters to create meaningful business personas.

    The rows of ``df`` selected by ``cluster_labels.index`` are grouped by
    cluster label. For each cluster the mean of every numeric column (rounded
    to 2 decimals) and the most frequent value of every categorical column are
    computed and rendered as a markdown report. A bar chart of the average
    'Total_Revenue' by 'City' per cluster is produced alongside it.

    Rows whose cluster label is missing (NaN) are excluded from both the
    statistics and the report.

    Args:
        df (pd.DataFrame): The feature-engineered DataFrame.
        cluster_labels (pd.Series): Cluster label per row (e.g. from a K-Means
            model); its index selects and aligns the rows of ``df``.
        numeric_cols (list): Numeric columns to profile (e.g., ['Total_Revenue']).
        cat_cols (list): Categorical columns to profile (e.g., ['City', 'Product']).

    Returns:
        A tuple containing:
        - A markdown string with the detailed profile of each cluster.
        - A Plotly Figure visualizing the differences between clusters.

    Raises:
        KeyError: If ``cluster_labels.index`` contains labels absent from
            ``df.index``, or a requested column is not in ``df``.
    """
    # Ensure the dataframe used for profiling has the same index as the labels
    profile_df = df.loc[cluster_labels.index].copy()
    profile_df['Cluster'] = cluster_labels

    if profile_df.empty:
        return "No data available to profile clusters.", go.Figure()

    logging.info("Profiling %d clusters...", profile_df['Cluster'].nunique())

    # groupby() silently drops NaN keys, so the report must iterate over the
    # non-null labels only; otherwise a NaN label triggers a KeyError below.
    cluster_ids = sorted(profile_df['Cluster'].dropna().unique())
    grouped = profile_df.groupby('Cluster')

    # --- Build the per-cluster profile table ---
    profile_frames = []
    if numeric_cols:
        # Mean of each numeric feature per cluster
        profile_frames.append(grouped[list(numeric_cols)].mean().round(2))
    for col in cat_cols:
        # Most frequent value (mode) of each categorical feature per cluster
        profile_frames.append(grouped[col].apply(_dominant_value).to_frame())

    if profile_frames:
        full_profile = pd.concat(profile_frames, axis=1)
    else:
        # Nothing to profile: keep an index-only table so the headers still render
        full_profile = pd.DataFrame(index=cluster_ids)

    # --- Generate Markdown Report ---
    report_parts = ["### Cluster Persona Analysis\n\n"]
    for cluster_id in cluster_ids:
        # Try to name the persona by the dominant city, fall back to a generic name
        try:
            persona_name = full_profile.loc[cluster_id, 'City']
        except KeyError:
            persona_name = f"Segment {cluster_id}"

        report_parts.append(
            f"#### Cluster {cluster_id}: The '{persona_name}' Persona\n"
        )

        # Numeric Summary
        for col in numeric_cols:
            val = full_profile.loc[cluster_id, col]
            report_parts.append(
                f"- **Avg. {col.replace('_', ' ')}:** `{val:,.2f}`\n"
            )

        # Categorical Summary
        for col in cat_cols:
            val = full_profile.loc[cluster_id, col]
            report_parts.append(
                f"- **Dominant {col.replace('_', ' ')}:** `{val}`\n"
            )

        report_parts.append("\n")

    report_md = "".join(report_parts)

    # --- Generate Visualization ---
    # Average 'Total_Revenue' by 'City' for each cluster; this directly tests
    # the hypothesis that 'City' is the dominant feature.
    fig = _build_revenue_by_city_figure(profile_df)

    return report_md, fig