Spaces:

mgbam
/

CognitiveEDA

Sleeping

App Files Files Community

CognitiveEDA / modules /profiling.py

mgbam

Create profiling.py

58f2491 verified 10 days ago

raw

history blame contribute delete

4.02 kB

	# modules/profiling.py

	# -*- coding: utf-8 -*-
	#
	# PROJECT: CognitiveEDA v5.7 - The QuantumLeap Intelligence Platform
	#
	# DESCRIPTION: A dedicated module for profiling and characterizing customer
	# segments identified through clustering.

	import pandas as pd
	import plotly.express as px
	import plotly.graph_objects as go
	import logging

	def profile_clusters(df: pd.DataFrame, cluster_labels: pd.Series, numeric_cols: list, cat_cols: list) -> tuple:
	    """
	    Analyzes and profiles clusters to create meaningful business personas.

	    The rows of ``df`` selected by ``cluster_labels.index`` are grouped by
	    cluster label. For each cluster the mean of every numeric column
	    (rounded to 2 decimals) and the first mode of every categorical column
	    are computed, rendered as a markdown report, and complemented by a
	    grouped bar chart of the average 'Total_Revenue' per 'City' and cluster.

	    Args:
	        df (pd.DataFrame): The feature-engineered DataFrame. It must contain
	            every index label present in ``cluster_labels``.
	        cluster_labels (pd.Series): Cluster label per row, indexed like the
	            rows of ``df`` it describes. Rows whose label is missing (NaN)
	            are excluded from the profile.
	        numeric_cols (list): Numeric columns to profile (e.g., ['Total_Revenue']).
	        cat_cols (list): Categorical columns to profile (e.g., ['City', 'Product']).

	    Returns:
	        A tuple containing:
	        - A markdown string with the detailed profile of each cluster
	          (or a short "no data" message when nothing can be profiled).
	        - A Plotly Figure visualizing the differences between clusters. If
	          the chart cannot be built (for example because 'City' or
	          'Total_Revenue' is absent), the error is logged and an empty
	          figure with an explanatory title is returned instead.

	    Raises:
	        KeyError: Propagated from ``df.loc`` when ``cluster_labels.index``
	            holds labels that are not in ``df`` (behaviour of current pandas).
	    """
	    # Profile only the rows that actually received a label.
	    profile_df = df.loc[cluster_labels.index].copy()
	    profile_df['Cluster'] = cluster_labels

	    # groupby() drops NaN group keys by default, so a NaN label would have no
	    # row in the profile table and the per-cluster lookups below would raise
	    # KeyError. Remove unlabelled rows up front instead.
	    profile_df = profile_df[profile_df['Cluster'].notna()]

	    if profile_df.empty:
	        return "No data available to profile clusters.", go.Figure()

	    logging.info("Profiling %s clusters...", profile_df['Cluster'].nunique())

	    # --- Build the per-cluster profile table ---
	    # Numeric features: mean per cluster.
	    numeric_profile = profile_df.groupby('Cluster')[numeric_cols].mean().round(2)

	    # Categorical features: most frequent value per cluster, one frame per column.
	    cat_profile_list = [
	        profile_df.groupby('Cluster')[col].apply(_dominant_value).to_frame()
	        for col in cat_cols
	    ]

	    full_profile = pd.concat([numeric_profile] + cat_profile_list, axis=1)

	    # --- Generate Markdown Report ---
	    # Collect fragments and join once rather than growing a string with `+=`.
	    report_parts = ["### Cluster Persona Analysis\n\n"]

	    for cluster_id in sorted(profile_df['Cluster'].unique()):
	        # Name the persona after the dominant city; fall back to a generic
	        # name when no 'City' entry is available for this cluster.
	        try:
	            persona_name = full_profile.loc[cluster_id, 'City']
	        except KeyError:
	            persona_name = f"Segment {cluster_id}"

	        report_parts.append(f"#### Cluster {cluster_id}: The '{persona_name}' Persona\n")

	        # Numeric summary
	        for col in numeric_cols:
	            val = full_profile.loc[cluster_id, col]
	            report_parts.append(f"- Avg. {col.replace('_', ' ')}: `{val:,.2f}`\n")

	        # Categorical summary
	        for col in cat_cols:
	            val = full_profile.loc[cluster_id, col]
	            report_parts.append(f"- Dominant {col.replace('_', ' ')}: `{val}`\n")

	        # Blank line separating one cluster section from the next.
	        report_parts.append("\n")

	    report_md = "".join(report_parts)

	    # --- Generate Visualization ---
	    # Average 'Total_Revenue' by 'City' for each cluster. The chart is
	    # best-effort: any failure is logged and replaced by a placeholder figure
	    # so the markdown report is still returned.
	    try:
	        vis_df = profile_df.groupby(['Cluster', 'City'])['Total_Revenue'].mean().reset_index()

	        fig = px.bar(
	            vis_df,
	            x='Cluster',
	            y='Total_Revenue',
	            color='City',
	            barmode='group',
	            title='<b>Cluster Profile: Avg. Total Revenue by Dominant City</b>',
	            labels={'Total_Revenue': 'Average Total Revenue ($)', 'Cluster': 'Customer Segment'}
	        )
	    except Exception as e:
	        logging.error(f"Could not generate profile visualization: {e}")
	        fig = go.Figure().update_layout(title="Could not generate profile visualization.")

	    return report_md, fig


	def _dominant_value(values: pd.Series):
	    """Return the first mode of ``values``, or "N/A" when there is none."""
	    # mode() is computed once; it ignores NaN by default, so a group holding
	    # only missing values yields an empty result.
	    modes = values.mode()
	    return modes.iloc[0] if not modes.empty else "N/A"