# core/analyzer.py | |
# -*- coding: utf-8 -*- | |
# | |
# PROJECT: CognitiveEDA v5.6 - The QuantumLeap Intelligence Platform | |
# | |
# DESCRIPTION: The core data analysis engine and the new strategic feature | |
# engineering module. This file encapsulates all backend data | |
# profiling, statistical computation, and pre-processing logic. | |
from __future__ import annotations | |
import logging | |
from functools import cached_property | |
from typing import Any, Dict, Tuple | |
import numpy as np | |
import pandas as pd | |
import plotly.express as px | |
import plotly.graph_objects as go | |
from core.exceptions import DataProcessingError | |
# ====================================================================== | |
# NEW: STRATEGIC FEATURE ENGINEERING MODULE | |
# ====================================================================== | |
def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Transform a raw sales DataFrame into a feature-rich, model-ready dataset.

    This function executes the strategic enhancements:
      1. Resolves multicollinearity by creating a 'Total_Revenue' feature.
      2. Engineers a rich set of temporal features from the order date.
      3. Parses compound address strings into distinct geospatial features.
      4. Drops redundant or low-value original columns.

    Args:
        df (pd.DataFrame): The raw input DataFrame.

    Returns:
        pd.DataFrame: The transformed and engineered DataFrame.
    """
    logging.info("Starting strategic feature engineering...")
    df_eng = df.copy()

    # Standardize column names for robustness (e.g. 'Price Each' -> 'Price_Each').
    # NOTE: order matters — 'Unnamed: 0' becomes 'Unnamed:_0' then 'Unnamed_0'.
    df_eng.columns = df_eng.columns.str.replace(' ', '_').str.replace(':', '')

    # 1. Create Total_Revenue (resolves multicollinearity between qty and price).
    if 'Quantity_Ordered' in df_eng.columns and 'Price_Each' in df_eng.columns:
        # Ensure columns are numeric, coercing parse failures to NaN.
        df_eng['Quantity_Ordered'] = pd.to_numeric(df_eng['Quantity_Ordered'], errors='coerce')
        df_eng['Price_Each'] = pd.to_numeric(df_eng['Price_Each'], errors='coerce')
        df_eng['Total_Revenue'] = df_eng['Quantity_Ordered'] * df_eng['Price_Each']
        logging.info("Created 'Total_Revenue' feature.")

    # 2. Engineer temporal features.
    if 'Order_Date' in df_eng.columns:
        # Ensure column is in datetime format, coercing parse failures to NaT.
        df_eng['Order_Date_dt'] = pd.to_datetime(df_eng['Order_Date'], errors='coerce')
        # BUG FIX: the original passed subset=['Order_Date_dt', 'Total_Revenue']
        # unconditionally, raising KeyError when the revenue columns were absent.
        # Only drop on columns that actually exist.
        critical_cols = [c for c in ('Order_Date_dt', 'Total_Revenue') if c in df_eng.columns]
        df_eng.dropna(subset=critical_cols, inplace=True)
        df_eng['Hour'] = df_eng['Order_Date_dt'].dt.hour
        df_eng['Day_of_Week'] = df_eng['Order_Date_dt'].dt.dayofweek  # Monday=0, Sunday=6
        df_eng['Month'] = df_eng['Order_Date_dt'].dt.month
        df_eng['Is_Weekend'] = (df_eng['Day_of_Week'] >= 5).astype(int)
        logging.info("Engineered temporal features: Hour, Day_of_Week, Month, Is_Weekend.")

    # 3. Engineer geospatial features from 'Purchase_Address'.
    if 'Purchase_Address' in df_eng.columns:
        # Addresses look like 'street, city, state zip'; the middle token is the
        # city. Missing/malformed addresses yield NaN rather than raising.
        df_eng['City'] = df_eng['Purchase_Address'].str.split(',').str[1].str.strip()
        logging.info("Engineered 'City' feature from 'Purchase_Address'.")

    # 4. Drop redundant and already-transformed columns.
    # BUG FIX: the rename pass produces 'Unnamed_0' (not 'Unnamed0') from the
    # common pandas index column 'Unnamed: 0'; list both to be safe.
    columns_to_drop = [
        'Unnamed0', 'Unnamed_0', 'Order_ID', 'Sales', 'Price_Each',
        'Order_Date', 'Purchase_Address', 'Order_Date_dt'
    ]
    existing_cols_to_drop = [col for col in columns_to_drop if col in df_eng.columns]
    df_eng = df_eng.drop(columns=existing_cols_to_drop)
    logging.info(f"Dropped redundant columns: {existing_cols_to_drop}")

    logging.info(f"Feature engineering complete. New shape: {df_eng.shape}")
    return df_eng
# ====================================================================== | |
# CORE DATA ANALYZER CLASS (Unchanged) | |
# ====================================================================== | |
class DataAnalyzer: | |
""" | |
A sophisticated data analysis and profiling engine. | |
This class encapsulates all the logic for computing statistics, metadata, | |
and generating visualizations from a pandas DataFrame. It leverages | |
cached properties for efficient re-computation of metadata. | |
Args: | |
df (pd.DataFrame): The input DataFrame for analysis. | |
""" | |
def __init__(self, df: pd.DataFrame): | |
if not isinstance(df, pd.DataFrame) or df.empty: | |
raise DataProcessingError("Input must be a non-empty pandas DataFrame.") | |
self.df = df | |
logging.info(f"DataAnalyzer instantiated with DataFrame of shape: {self.df.shape}") | |
def metadata(self) -> Dict[str, Any]: | |
""" | |
Extracts and caches comprehensive metadata from the DataFrame. | |
""" | |
rows, cols = self.df.shape | |
numeric_cols = self.df.select_dtypes(include=np.number).columns.tolist() | |
categorical_cols = self.df.select_dtypes(include=['object', 'category']).columns.tolist() | |
datetime_cols = self.df.select_dtypes(include=['datetime64', 'datetimetz']).columns.tolist() | |
text_cols = [ | |
col for col in categorical_cols | |
if self.df[col].dropna().str.len().mean() > 50 | |
] | |
high_corr_pairs = [] | |
if len(numeric_cols) > 1: | |
corr_matrix = self.df[numeric_cols].corr(method='spearman').abs() | |
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)) | |
high_corr_series = upper_tri.stack() | |
high_corr_pairs = ( | |
high_corr_series[high_corr_series > 0.8] | |
.reset_index() | |
.rename(columns={'level_0': 'Feature 1', 'level_1': 'Feature 2', 0: 'Correlation'}) | |
.to_dict('records') | |
) | |
return { | |
'shape': (rows, cols), | |
'columns': self.df.columns.tolist(), | |
'numeric_cols': numeric_cols, | |
'categorical_cols': [c for c in categorical_cols if c not in text_cols], | |
'datetime_cols': datetime_cols, | |
'text_cols': text_cols, | |
'memory_usage_mb': f"{self.df.memory_usage(deep=True).sum() / 1e6:.2f}", | |
'total_missing': int(self.df.isnull().sum().sum()), | |
'data_quality_score': round((self.df.notna().sum().sum() / self.df.size) * 100, 2), | |
'high_corr_pairs': high_corr_pairs, | |
} | |
def get_profiling_reports(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: | |
""" | |
Generates detailed profiling reports for different data types. | |
""" | |
missing = self.df.isnull().sum() | |
missing_df = pd.DataFrame({ | |
'Missing Values': missing, | |
'Percentage (%)': (missing / len(self.df) * 100).round(2) | |
}).reset_index().rename(columns={'index': 'Column'}).sort_values('Missing Values', ascending=False) | |
numeric_stats_df = pd.DataFrame() | |
if self.metadata['numeric_cols']: | |
numeric_stats = self.df[self.metadata['numeric_cols']].describe(percentiles=[.01, .25, .5, .75, .99]).T | |
numeric_stats_df = numeric_stats.round(3).reset_index().rename(columns={'index': 'Feature'}) | |
cat_stats_df = pd.DataFrame() | |
if self.metadata['categorical_cols']: | |
cat_stats = self.df[self.metadata['categorical_cols']].describe(include=['object', 'category']).T | |
cat_stats_df = cat_stats.reset_index().rename(columns={'index': 'Feature'}) | |
return missing_df, numeric_stats_df, cat_stats_df | |
def get_overview_visuals(self) -> Tuple[go.Figure, go.Figure, go.Figure]: | |
""" | |
Generates a suite of overview plots for a birds-eye view of the data. | |
""" | |
meta = self.metadata | |
dtype_counts = self.df.dtypes.astype(str).value_counts() | |
fig_types = px.pie( | |
values=dtype_counts.values, names=dtype_counts.index, | |
title="<b>π Data Type Composition</b>", hole=0.4, | |
color_discrete_sequence=px.colors.qualitative.Pastel | |
) | |
missing_df = self.df.isnull().sum().reset_index(name='count').query('count > 0') | |
fig_missing = px.bar( | |
missing_df, x='index', y='count', | |
title="<b>π³οΈ Missing Values Distribution</b>", | |
labels={'index': 'Column Name', 'count': 'Number of Missing Values'} | |
).update_xaxes(categoryorder="total descending") | |
fig_corr = go.Figure() | |
if len(meta['numeric_cols']) > 1: | |
corr_matrix = self.df[meta['numeric_cols']].corr(method='spearman') | |
fig_corr = px.imshow( | |
corr_matrix, text_auto=".2f", aspect="auto", | |
title="<b>π Spearman Correlation Matrix</b>", | |
color_continuous_scale='RdBu_r', zmin=-1, zmax=1 | |
) | |
return fig_types, fig_missing, fig_corr |