# core/analyzer.py | |
# -*- coding: utf-8 -*- | |
# | |
# PROJECT: CognitiveEDA v5.6 - The QuantumLeap Intelligence Platform | |
# | |
# DESCRIPTION: The core data analysis engine and the new strategic feature | |
# engineering module. This file encapsulates all backend data | |
# profiling, statistical computation, and pre-processing logic. | |
from __future__ import annotations | |
import logging | |
from functools import cached_property | |
from typing import Any, Dict, Tuple | |
import numpy as np | |
import pandas as pd | |
import plotly.express as px | |
import plotly.graph_objects as go | |
from core.exceptions import DataProcessingError | |
# ====================================================================== | |
# NEW: STRATEGIC FEATURE ENGINEERING MODULE | |
# ====================================================================== | |
def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Transform a raw sales DataFrame into a feature-rich, model-ready dataset.

    This function executes the strategic enhancements:
      1. Resolves multicollinearity by creating a 'Total_Revenue' feature.
      2. Engineers a rich set of temporal features from the order date.
      3. Parses compound address strings into distinct geospatial features.
      4. Drops redundant or low-value original columns.

    Args:
        df (pd.DataFrame): The raw input DataFrame.

    Returns:
        pd.DataFrame: The transformed and engineered DataFrame.
    """
    logging.info("Starting strategic feature engineering...")
    df_eng = df.copy()

    # Standardize column names for robustness (e.g. 'Price Each' -> 'Price_Each').
    # NOTE: order matters — 'Unnamed: 0' becomes 'Unnamed:_0' then 'Unnamed_0'.
    df_eng.columns = df_eng.columns.str.replace(' ', '_').str.replace(':', '')

    # 1. Create Total_Revenue (resolves multicollinearity between qty and price).
    if 'Quantity_Ordered' in df_eng.columns and 'Price_Each' in df_eng.columns:
        # Ensure columns are numeric, coercing parse failures to NaN.
        df_eng['Quantity_Ordered'] = pd.to_numeric(df_eng['Quantity_Ordered'], errors='coerce')
        df_eng['Price_Each'] = pd.to_numeric(df_eng['Price_Each'], errors='coerce')
        df_eng['Total_Revenue'] = df_eng['Quantity_Ordered'] * df_eng['Price_Each']
        logging.info("Created 'Total_Revenue' feature.")

    # 2. Engineer temporal features.
    if 'Order_Date' in df_eng.columns:
        # Ensure column is in datetime format, coercing parse failures to NaT.
        df_eng['Order_Date_dt'] = pd.to_datetime(df_eng['Order_Date'], errors='coerce')
        # BUG FIX: the original passed subset=['Order_Date_dt', 'Total_Revenue']
        # unconditionally, raising KeyError when the revenue columns were absent.
        # Only drop on columns that actually exist.
        critical_cols = [c for c in ('Order_Date_dt', 'Total_Revenue') if c in df_eng.columns]
        df_eng.dropna(subset=critical_cols, inplace=True)
        df_eng['Hour'] = df_eng['Order_Date_dt'].dt.hour
        df_eng['Day_of_Week'] = df_eng['Order_Date_dt'].dt.dayofweek  # Monday=0, Sunday=6
        df_eng['Month'] = df_eng['Order_Date_dt'].dt.month
        df_eng['Is_Weekend'] = (df_eng['Day_of_Week'] >= 5).astype(int)
        logging.info("Engineered temporal features: Hour, Day_of_Week, Month, Is_Weekend.")

    # 3. Engineer geospatial features from 'Purchase_Address'.
    if 'Purchase_Address' in df_eng.columns:
        # Addresses look like 'street, city, state zip'; the middle token is the
        # city. Missing/malformed addresses yield NaN rather than raising.
        df_eng['City'] = df_eng['Purchase_Address'].str.split(',').str[1].str.strip()
        logging.info("Engineered 'City' feature from 'Purchase_Address'.")

    # 4. Drop redundant and already-transformed columns.
    # BUG FIX: the rename pass produces 'Unnamed_0' (not 'Unnamed0') from the
    # common pandas index column 'Unnamed: 0'; list both to be safe.
    columns_to_drop = [
        'Unnamed0', 'Unnamed_0', 'Order_ID', 'Sales', 'Price_Each',
        'Order_Date', 'Purchase_Address', 'Order_Date_dt'
    ]
    existing_cols_to_drop = [col for col in columns_to_drop if col in df_eng.columns]
    df_eng = df_eng.drop(columns=existing_cols_to_drop)
    logging.info(f"Dropped redundant columns: {existing_cols_to_drop}")

    logging.info(f"Feature engineering complete. New shape: {df_eng.shape}")
    return df_eng
# ====================================================================== | |
# CORE DATA ANALYZER CLASS (Unchanged) | |
# ====================================================================== | |
class DataAnalyzer: | |
""" | |
A sophisticated data analysis and profiling engine. | |
This class encapsulates all the logic for computing statistics, metadata, | |
and generating visualizations from a pandas DataFrame. It leverages | |
cached properties for efficient re-computation of metadata. | |
Args: | |
df (pd.DataFrame): The input DataFrame for analysis. | |
""" | |
def __init__(self, df: pd.DataFrame): | |
if not isinstance(df, pd.DataFrame) or df.empty: | |
raise DataProcessingError("Input must be a non-empty pandas DataFrame.") | |
self.df = df | |
logging.info(f"DataAnalyzer instantiated with DataFrame of shape: {self.df.shape}") | |
def metadata(self) -> Dict[str, Any]: | |
""" | |
Extracts and caches comprehensive metadata from the DataFrame. | |
""" | |
rows, cols = self.df.shape | |
numeric_cols = self.df.select_dtypes(include=np.number).columns.tolist() | |
categorical_cols = self.df.select_dtypes(include=['object', 'category']).columns.tolist() | |
datetime_cols = self.df.select_dtypes(include=['datetime64', 'datetimetz']).columns.tolist() | |
text_cols = [ | |
col for col in categorical_cols | |
if self.df[col].dropna().str.len().mean() > 50 | |
] | |
high_corr_pairs = [] | |
if len(numeric_cols) > 1: | |
corr_matrix = self.df[numeric_cols].corr(method='spearman').abs() | |
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)) | |
high_corr_series = upper_tri.stack() | |
high_corr_pairs = ( | |
high_corr_series[high_corr_series > 0.8] | |
.reset_index() | |
.rename(columns={'level_0': 'Feature 1', 'level_1': 'Feature 2', 0: 'Correlation'}) | |
.to_dict('records') | |
) | |
return { | |
'shape': (rows, cols), | |
'columns': self.df.columns.tolist(), | |
'numeric_cols': numeric_cols, | |
'categorical_cols': [c for c in categorical_cols if c not in text_cols], | |
'datetime_cols': datetime_cols, | |
'text_cols': text_cols, | |
'memory_usage_mb': f"{self.df.memory_usage(deep=True).sum() / 1e6:.2f}", | |
'total_missing': int(self.df.isnull().sum().sum()), | |
'data_quality_score': round((self.df.notna().sum().sum() / self.df.size) * 100, 2), | |
'high_corr_pairs': high_corr_pairs, | |
} | |
def get_profiling_reports(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: | |
""" | |
Generates detailed profiling reports for different data types. | |
""" | |
missing = self.df.isnull().sum() | |
missing_df = pd.DataFrame({ | |
'Missing Values': missing, | |
'Percentage (%)': (missing / len(self.df) * 100).round(2) | |
}).reset_index().rename(columns={'index': 'Column'}).sort_values('Missing Values', ascending=False) | |
numeric_stats_df = pd.DataFrame() | |
if self.metadata['numeric_cols']: | |
numeric_stats = self.df[self.metadata['numeric_cols']].describe(percentiles=[.01, .25, .5, .75, .99]).T | |
numeric_stats_df = numeric_stats.round(3).reset_index().rename(columns={'index': 'Feature'}) | |
cat_stats_df = pd.DataFrame() | |
if self.metadata['categorical_cols']: | |
cat_stats = self.df[self.metadata['categorical_cols']].describe(include=['object', 'category']).T | |
cat_stats_df = cat_stats.reset_index().rename(columns={'index': 'Feature'}) | |
return missing_df, numeric_stats_df, cat_stats_df | |
def get_overview_visuals(self) -> Tuple[go.Figure, go.Figure, go.Figure]: | |
""" | |
Generates a suite of overview plots for a birds-eye view of the data. | |
""" | |
meta = self.metadata | |
dtype_counts = self.df.dtypes.astype(str).value_counts() | |
fig_types = px.pie( | |
values=dtype_counts.values, names=dtype_counts.index, | |
title="<b>π Data Type Composition</b>", hole=0.4, | |
color_discrete_sequence=px.colors.qualitative.Pastel | |
) | |
missing_df = self.df.isnull().sum().reset_index(name='count').query('count > 0') | |
fig_missing = px.bar( | |
missing_df, x='index', y='count', | |
title="<b>π³οΈ Missing Values Distribution</b>", | |
labels={'index': 'Column Name', 'count': 'Number of Missing Values'} | |
).update_xaxes(categoryorder="total descending") | |
fig_corr = go.Figure() | |
if len(meta['numeric_cols']) > 1: | |
corr_matrix = self.df[meta['numeric_cols']].corr(method='spearman') | |
fig_corr = px.imshow( | |
corr_matrix, text_auto=".2f", aspect="auto", | |
title="<b>π Spearman Correlation Matrix</b>", | |
color_continuous_scale='RdBu_r', zmin=-1, zmax=1 | |
) | |
return fig_types, fig_missing, fig_corr |