# core/analyzer.py
# -*- coding: utf-8 -*-
#
# PROJECT: CognitiveEDA v5.6 - The QuantumLeap Intelligence Platform
#
# DESCRIPTION: The core data analysis engine and the new strategic feature
# engineering module. This file encapsulates all backend data
# profiling, statistical computation, and pre-processing logic.
from __future__ import annotations
import logging
from functools import cached_property
from typing import Any, Dict, Tuple
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from core.exceptions import DataProcessingError
# ======================================================================
# NEW: STRATEGIC FEATURE ENGINEERING MODULE
# ======================================================================
def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
"""
Transforms a raw sales DataFrame into a feature-rich, model-ready dataset.
This function executes the strategic enhancements:
1. Resolves multicollinearity by creating a 'Total_Revenue' feature.
2. Parses compound address strings into distinct geospatial features.
3. Engineers a rich set of temporal features from the order date.
4. Drops redundant or low-value original columns.
Args:
df (pd.DataFrame): The raw input DataFrame.
Returns:
pd.DataFrame: The transformed and engineered DataFrame.
"""
logging.info("Starting strategic feature engineering...")
df_eng = df.copy()
# Standardize column names for robustness (e.g., 'Price Each' -> 'Price_Each')
df_eng.columns = df_eng.columns.str.replace(' ', '_').str.replace(':', '')
# 1. Create Total_Revenue (Resolves Multicollinearity)
if 'Quantity_Ordered' in df_eng.columns and 'Price_Each' in df_eng.columns:
# Ensure columns are numeric, coercing errors to NaN
df_eng['Quantity_Ordered'] = pd.to_numeric(df_eng['Quantity_Ordered'], errors='coerce')
df_eng['Price_Each'] = pd.to_numeric(df_eng['Price_Each'], errors='coerce')
df_eng['Total_Revenue'] = df_eng['Quantity_Ordered'] * df_eng['Price_Each']
logging.info("Created 'Total_Revenue' feature.")
# 2. Engineer Temporal Features
if 'Order_Date' in df_eng.columns:
# Ensure column is in datetime format, coercing errors
df_eng['Order_Date_dt'] = pd.to_datetime(df_eng['Order_Date'], errors='coerce')
        # Drop rows where the date conversion failed; include 'Total_Revenue'
        # in the subset only if it was actually created above, otherwise
        # dropna would raise a KeyError on datasets without price/quantity.
        _subset = [c for c in ('Order_Date_dt', 'Total_Revenue') if c in df_eng.columns]
        df_eng.dropna(subset=_subset, inplace=True)
df_eng['Hour'] = df_eng['Order_Date_dt'].dt.hour
df_eng['Day_of_Week'] = df_eng['Order_Date_dt'].dt.dayofweek # Monday=0, Sunday=6
df_eng['Month'] = df_eng['Order_Date_dt'].dt.month
df_eng['Is_Weekend'] = (df_eng['Day_of_Week'] >= 5).astype(int)
logging.info("Engineered temporal features: Hour, Day_of_Week, Month, Is_Weekend.")
# 3. Engineer Geospatial Features from 'Purchase_Address'
if 'Purchase_Address' in df_eng.columns:
# Use a robust split and strip to extract city
df_eng['City'] = df_eng['Purchase_Address'].str.split(',').str[1].str.strip()
logging.info("Engineered 'City' feature from 'Purchase_Address'.")
    # 4. Drop Redundant & Transformed Columns
    # Note: the rename step above maps a raw 'Unnamed: 0' index column to
    # 'Unnamed_0', so that is the spelling to drop here.
    columns_to_drop = [
        'Unnamed_0', 'Order_ID', 'Sales', 'Price_Each',
        'Order_Date', 'Purchase_Address', 'Order_Date_dt'
    ]
existing_cols_to_drop = [col for col in columns_to_drop if col in df_eng.columns]
df_eng = df_eng.drop(columns=existing_cols_to_drop)
logging.info(f"Dropped redundant columns: {existing_cols_to_drop}")
logging.info(f"Feature engineering complete. New shape: {df_eng.shape}")
return df_eng
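# Example (illustrative sketch): the helper below shows how engineer_features
# might be exercised on a tiny, hypothetical raw sales frame. The raw column
# names and sample rows are assumptions chosen to match what the function
# looks for after standardization; they are not a documented schema.
def _example_engineer_features() -> pd.DataFrame:
    """Minimal, self-contained demo of engineer_features (illustrative only)."""
    raw = pd.DataFrame({
        'Order ID': [1001, 1002],                       # hypothetical IDs
        'Quantity Ordered': ['2', '1'],                 # strings on purpose: coerced to numeric
        'Price Each': ['11.99', '149.99'],
        'Order Date': ['04/19/19 08:46', '04/07/19 22:30'],
        'Purchase Address': [
            '917 1st St, Dallas, TX 75001',
            '682 Chestnut St, Boston, MA 02215',
        ],
    })
    engineered = engineer_features(raw)
    # Expected output columns: Quantity_Ordered, Total_Revenue, Hour,
    # Day_of_Week, Month, Is_Weekend, City (the ID, raw date, price, and
    # address columns are dropped by step 4).
    return engineered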
# ======================================================================
# CORE DATA ANALYZER CLASS (Unchanged)
# ======================================================================
class DataAnalyzer:
"""
A sophisticated data analysis and profiling engine.
This class encapsulates all the logic for computing statistics, metadata,
and generating visualizations from a pandas DataFrame. It leverages
cached properties for efficient re-computation of metadata.
Args:
df (pd.DataFrame): The input DataFrame for analysis.
"""
def __init__(self, df: pd.DataFrame):
if not isinstance(df, pd.DataFrame) or df.empty:
raise DataProcessingError("Input must be a non-empty pandas DataFrame.")
self.df = df
logging.info(f"DataAnalyzer instantiated with DataFrame of shape: {self.df.shape}")
@cached_property
def metadata(self) -> Dict[str, Any]:
"""
Extracts and caches comprehensive metadata from the DataFrame.
"""
rows, cols = self.df.shape
numeric_cols = self.df.select_dtypes(include=np.number).columns.tolist()
categorical_cols = self.df.select_dtypes(include=['object', 'category']).columns.tolist()
datetime_cols = self.df.select_dtypes(include=['datetime64', 'datetimetz']).columns.tolist()
text_cols = [
col for col in categorical_cols
if self.df[col].dropna().str.len().mean() > 50
]
high_corr_pairs = []
if len(numeric_cols) > 1:
corr_matrix = self.df[numeric_cols].corr(method='spearman').abs()
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
high_corr_series = upper_tri.stack()
high_corr_pairs = (
high_corr_series[high_corr_series > 0.8]
.reset_index()
.rename(columns={'level_0': 'Feature 1', 'level_1': 'Feature 2', 0: 'Correlation'})
.to_dict('records')
)
return {
'shape': (rows, cols),
'columns': self.df.columns.tolist(),
'numeric_cols': numeric_cols,
'categorical_cols': [c for c in categorical_cols if c not in text_cols],
'datetime_cols': datetime_cols,
'text_cols': text_cols,
'memory_usage_mb': f"{self.df.memory_usage(deep=True).sum() / 1e6:.2f}",
'total_missing': int(self.df.isnull().sum().sum()),
'data_quality_score': round((self.df.notna().sum().sum() / self.df.size) * 100, 2),
'high_corr_pairs': high_corr_pairs,
}
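    # Usage note (illustrative): because `metadata` is a functools.cached_property,
    # the dictionary is computed on first access and reused afterwards, e.g.:
    #   analyzer = DataAnalyzer(df)
    #   analyzer.metadata['numeric_cols']   # computed and cached here
    #   analyzer.metadata['shape']          # served from the instance cache
    # Mutating self.df after the first access will NOT refresh the cached result.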
def get_profiling_reports(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
"""
Generates detailed profiling reports for different data types.
"""
missing = self.df.isnull().sum()
missing_df = pd.DataFrame({
'Missing Values': missing,
'Percentage (%)': (missing / len(self.df) * 100).round(2)
}).reset_index().rename(columns={'index': 'Column'}).sort_values('Missing Values', ascending=False)
numeric_stats_df = pd.DataFrame()
if self.metadata['numeric_cols']:
numeric_stats = self.df[self.metadata['numeric_cols']].describe(percentiles=[.01, .25, .5, .75, .99]).T
numeric_stats_df = numeric_stats.round(3).reset_index().rename(columns={'index': 'Feature'})
cat_stats_df = pd.DataFrame()
if self.metadata['categorical_cols']:
cat_stats = self.df[self.metadata['categorical_cols']].describe(include=['object', 'category']).T
cat_stats_df = cat_stats.reset_index().rename(columns={'index': 'Feature'})
return missing_df, numeric_stats_df, cat_stats_df
def get_overview_visuals(self) -> Tuple[go.Figure, go.Figure, go.Figure]:
"""
        Generates a suite of overview plots for a bird's-eye view of the data.
"""
meta = self.metadata
dtype_counts = self.df.dtypes.astype(str).value_counts()
fig_types = px.pie(
values=dtype_counts.values, names=dtype_counts.index,
title="<b>π Data Type Composition</b>", hole=0.4,
color_discrete_sequence=px.colors.qualitative.Pastel
)
missing_df = self.df.isnull().sum().reset_index(name='count').query('count > 0')
fig_missing = px.bar(
missing_df, x='index', y='count',
title="<b>π³οΈ Missing Values Distribution</b>",
labels={'index': 'Column Name', 'count': 'Number of Missing Values'}
).update_xaxes(categoryorder="total descending")
fig_corr = go.Figure()
if len(meta['numeric_cols']) > 1:
corr_matrix = self.df[meta['numeric_cols']].corr(method='spearman')
fig_corr = px.imshow(
corr_matrix, text_auto=".2f", aspect="auto",
title="<b>π Spearman Correlation Matrix</b>",
color_continuous_scale='RdBu_r', zmin=-1, zmax=1
)
        return fig_types, fig_missing, fig_corr
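# ======================================================================
# SMOKE TEST (Illustrative)
# ======================================================================
# A minimal sketch wiring the two halves of this module together on
# synthetic data. The sample rows are assumptions for demonstration only;
# run with `python -m core.analyzer` if the package layout allows it.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    sample = pd.DataFrame({
        'Quantity Ordered': ['1', '3', '2'],
        'Price Each': ['600.00', '11.95', '99.99'],
        'Order Date': ['01/22/19 21:25', '01/28/19 14:15', '01/17/19 13:33'],
        'Purchase Address': [
            '944 Walnut St, Boston, MA 02215',
            '185 Maple St, Portland, OR 97035',
            '538 Adams St, San Francisco, CA 94016',
        ],
    })
    # Engineer features, then profile the result end to end.
    analyzer = DataAnalyzer(engineer_features(sample))
    print(analyzer.metadata['shape'], analyzer.metadata['data_quality_score'])
    missing_report, numeric_report, categorical_report = analyzer.get_profiling_reports()
    print(numeric_report.head())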