Spaces:
Sleeping
Sleeping
Update core/analyzer.py
Browse files- core/analyzer.py +76 -26
core/analyzer.py
CHANGED
@@ -2,16 +2,16 @@
|
|
2 |
|
3 |
# -*- coding: utf-8 -*-
|
4 |
#
|
5 |
-
# PROJECT: CognitiveEDA v5.
|
6 |
#
|
7 |
-
# DESCRIPTION: The core data analysis engine
|
8 |
-
#
|
9 |
-
#
|
10 |
|
11 |
from __future__ import annotations
|
12 |
import logging
|
13 |
-
from typing import Any, Dict, List, Tuple
|
14 |
from functools import cached_property
|
|
|
15 |
|
16 |
import numpy as np
|
17 |
import pandas as pd
|
@@ -20,6 +20,75 @@ import plotly.graph_objects as go
|
|
20 |
|
21 |
from core.exceptions import DataProcessingError
|
22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
class DataAnalyzer:
|
24 |
"""
|
25 |
A sophisticated data analysis and profiling engine.
|
@@ -41,20 +110,12 @@ class DataAnalyzer:
|
|
41 |
def metadata(self) -> Dict[str, Any]:
|
42 |
"""
|
43 |
Extracts and caches comprehensive metadata from the DataFrame.
|
44 |
-
|
45 |
-
This property computes column types, data shape, memory usage, missing
|
46 |
-
value statistics, and high-correlation pairs. The use of
|
47 |
-
@cached_property ensures this expensive operation runs only once.
|
48 |
-
|
49 |
-
Returns:
|
50 |
-
A dictionary containing detailed dataset metadata.
|
51 |
"""
|
52 |
rows, cols = self.df.shape
|
53 |
numeric_cols = self.df.select_dtypes(include=np.number).columns.tolist()
|
54 |
categorical_cols = self.df.select_dtypes(include=['object', 'category']).columns.tolist()
|
55 |
datetime_cols = self.df.select_dtypes(include=['datetime64', 'datetimetz']).columns.tolist()
|
56 |
|
57 |
-
# Identify potential long-form text columns for specialized analysis
|
58 |
text_cols = [
|
59 |
col for col in categorical_cols
|
60 |
if self.df[col].dropna().str.len().mean() > 50
|
@@ -62,7 +123,7 @@ class DataAnalyzer:
|
|
62 |
|
63 |
high_corr_pairs = []
|
64 |
if len(numeric_cols) > 1:
|
65 |
-
corr_matrix = self.df[numeric_cols].corr().abs()
|
66 |
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
|
67 |
high_corr_series = upper_tri.stack()
|
68 |
high_corr_pairs = (
|
@@ -88,25 +149,18 @@ class DataAnalyzer:
|
|
88 |
def get_profiling_reports(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
|
89 |
"""
|
90 |
Generates detailed profiling reports for different data types.
|
91 |
-
|
92 |
-
Returns:
|
93 |
-
A tuple containing DataFrames for missing values, numeric stats,
|
94 |
-
and categorical stats.
|
95 |
"""
|
96 |
-
# Missing Value Report
|
97 |
missing = self.df.isnull().sum()
|
98 |
missing_df = pd.DataFrame({
|
99 |
'Missing Values': missing,
|
100 |
'Percentage (%)': (missing / len(self.df) * 100).round(2)
|
101 |
}).reset_index().rename(columns={'index': 'Column'}).sort_values('Missing Values', ascending=False)
|
102 |
|
103 |
-
# Numeric Stats Report
|
104 |
numeric_stats_df = pd.DataFrame()
|
105 |
if self.metadata['numeric_cols']:
|
106 |
numeric_stats = self.df[self.metadata['numeric_cols']].describe(percentiles=[.01, .25, .5, .75, .99]).T
|
107 |
numeric_stats_df = numeric_stats.round(3).reset_index().rename(columns={'index': 'Feature'})
|
108 |
|
109 |
-
# Categorical Stats Report
|
110 |
cat_stats_df = pd.DataFrame()
|
111 |
if self.metadata['categorical_cols']:
|
112 |
cat_stats = self.df[self.metadata['categorical_cols']].describe(include=['object', 'category']).T
|
@@ -117,10 +171,6 @@ class DataAnalyzer:
|
|
117 |
def get_overview_visuals(self) -> Tuple[go.Figure, go.Figure, go.Figure]:
|
118 |
"""
|
119 |
Generates a suite of overview plots for a birds-eye view of the data.
|
120 |
-
|
121 |
-
Returns:
|
122 |
-
A tuple of Plotly figures: Data Type Composition, Missing Values,
|
123 |
-
and Correlation Matrix.
|
124 |
"""
|
125 |
meta = self.metadata
|
126 |
dtype_counts = self.df.dtypes.astype(str).value_counts()
|
@@ -139,7 +189,7 @@ class DataAnalyzer:
|
|
139 |
|
140 |
fig_corr = go.Figure()
|
141 |
if len(meta['numeric_cols']) > 1:
|
142 |
-
corr_matrix = self.df[meta['numeric_cols']].corr(method='spearman')
|
143 |
fig_corr = px.imshow(
|
144 |
corr_matrix, text_auto=".2f", aspect="auto",
|
145 |
title="<b>π Spearman Correlation Matrix</b>",
|
|
|
2 |
|
3 |
# -*- coding: utf-8 -*-
|
4 |
#
|
5 |
+
# PROJECT: CognitiveEDA v5.6 - The QuantumLeap Intelligence Platform
|
6 |
#
|
7 |
+
# DESCRIPTION: The core data analysis engine and the new strategic feature
|
8 |
+
# engineering module. This file encapsulates all backend data
|
9 |
+
# profiling, statistical computation, and pre-processing logic.
|
10 |
|
11 |
from __future__ import annotations
|
12 |
import logging
|
|
|
13 |
from functools import cached_property
|
14 |
+
from typing import Any, Dict, Tuple
|
15 |
|
16 |
import numpy as np
|
17 |
import pandas as pd
|
|
|
20 |
|
21 |
from core.exceptions import DataProcessingError
|
22 |
|
23 |
+
# ======================================================================
|
24 |
+
# NEW: STRATEGIC FEATURE ENGINEERING MODULE
|
25 |
+
# ======================================================================
|
26 |
+
def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Transform a raw sales DataFrame into a feature-rich, model-ready dataset.

    The input is never mutated; all work happens on a copy. Every step is
    optional and only runs when the columns it needs are present:

    1. Column names are standardized: spaces become underscores and colons
       are removed (e.g. 'Price Each' -> 'Price_Each'). Non-string labels
       are left untouched.
    2. 'Total_Revenue' is created as Quantity_Ordered * Price_Each, after
       coercing both columns to numeric (unparseable values become NaN).
    3. If 'Order_Date' exists it is parsed to datetime (unparseable values
       become NaT); rows with a missing parsed date, or a missing
       'Total_Revenue' when that column exists, are dropped. 'Hour',
       'Day_of_Week' (Monday=0, Sunday=6), 'Month' and 'Is_Weekend' (0/1)
       are then derived.
    4. 'City' is taken as the second comma-separated field of
       'Purchase_Address', stripped of surrounding whitespace.
    5. Redundant or already-transformed source columns are dropped.

    Args:
        df (pd.DataFrame): The raw input DataFrame.

    Returns:
        pd.DataFrame: The transformed and engineered DataFrame.
    """
    logging.info("Starting strategic feature engineering...")
    df_eng = df.copy()

    # Standardize column names for robustness (e.g., 'Price Each' -> 'Price_Each').
    # Only string labels are rewritten, so frames with integer or other
    # non-string column labels no longer crash on the `.str` accessor.
    df_eng.rename(
        columns=lambda col: (
            col.replace(' ', '_').replace(':', '') if isinstance(col, str) else col
        ),
        inplace=True,
    )

    # 1. Create Total_Revenue (resolves multicollinearity)
    if 'Quantity_Ordered' in df_eng.columns and 'Price_Each' in df_eng.columns:
        # Ensure columns are numeric, coercing errors to NaN
        df_eng['Quantity_Ordered'] = pd.to_numeric(df_eng['Quantity_Ordered'], errors='coerce')
        df_eng['Price_Each'] = pd.to_numeric(df_eng['Price_Each'], errors='coerce')
        df_eng['Total_Revenue'] = df_eng['Quantity_Ordered'] * df_eng['Price_Each']
        logging.info("Created 'Total_Revenue' feature.")

    # 2. Engineer temporal features
    if 'Order_Date' in df_eng.columns:
        # Ensure column is in datetime format, coercing errors to NaT
        df_eng['Order_Date_dt'] = pd.to_datetime(df_eng['Order_Date'], errors='coerce')

        # Drop rows where date conversion or the revenue calculation failed.
        # 'Total_Revenue' only exists when step 1 ran (or the raw data already
        # had it), so restrict the subset to columns that are actually present
        # instead of raising KeyError.
        required_cols = [
            col for col in ('Order_Date_dt', 'Total_Revenue') if col in df_eng.columns
        ]
        df_eng.dropna(subset=required_cols, inplace=True)

        df_eng['Hour'] = df_eng['Order_Date_dt'].dt.hour
        df_eng['Day_of_Week'] = df_eng['Order_Date_dt'].dt.dayofweek  # Monday=0, Sunday=6
        df_eng['Month'] = df_eng['Order_Date_dt'].dt.month
        df_eng['Is_Weekend'] = (df_eng['Day_of_Week'] >= 5).astype(int)
        logging.info("Engineered temporal features: Hour, Day_of_Week, Month, Is_Weekend.")

    # 3. Engineer geospatial features from 'Purchase_Address'
    if 'Purchase_Address' in df_eng.columns:
        # Second comma-separated field is used as the city; addresses without
        # a second field yield NaN.
        df_eng['City'] = df_eng['Purchase_Address'].str.split(',').str[1].str.strip()
        logging.info("Engineered 'City' feature from 'Purchase_Address'.")

    # 4. Drop redundant & transformed columns.
    # 'Unnamed: 0' is normalized to 'Unnamed_0' by the renaming above;
    # 'Unnamed0' is kept for inputs whose label had no space ('Unnamed:0').
    columns_to_drop = [
        'Unnamed_0', 'Unnamed0', 'Order_ID', 'Sales', 'Price_Each',
        'Order_Date', 'Purchase_Address', 'Order_Date_dt',
    ]
    existing_cols_to_drop = [col for col in columns_to_drop if col in df_eng.columns]
    df_eng = df_eng.drop(columns=existing_cols_to_drop)
    logging.info("Dropped redundant columns: %s", existing_cols_to_drop)

    logging.info("Feature engineering complete. New shape: %s", df_eng.shape)
    return df_eng
|
87 |
+
|
88 |
+
|
89 |
+
# ======================================================================
|
90 |
+
# CORE DATA ANALYZER CLASS (metadata correlations now use Spearman)
|
91 |
+
# ======================================================================
|
92 |
class DataAnalyzer:
|
93 |
"""
|
94 |
A sophisticated data analysis and profiling engine.
|
|
|
110 |
def metadata(self) -> Dict[str, Any]:
|
111 |
"""
|
112 |
Extracts and caches comprehensive metadata from the DataFrame.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
113 |
"""
|
114 |
rows, cols = self.df.shape
|
115 |
numeric_cols = self.df.select_dtypes(include=np.number).columns.tolist()
|
116 |
categorical_cols = self.df.select_dtypes(include=['object', 'category']).columns.tolist()
|
117 |
datetime_cols = self.df.select_dtypes(include=['datetime64', 'datetimetz']).columns.tolist()
|
118 |
|
|
|
119 |
text_cols = [
|
120 |
col for col in categorical_cols
|
121 |
if self.df[col].dropna().str.len().mean() > 50
|
|
|
123 |
|
124 |
high_corr_pairs = []
|
125 |
if len(numeric_cols) > 1:
|
126 |
+
corr_matrix = self.df[numeric_cols].corr(method='spearman').abs()
|
127 |
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
|
128 |
high_corr_series = upper_tri.stack()
|
129 |
high_corr_pairs = (
|
|
|
149 |
def get_profiling_reports(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
|
150 |
"""
|
151 |
Generates detailed profiling reports for different data types.
|
|
|
|
|
|
|
|
|
152 |
"""
|
|
|
153 |
missing = self.df.isnull().sum()
|
154 |
missing_df = pd.DataFrame({
|
155 |
'Missing Values': missing,
|
156 |
'Percentage (%)': (missing / len(self.df) * 100).round(2)
|
157 |
}).reset_index().rename(columns={'index': 'Column'}).sort_values('Missing Values', ascending=False)
|
158 |
|
|
|
159 |
numeric_stats_df = pd.DataFrame()
|
160 |
if self.metadata['numeric_cols']:
|
161 |
numeric_stats = self.df[self.metadata['numeric_cols']].describe(percentiles=[.01, .25, .5, .75, .99]).T
|
162 |
numeric_stats_df = numeric_stats.round(3).reset_index().rename(columns={'index': 'Feature'})
|
163 |
|
|
|
164 |
cat_stats_df = pd.DataFrame()
|
165 |
if self.metadata['categorical_cols']:
|
166 |
cat_stats = self.df[self.metadata['categorical_cols']].describe(include=['object', 'category']).T
|
|
|
171 |
def get_overview_visuals(self) -> Tuple[go.Figure, go.Figure, go.Figure]:
|
172 |
"""
|
173 |
Generates a suite of overview plots for a birds-eye view of the data.
|
|
|
|
|
|
|
|
|
174 |
"""
|
175 |
meta = self.metadata
|
176 |
dtype_counts = self.df.dtypes.astype(str).value_counts()
|
|
|
189 |
|
190 |
fig_corr = go.Figure()
|
191 |
if len(meta['numeric_cols']) > 1:
|
192 |
+
corr_matrix = self.df[meta['numeric_cols']].corr(method='spearman')
|
193 |
fig_corr = px.imshow(
|
194 |
corr_matrix, text_auto=".2f", aspect="auto",
|
195 |
title="<b>π Spearman Correlation Matrix</b>",
|