mgbam committed on
Commit
c039984
·
verified ·
1 Parent(s): 132b0cb

Update core/analyzer.py

Browse files
Files changed (1) hide show
  1. core/analyzer.py +76 -26
core/analyzer.py CHANGED
@@ -2,16 +2,16 @@
2
 
3
  # -*- coding: utf-8 -*-
4
  #
5
- # PROJECT: CognitiveEDA v5.0 - The QuantumLeap Intelligence Platform
6
  #
7
- # DESCRIPTION: The core data analysis engine. This module is responsible for all
8
- # backend data profiling and statistical computation. It is fully
9
- # decoupled from any UI framework.
10
 
11
  from __future__ import annotations
12
  import logging
13
- from typing import Any, Dict, List, Tuple
14
  from functools import cached_property
 
15
 
16
  import numpy as np
17
  import pandas as pd
@@ -20,6 +20,75 @@ import plotly.graph_objects as go
20
 
21
  from core.exceptions import DataProcessingError
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  class DataAnalyzer:
24
  """
25
  A sophisticated data analysis and profiling engine.
@@ -41,20 +110,12 @@ class DataAnalyzer:
41
  def metadata(self) -> Dict[str, Any]:
42
  """
43
  Extracts and caches comprehensive metadata from the DataFrame.
44
-
45
- This property computes column types, data shape, memory usage, missing
46
- value statistics, and high-correlation pairs. The use of
47
- @cached_property ensures this expensive operation runs only once.
48
-
49
- Returns:
50
- A dictionary containing detailed dataset metadata.
51
  """
52
  rows, cols = self.df.shape
53
  numeric_cols = self.df.select_dtypes(include=np.number).columns.tolist()
54
  categorical_cols = self.df.select_dtypes(include=['object', 'category']).columns.tolist()
55
  datetime_cols = self.df.select_dtypes(include=['datetime64', 'datetimetz']).columns.tolist()
56
 
57
- # Identify potential long-form text columns for specialized analysis
58
  text_cols = [
59
  col for col in categorical_cols
60
  if self.df[col].dropna().str.len().mean() > 50
@@ -62,7 +123,7 @@ class DataAnalyzer:
62
 
63
  high_corr_pairs = []
64
  if len(numeric_cols) > 1:
65
- corr_matrix = self.df[numeric_cols].corr().abs()
66
  upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
67
  high_corr_series = upper_tri.stack()
68
  high_corr_pairs = (
@@ -88,25 +149,18 @@ class DataAnalyzer:
88
  def get_profiling_reports(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
89
  """
90
  Generates detailed profiling reports for different data types.
91
-
92
- Returns:
93
- A tuple containing DataFrames for missing values, numeric stats,
94
- and categorical stats.
95
  """
96
- # Missing Value Report
97
  missing = self.df.isnull().sum()
98
  missing_df = pd.DataFrame({
99
  'Missing Values': missing,
100
  'Percentage (%)': (missing / len(self.df) * 100).round(2)
101
  }).reset_index().rename(columns={'index': 'Column'}).sort_values('Missing Values', ascending=False)
102
 
103
- # Numeric Stats Report
104
  numeric_stats_df = pd.DataFrame()
105
  if self.metadata['numeric_cols']:
106
  numeric_stats = self.df[self.metadata['numeric_cols']].describe(percentiles=[.01, .25, .5, .75, .99]).T
107
  numeric_stats_df = numeric_stats.round(3).reset_index().rename(columns={'index': 'Feature'})
108
 
109
- # Categorical Stats Report
110
  cat_stats_df = pd.DataFrame()
111
  if self.metadata['categorical_cols']:
112
  cat_stats = self.df[self.metadata['categorical_cols']].describe(include=['object', 'category']).T
@@ -117,10 +171,6 @@ class DataAnalyzer:
117
  def get_overview_visuals(self) -> Tuple[go.Figure, go.Figure, go.Figure]:
118
  """
119
  Generates a suite of overview plots for a birds-eye view of the data.
120
-
121
- Returns:
122
- A tuple of Plotly figures: Data Type Composition, Missing Values,
123
- and Correlation Matrix.
124
  """
125
  meta = self.metadata
126
  dtype_counts = self.df.dtypes.astype(str).value_counts()
@@ -139,7 +189,7 @@ class DataAnalyzer:
139
 
140
  fig_corr = go.Figure()
141
  if len(meta['numeric_cols']) > 1:
142
- corr_matrix = self.df[meta['numeric_cols']].corr(method='spearman') # More robust to outliers
143
  fig_corr = px.imshow(
144
  corr_matrix, text_auto=".2f", aspect="auto",
145
  title="<b>πŸ”— Spearman Correlation Matrix</b>",
 
2
 
3
  # -*- coding: utf-8 -*-
4
  #
5
+ # PROJECT: CognitiveEDA v5.6 - The QuantumLeap Intelligence Platform
6
  #
7
+ # DESCRIPTION: The core data analysis engine and the new strategic feature
8
+ # engineering module. This file encapsulates all backend data
9
+ # profiling, statistical computation, and pre-processing logic.
10
 
11
  from __future__ import annotations
12
  import logging
 
13
  from functools import cached_property
14
+ from typing import Any, Dict, Tuple
15
 
16
  import numpy as np
17
  import pandas as pd
 
20
 
21
  from core.exceptions import DataProcessingError
22
 
23
# ======================================================================
# STRATEGIC FEATURE ENGINEERING MODULE
# ======================================================================
def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Transforms a raw sales DataFrame into a feature-rich, model-ready dataset.

    Strategic enhancements applied:
    1. Resolves multicollinearity by creating a 'Total_Revenue' feature.
    2. Parses compound address strings into distinct geospatial features.
    3. Engineers a rich set of temporal features from the order date.
    4. Drops redundant or low-value original columns.

    Args:
        df (pd.DataFrame): The raw input DataFrame.

    Returns:
        pd.DataFrame: The transformed and engineered DataFrame.
    """
    logging.info("Starting strategic feature engineering...")
    df_eng = df.copy()

    # Standardize column names for robustness. Note the order: spaces become
    # underscores FIRST, then colons are removed, so pandas' index-residue
    # column 'Unnamed: 0' normalizes to 'Unnamed_0'.
    df_eng.columns = df_eng.columns.str.replace(' ', '_').str.replace(':', '')

    # 1. Create Total_Revenue (resolves multicollinearity between quantity
    #    and unit price).
    if 'Quantity_Ordered' in df_eng.columns and 'Price_Each' in df_eng.columns:
        # Coerce to numeric so malformed rows become NaN instead of raising.
        df_eng['Quantity_Ordered'] = pd.to_numeric(df_eng['Quantity_Ordered'], errors='coerce')
        df_eng['Price_Each'] = pd.to_numeric(df_eng['Price_Each'], errors='coerce')
        df_eng['Total_Revenue'] = df_eng['Quantity_Ordered'] * df_eng['Price_Each']
        logging.info("Created 'Total_Revenue' feature.")

    # 2. Engineer Temporal Features
    if 'Order_Date' in df_eng.columns:
        # Parse to datetime, coercing unparseable values to NaT.
        df_eng['Order_Date_dt'] = pd.to_datetime(df_eng['Order_Date'], errors='coerce')

        # Drop rows where date parsing or revenue computation failed.
        # BUGFIX: only reference columns that actually exist — the original
        # unconditionally listed 'Total_Revenue' in the subset, raising
        # KeyError whenever the quantity/price columns were absent.
        subset = [c for c in ('Order_Date_dt', 'Total_Revenue') if c in df_eng.columns]
        df_eng.dropna(subset=subset, inplace=True)

        df_eng['Hour'] = df_eng['Order_Date_dt'].dt.hour
        df_eng['Day_of_Week'] = df_eng['Order_Date_dt'].dt.dayofweek  # Monday=0, Sunday=6
        df_eng['Month'] = df_eng['Order_Date_dt'].dt.month
        df_eng['Is_Weekend'] = (df_eng['Day_of_Week'] >= 5).astype(int)
        logging.info("Engineered temporal features: Hour, Day_of_Week, Month, Is_Weekend.")

    # 3. Engineer Geospatial Features from 'Purchase_Address'
    if 'Purchase_Address' in df_eng.columns:
        # Addresses are assumed shaped like 'street, city, state zip';
        # the second comma-separated token is the city. Malformed rows
        # simply yield NaN rather than raising.
        df_eng['City'] = df_eng['Purchase_Address'].str.split(',').str[1].str.strip()
        logging.info("Engineered 'City' feature from 'Purchase_Address'.")

    # 4. Drop Redundant & Transformed Columns
    # BUGFIX: 'Unnamed: 0' normalizes to 'Unnamed_0' (see rename above), so
    # the original entry 'Unnamed0' never matched; both spellings are kept
    # for backward compatibility.
    columns_to_drop = [
        'Unnamed0', 'Unnamed_0', 'Order_ID', 'Sales', 'Price_Each',
        'Order_Date', 'Purchase_Address', 'Order_Date_dt'
    ]
    existing_cols_to_drop = [col for col in columns_to_drop if col in df_eng.columns]
    df_eng = df_eng.drop(columns=existing_cols_to_drop)
    logging.info(f"Dropped redundant columns: {existing_cols_to_drop}")

    logging.info(f"Feature engineering complete. New shape: {df_eng.shape}")
    return df_eng
87
+
88
+
89
+ # ======================================================================
90
+ # CORE DATA ANALYZER CLASS (Unchanged)
91
+ # ======================================================================
92
  class DataAnalyzer:
93
  """
94
  A sophisticated data analysis and profiling engine.
 
110
  def metadata(self) -> Dict[str, Any]:
111
  """
112
  Extracts and caches comprehensive metadata from the DataFrame.
 
 
 
 
 
 
 
113
  """
114
  rows, cols = self.df.shape
115
  numeric_cols = self.df.select_dtypes(include=np.number).columns.tolist()
116
  categorical_cols = self.df.select_dtypes(include=['object', 'category']).columns.tolist()
117
  datetime_cols = self.df.select_dtypes(include=['datetime64', 'datetimetz']).columns.tolist()
118
 
 
119
  text_cols = [
120
  col for col in categorical_cols
121
  if self.df[col].dropna().str.len().mean() > 50
 
123
 
124
  high_corr_pairs = []
125
  if len(numeric_cols) > 1:
126
+ corr_matrix = self.df[numeric_cols].corr(method='spearman').abs()
127
  upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
128
  high_corr_series = upper_tri.stack()
129
  high_corr_pairs = (
 
149
  def get_profiling_reports(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
150
  """
151
  Generates detailed profiling reports for different data types.
 
 
 
 
152
  """
 
153
  missing = self.df.isnull().sum()
154
  missing_df = pd.DataFrame({
155
  'Missing Values': missing,
156
  'Percentage (%)': (missing / len(self.df) * 100).round(2)
157
  }).reset_index().rename(columns={'index': 'Column'}).sort_values('Missing Values', ascending=False)
158
 
 
159
  numeric_stats_df = pd.DataFrame()
160
  if self.metadata['numeric_cols']:
161
  numeric_stats = self.df[self.metadata['numeric_cols']].describe(percentiles=[.01, .25, .5, .75, .99]).T
162
  numeric_stats_df = numeric_stats.round(3).reset_index().rename(columns={'index': 'Feature'})
163
 
 
164
  cat_stats_df = pd.DataFrame()
165
  if self.metadata['categorical_cols']:
166
  cat_stats = self.df[self.metadata['categorical_cols']].describe(include=['object', 'category']).T
 
171
  def get_overview_visuals(self) -> Tuple[go.Figure, go.Figure, go.Figure]:
172
  """
173
  Generates a suite of overview plots for a birds-eye view of the data.
 
 
 
 
174
  """
175
  meta = self.metadata
176
  dtype_counts = self.df.dtypes.astype(str).value_counts()
 
189
 
190
  fig_corr = go.Figure()
191
  if len(meta['numeric_cols']) > 1:
192
+ corr_matrix = self.df[meta['numeric_cols']].corr(method='spearman')
193
  fig_corr = px.imshow(
194
  corr_matrix, text_auto=".2f", aspect="auto",
195
  title="<b>πŸ”— Spearman Correlation Matrix</b>",