Spaces:

mgbam
/

CognitiveEDA

Sleeping

App Files Files Community

mgbam committed 9 days ago

Commit

c039984

verified ·

1 Parent(s): 132b0cb

Update core/analyzer.py

Browse files

Files changed (1) hide show

core/analyzer.py +76 -26

core/analyzer.py CHANGED Viewed

@@ -2,16 +2,16 @@
 # -*- coding: utf-8 -*-
 #
-# PROJECT:      CognitiveEDA v5.0 - The QuantumLeap Intelligence Platform
 #
-# DESCRIPTION:  The core data analysis engine. This module is responsible for all
-#               backend data profiling and statistical computation. It is fully
-#               decoupled from any UI framework.
 from __future__ import annotations
 import logging
-from typing import Any, Dict, List, Tuple
 from functools import cached_property
 import numpy as np
 import pandas as pd
@@ -20,6 +20,75 @@ import plotly.graph_objects as go
 from core.exceptions import DataProcessingError
 class DataAnalyzer:
     """
     A sophisticated data analysis and profiling engine.
@@ -41,20 +110,12 @@ class DataAnalyzer:
     def metadata(self) -> Dict[str, Any]:
         """
         Extracts and caches comprehensive metadata from the DataFrame.
-        This property computes column types, data shape, memory usage, missing
-        value statistics, and high-correlation pairs. The use of
-        @cached_property ensures this expensive operation runs only once.
-        Returns:
-            A dictionary containing detailed dataset metadata.
         """
         rows, cols = self.df.shape
         numeric_cols = self.df.select_dtypes(include=np.number).columns.tolist()
         categorical_cols = self.df.select_dtypes(include=['object', 'category']).columns.tolist()
         datetime_cols = self.df.select_dtypes(include=['datetime64', 'datetimetz']).columns.tolist()
-        # Identify potential long-form text columns for specialized analysis
         text_cols = [
             col for col in categorical_cols
             if self.df[col].dropna().str.len().mean() > 50
@@ -62,7 +123,7 @@ class DataAnalyzer:
         high_corr_pairs = []
         if len(numeric_cols) > 1:
-            corr_matrix = self.df[numeric_cols].corr().abs()
             upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
             high_corr_series = upper_tri.stack()
             high_corr_pairs = (
@@ -88,25 +149,18 @@ class DataAnalyzer:
     def get_profiling_reports(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
         """
         Generates detailed profiling reports for different data types.
-        Returns:
-            A tuple containing DataFrames for missing values, numeric stats,
-            and categorical stats.
         """
-        # Missing Value Report
         missing = self.df.isnull().sum()
         missing_df = pd.DataFrame({
             'Missing Values': missing,
             'Percentage (%)': (missing / len(self.df) * 100).round(2)
         }).reset_index().rename(columns={'index': 'Column'}).sort_values('Missing Values', ascending=False)
-        # Numeric Stats Report
         numeric_stats_df = pd.DataFrame()
         if self.metadata['numeric_cols']:
             numeric_stats = self.df[self.metadata['numeric_cols']].describe(percentiles=[.01, .25, .5, .75, .99]).T
             numeric_stats_df = numeric_stats.round(3).reset_index().rename(columns={'index': 'Feature'})
-        # Categorical Stats Report
         cat_stats_df = pd.DataFrame()
         if self.metadata['categorical_cols']:
             cat_stats = self.df[self.metadata['categorical_cols']].describe(include=['object', 'category']).T
@@ -117,10 +171,6 @@ class DataAnalyzer:
     def get_overview_visuals(self) -> Tuple[go.Figure, go.Figure, go.Figure]:
         """
         Generates a suite of overview plots for a birds-eye view of the data.
-        Returns:
-            A tuple of Plotly figures: Data Type Composition, Missing Values,
-            and Correlation Matrix.
         """
         meta = self.metadata
         dtype_counts = self.df.dtypes.astype(str).value_counts()
@@ -139,7 +189,7 @@ class DataAnalyzer:
         fig_corr = go.Figure()
         if len(meta['numeric_cols']) > 1:
-            corr_matrix = self.df[meta['numeric_cols']].corr(method='spearman') # More robust to outliers
             fig_corr = px.imshow(
                 corr_matrix, text_auto=".2f", aspect="auto",
                 title="<b>🔗 Spearman Correlation Matrix</b>",

 # -*- coding: utf-8 -*-
 #
+# PROJECT:      CognitiveEDA v5.6 - The QuantumLeap Intelligence Platform
 #
+# DESCRIPTION:  The core data analysis engine and the new strategic feature
+#               engineering module. This file encapsulates all backend data
+#               profiling, statistical computation, and pre-processing logic.
 from __future__ import annotations
 import logging
 from functools import cached_property
+from typing import Any, Dict, Tuple
 import numpy as np
 import pandas as pd
 from core.exceptions import DataProcessingError
+# ======================================================================
+# NEW: STRATEGIC FEATURE ENGINEERING MODULE
+# ======================================================================
def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Transforms a raw sales DataFrame into a feature-rich, model-ready dataset.

    The input frame is never modified; all work happens on a copy. Each step
    runs only when the columns it needs are present (names are checked after
    normalization):

    1.  Normalizes column names (spaces -> underscores, colons removed).
    2.  Creates 'Total_Revenue' as 'Quantity_Ordered' * 'Price_Each'.
    3.  Derives 'Hour', 'Day_of_Week', 'Month' and 'Is_Weekend' from
        'Order_Date'. Rows whose date could not be parsed are dropped, as are
        rows with a missing 'Total_Revenue' when that column was created.
    4.  Extracts 'City' from 'Purchase_Address'.
    5.  Drops redundant or already-transformed source columns.

    Args:
        df (pd.DataFrame): The raw input DataFrame.

    Returns:
        pd.DataFrame: The transformed and engineered DataFrame.
    """
    logging.info("Starting strategic feature engineering...")
    df_eng = df.copy()

    # Standardize column names for robustness (e.g., 'Price Each' -> 'Price_Each').
    # regex=False makes the literal replacement explicit on every pandas version.
    df_eng.columns = (
        df_eng.columns
        .str.replace(' ', '_', regex=False)
        .str.replace(':', '', regex=False)
    )

    # 1. Create Total_Revenue (resolves multicollinearity between its factors)
    if 'Quantity_Ordered' in df_eng.columns and 'Price_Each' in df_eng.columns:
        # Non-numeric entries become NaN instead of raising.
        df_eng['Quantity_Ordered'] = pd.to_numeric(df_eng['Quantity_Ordered'], errors='coerce')
        df_eng['Price_Each'] = pd.to_numeric(df_eng['Price_Each'], errors='coerce')
        df_eng['Total_Revenue'] = df_eng['Quantity_Ordered'] * df_eng['Price_Each']
        logging.info("Created 'Total_Revenue' feature.")

    # 2. Engineer Temporal Features
    if 'Order_Date' in df_eng.columns:
        # Unparseable dates become NaT instead of raising.
        df_eng['Order_Date_dt'] = pd.to_datetime(df_eng['Order_Date'], errors='coerce')

        # Drop rows where date conversion or the revenue calculation failed.
        # 'Total_Revenue' only exists when step 1 ran, so restrict the subset to
        # columns that are actually present; otherwise dropna raises KeyError.
        required_cols = [
            col for col in ('Order_Date_dt', 'Total_Revenue')
            if col in df_eng.columns
        ]
        df_eng = df_eng.dropna(subset=required_cols)

        df_eng['Hour'] = df_eng['Order_Date_dt'].dt.hour
        df_eng['Day_of_Week'] = df_eng['Order_Date_dt'].dt.dayofweek  # Monday=0, Sunday=6
        df_eng['Month'] = df_eng['Order_Date_dt'].dt.month
        df_eng['Is_Weekend'] = (df_eng['Day_of_Week'] >= 5).astype(int)
        logging.info("Engineered temporal features: Hour, Day_of_Week, Month, Is_Weekend.")

    # 3. Engineer Geospatial Features from 'Purchase_Address'
    if 'Purchase_Address' in df_eng.columns:
        # The city is taken to be the second comma-separated field; addresses
        # without a comma yield NaN.
        df_eng['City'] = df_eng['Purchase_Address'].str.split(',').str[1].str.strip()
        logging.info("Engineered 'City' feature from 'Purchase_Address'.")

    # 4. Drop Redundant & Transformed Columns
    # 'Unnamed: 0' (the stray index column pandas creates on CSV round-trips)
    # becomes 'Unnamed_0' after the normalization above; 'Unnamed0' is kept for
    # inputs that already carry that spelling.
    columns_to_drop = [
        'Unnamed_0', 'Unnamed0', 'Order_ID', 'Sales', 'Price_Each',
        'Order_Date', 'Purchase_Address', 'Order_Date_dt',
    ]
    existing_cols_to_drop = [col for col in columns_to_drop if col in df_eng.columns]
    df_eng = df_eng.drop(columns=existing_cols_to_drop)
    logging.info("Dropped redundant columns: %s", existing_cols_to_drop)

    logging.info("Feature engineering complete. New shape: %s", df_eng.shape)
    return df_eng
+# ======================================================================
+# CORE DATA ANALYZER CLASS (metadata correlation now uses Spearman)
+# ======================================================================
 class DataAnalyzer:
     """
     A sophisticated data analysis and profiling engine.
     def metadata(self) -> Dict[str, Any]:
         """
         Extracts and caches comprehensive metadata from the DataFrame.
         """
         rows, cols = self.df.shape
         numeric_cols = self.df.select_dtypes(include=np.number).columns.tolist()
         categorical_cols = self.df.select_dtypes(include=['object', 'category']).columns.tolist()
         datetime_cols = self.df.select_dtypes(include=['datetime64', 'datetimetz']).columns.tolist()
         text_cols = [
             col for col in categorical_cols
             if self.df[col].dropna().str.len().mean() > 50
         high_corr_pairs = []
         if len(numeric_cols) > 1:
+            corr_matrix = self.df[numeric_cols].corr(method='spearman').abs()
             upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
             high_corr_series = upper_tri.stack()
             high_corr_pairs = (
     def get_profiling_reports(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
         """
         Generates detailed profiling reports for different data types.
         """
         missing = self.df.isnull().sum()
         missing_df = pd.DataFrame({
             'Missing Values': missing,
             'Percentage (%)': (missing / len(self.df) * 100).round(2)
         }).reset_index().rename(columns={'index': 'Column'}).sort_values('Missing Values', ascending=False)
         numeric_stats_df = pd.DataFrame()
         if self.metadata['numeric_cols']:
             numeric_stats = self.df[self.metadata['numeric_cols']].describe(percentiles=[.01, .25, .5, .75, .99]).T
             numeric_stats_df = numeric_stats.round(3).reset_index().rename(columns={'index': 'Feature'})
         cat_stats_df = pd.DataFrame()
         if self.metadata['categorical_cols']:
             cat_stats = self.df[self.metadata['categorical_cols']].describe(include=['object', 'category']).T
     def get_overview_visuals(self) -> Tuple[go.Figure, go.Figure, go.Figure]:
         """
         Generates a suite of overview plots for a birds-eye view of the data.
         """
         meta = self.metadata
         dtype_counts = self.df.dtypes.astype(str).value_counts()
         fig_corr = go.Figure()
         if len(meta['numeric_cols']) > 1:
+            corr_matrix = self.df[meta['numeric_cols']].corr(method='spearman')
             fig_corr = px.imshow(
                 corr_matrix, text_auto=".2f", aspect="auto",
                 title="<b>🔗 Spearman Correlation Matrix</b>",