Spaces:

mgbam
/

CognitiveEDA

Sleeping

App Files Files Community

mgbam committed 10 days ago

Commit

2a9e2e4

verified ·

1 Parent(s): 9ca5a51

Create core/analyzer.py

Browse files

Files changed (1) hide show

core/analyzer.py +148 -0

core/analyzer.py ADDED Viewed

	@@ -0,0 +1,148 @@

+# core/analyzer.py
+# -*- coding: utf-8 -*-
+#
+# PROJECT:      CognitiveEDA v5.0 - The QuantumLeap Intelligence Platform
+#
+# DESCRIPTION:  The core data analysis engine. This module is responsible for all
+#               backend data profiling and statistical computation. It is fully
+#               decoupled from any UI framework.
+from __future__ import annotations
+import logging
+from typing import Any, Dict, List, Tuple
+from functools import cached_property
+import numpy as np
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
+from core.exceptions import DataProcessingError
class DataAnalyzer:
    """
    A data analysis and profiling engine built around a pandas DataFrame.

    The class computes dataset metadata, tabular profiling reports and
    Plotly overview figures. ``metadata`` is a ``cached_property``: it is
    computed on first access and then reused, so changes made to ``self.df``
    after that first access are NOT reflected in the cached result.

    Args:
        df (pd.DataFrame): The input DataFrame for analysis.

    Raises:
        DataProcessingError: If ``df`` is not a DataFrame or is empty.
    """

    # Absolute Pearson correlation above which a numeric column pair is
    # reported in ``metadata['high_corr_pairs']``.
    HIGH_CORR_THRESHOLD: float = 0.8
    # Mean string length (in characters) above which an object/category
    # column is classified as long-form text instead of categorical.
    TEXT_LENGTH_THRESHOLD: int = 50

    def __init__(self, df: pd.DataFrame):
        if not isinstance(df, pd.DataFrame) or df.empty:
            raise DataProcessingError("Input must be a non-empty pandas DataFrame.")
        self.df = df
        # Lazy %-formatting: the message is only rendered if INFO is enabled.
        logging.info("DataAnalyzer instantiated with DataFrame of shape: %s", self.df.shape)

    @staticmethod
    def _mean_text_length(series: pd.Series) -> float:
        """Return the mean length of the ``str`` values in *series* (0.0 if none).

        Only genuine strings are measured. This avoids the ``.str`` accessor,
        which raises ``AttributeError`` on object/category columns that hold
        non-string values (ints, bools, dates, ...).
        """
        # astype(object) unwraps categoricals so iteration yields raw values.
        values = series.dropna().astype(object)
        lengths = [len(value) for value in values if isinstance(value, str)]
        if not lengths:
            return 0.0
        return sum(lengths) / len(lengths)

    def _high_correlation_pairs(self, numeric_cols: List[Any]) -> List[Dict[str, Any]]:
        """Return numeric column pairs whose |Pearson r| exceeds the threshold.

        Each pair is reported once (upper triangle, diagonal excluded) in
        row-major order, as a dict with the keys ``'Feature 1'``,
        ``'Feature 2'`` and ``'Correlation'`` (the absolute coefficient).
        """
        if len(numeric_cols) < 2:
            return []

        corr = self.df[numeric_cols].corr().abs()
        names = corr.columns.tolist()
        strengths = corr.to_numpy()

        # Positional indexing keeps this independent of how the column axis
        # is named; stack()/reset_index() fails when that axis has a name.
        row_idx, col_idx = np.triu_indices(len(names), k=1)
        pair_strengths = strengths[row_idx, col_idx]
        # NaN correlations (e.g. constant columns) compare False and drop out.
        keep = pair_strengths > self.HIGH_CORR_THRESHOLD

        return [
            {
                'Feature 1': names[i],
                'Feature 2': names[j],
                'Correlation': float(strength),
            }
            for i, j, strength in zip(row_idx[keep], col_idx[keep], pair_strengths[keep])
        ]

    @cached_property
    def metadata(self) -> Dict[str, Any]:
        """
        Extract and cache metadata describing the DataFrame.

        Returns:
            A dictionary with the keys:
            ``shape`` (rows, cols), ``columns``, ``numeric_cols``,
            ``categorical_cols`` (object/category columns that are not
            long-form text), ``datetime_cols``, ``text_cols`` (object/category
            columns whose mean string length exceeds
            ``TEXT_LENGTH_THRESHOLD``), ``memory_usage_mb`` (a string with two
            decimals, bytes / 1e6), ``total_missing``, ``data_quality_score``
            (percentage of non-null cells, rounded to two decimals) and
            ``high_corr_pairs`` (see ``_high_correlation_pairs``).
        """
        rows, cols = self.df.shape
        numeric_cols = self.df.select_dtypes(include=np.number).columns.tolist()
        categorical_cols = self.df.select_dtypes(include=['object', 'category']).columns.tolist()
        datetime_cols = self.df.select_dtypes(include=['datetime64', 'datetimetz']).columns.tolist()

        # Identify potential long-form text columns for specialized analysis.
        text_cols = [
            col for col in categorical_cols
            if self._mean_text_length(self.df[col]) > self.TEXT_LENGTH_THRESHOLD
        ]
        text_lookup = set(text_cols)

        non_null_cells = self.df.notna().sum().sum()

        return {
            'shape': (rows, cols),
            'columns': self.df.columns.tolist(),
            'numeric_cols': numeric_cols,
            'categorical_cols': [c for c in categorical_cols if c not in text_lookup],
            'datetime_cols': datetime_cols,
            'text_cols': text_cols,
            'memory_usage_mb': f"{self.df.memory_usage(deep=True).sum() / 1e6:.2f}",
            'total_missing': int(self.df.isnull().sum().sum()),
            'data_quality_score': round((non_null_cells / self.df.size) * 100, 2),
            'high_corr_pairs': self._high_correlation_pairs(numeric_cols),
        }

    def get_profiling_reports(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """
        Generate profiling reports for missing values and per-type statistics.

        Returns:
            A tuple ``(missing_df, numeric_stats_df, cat_stats_df)``:
            the missing-value counts/percentages per column (sorted by count,
            descending), ``describe()`` statistics for the numeric columns
            (rounded to 3 decimals, with 1%/99% percentiles), and
            ``describe()`` statistics for the categorical columns. The last
            two are empty DataFrames when no such columns exist.
        """
        # Missing Value Report
        missing = self.df.isnull().sum()
        missing_df = (
            pd.DataFrame({
                'Missing Values': missing,
                'Percentage (%)': (missing / len(self.df) * 100).round(2),
            })
            # rename_axis (instead of renaming a column called 'index') also
            # works when the DataFrame's column axis carries its own name.
            .rename_axis('Column')
            .reset_index()
            .sort_values('Missing Values', ascending=False)
        )

        # Numeric Stats Report
        numeric_stats_df = pd.DataFrame()
        numeric_cols = self.metadata['numeric_cols']
        if numeric_cols:
            numeric_stats = self.df[numeric_cols].describe(percentiles=[.01, .25, .5, .75, .99]).T
            numeric_stats_df = numeric_stats.round(3).rename_axis('Feature').reset_index()

        # Categorical Stats Report
        cat_stats_df = pd.DataFrame()
        categorical_cols = self.metadata['categorical_cols']
        if categorical_cols:
            cat_stats = self.df[categorical_cols].describe(include=['object', 'category']).T
            cat_stats_df = cat_stats.rename_axis('Feature').reset_index()

        return missing_df, numeric_stats_df, cat_stats_df

    def get_overview_visuals(self) -> Tuple[go.Figure, go.Figure, go.Figure]:
        """
        Generate overview plots for a birds-eye view of the data.

        Returns:
            A tuple of Plotly figures: Data Type Composition (donut chart),
            Missing Values (bar chart of columns with at least one missing
            value) and a Spearman Correlation Matrix heatmap. The correlation
            figure is an empty ``go.Figure`` when fewer than two numeric
            columns exist.
        """
        meta = self.metadata

        dtype_counts = self.df.dtypes.astype(str).value_counts()
        fig_types = px.pie(
            values=dtype_counts.values, names=dtype_counts.index,
            title="<b>📊 Data Type Composition</b>", hole=0.4,
            color_discrete_sequence=px.colors.qualitative.Pastel
        )

        # Force the label column to be called 'index' so the x= / labels=
        # references below hold even when the column axis has a name.
        missing_counts = self.df.isnull().sum().rename_axis('index').reset_index(name='count')
        missing_df = missing_counts[missing_counts['count'] > 0]
        fig_missing = px.bar(
            missing_df, x='index', y='count',
            title="<b>🕳️ Missing Values Distribution</b>",
            labels={'index': 'Column Name', 'count': 'Number of Missing Values'}
        ).update_xaxes(categoryorder="total descending")

        fig_corr = go.Figure()
        if len(meta['numeric_cols']) > 1:
            # Spearman (rank-based) is more robust to outliers; note that
            # metadata['high_corr_pairs'] uses the default Pearson method.
            corr_matrix = self.df[meta['numeric_cols']].corr(method='spearman')
            fig_corr = px.imshow(
                corr_matrix, text_auto=".2f", aspect="auto",
                title="<b>🔗 Spearman Correlation Matrix</b>",
                color_continuous_scale='RdBu_r', zmin=-1, zmax=1
            )

        return fig_types, fig_missing, fig_corr