# core/analyzer.py # -*- coding: utf-8 -*- # # PROJECT: CognitiveEDA v5.0 - The QuantumLeap Intelligence Platform # # DESCRIPTION: The core data analysis engine. This module is responsible for all # backend data profiling and statistical computation. It is fully # decoupled from any UI framework. from __future__ import annotations import logging from typing import Any, Dict, List, Tuple from functools import cached_property import numpy as np import pandas as pd import plotly.express as px import plotly.graph_objects as go from core.exceptions import DataProcessingError class DataAnalyzer: """ A sophisticated data analysis and profiling engine. This class encapsulates all the logic for computing statistics, metadata, and generating visualizations from a pandas DataFrame. It leverages cached properties for efficient re-computation of metadata. Args: df (pd.DataFrame): The input DataFrame for analysis. """ def __init__(self, df: pd.DataFrame): if not isinstance(df, pd.DataFrame) or df.empty: raise DataProcessingError("Input must be a non-empty pandas DataFrame.") self.df = df logging.info(f"DataAnalyzer instantiated with DataFrame of shape: {self.df.shape}") @cached_property def metadata(self) -> Dict[str, Any]: """ Extracts and caches comprehensive metadata from the DataFrame. This property computes column types, data shape, memory usage, missing value statistics, and high-correlation pairs. The use of @cached_property ensures this expensive operation runs only once. Returns: A dictionary containing detailed dataset metadata. """ rows, cols = self.df.shape numeric_cols = self.df.select_dtypes(include=np.number).columns.tolist() categorical_cols = self.df.select_dtypes(include=['object', 'category']).columns.tolist() datetime_cols = self.df.select_dtypes(include=['datetime64', 'datetimetz']).columns.tolist() # Identify potential long-form text columns for specialized analysis text_cols = [ col for col in categorical_cols if self.df[col].dropna().str.len().mean() > 50 ] high_corr_pairs = [] if len(numeric_cols) > 1: corr_matrix = self.df[numeric_cols].corr().abs() upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)) high_corr_series = upper_tri.stack() high_corr_pairs = ( high_corr_series[high_corr_series > 0.8] .reset_index() .rename(columns={'level_0': 'Feature 1', 'level_1': 'Feature 2', 0: 'Correlation'}) .to_dict('records') ) return { 'shape': (rows, cols), 'columns': self.df.columns.tolist(), 'numeric_cols': numeric_cols, 'categorical_cols': [c for c in categorical_cols if c not in text_cols], 'datetime_cols': datetime_cols, 'text_cols': text_cols, 'memory_usage_mb': f"{self.df.memory_usage(deep=True).sum() / 1e6:.2f}", 'total_missing': int(self.df.isnull().sum().sum()), 'data_quality_score': round((self.df.notna().sum().sum() / self.df.size) * 100, 2), 'high_corr_pairs': high_corr_pairs, } def get_profiling_reports(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: """ Generates detailed profiling reports for different data types. Returns: A tuple containing DataFrames for missing values, numeric stats, and categorical stats. """ # Missing Value Report missing = self.df.isnull().sum() missing_df = pd.DataFrame({ 'Missing Values': missing, 'Percentage (%)': (missing / len(self.df) * 100).round(2) }).reset_index().rename(columns={'index': 'Column'}).sort_values('Missing Values', ascending=False) # Numeric Stats Report numeric_stats_df = pd.DataFrame() if self.metadata['numeric_cols']: numeric_stats = self.df[self.metadata['numeric_cols']].describe(percentiles=[.01, .25, .5, .75, .99]).T numeric_stats_df = numeric_stats.round(3).reset_index().rename(columns={'index': 'Feature'}) # Categorical Stats Report cat_stats_df = pd.DataFrame() if self.metadata['categorical_cols']: cat_stats = self.df[self.metadata['categorical_cols']].describe(include=['object', 'category']).T cat_stats_df = cat_stats.reset_index().rename(columns={'index': 'Feature'}) return missing_df, numeric_stats_df, cat_stats_df def get_overview_visuals(self) -> Tuple[go.Figure, go.Figure, go.Figure]: """ Generates a suite of overview plots for a birds-eye view of the data. Returns: A tuple of Plotly figures: Data Type Composition, Missing Values, and Correlation Matrix. """ meta = self.metadata dtype_counts = self.df.dtypes.astype(str).value_counts() fig_types = px.pie( values=dtype_counts.values, names=dtype_counts.index, title="📊 Data Type Composition", hole=0.4, color_discrete_sequence=px.colors.qualitative.Pastel ) missing_df = self.df.isnull().sum().reset_index(name='count').query('count > 0') fig_missing = px.bar( missing_df, x='index', y='count', title="🕳️ Missing Values Distribution", labels={'index': 'Column Name', 'count': 'Number of Missing Values'} ).update_xaxes(categoryorder="total descending") fig_corr = go.Figure() if len(meta['numeric_cols']) > 1: corr_matrix = self.df[meta['numeric_cols']].corr(method='spearman') # More robust to outliers fig_corr = px.imshow( corr_matrix, text_auto=".2f", aspect="auto", title="🔗 Spearman Correlation Matrix", color_continuous_scale='RdBu_r', zmin=-1, zmax=1 ) return fig_types, fig_missing, fig_corr