# NOTE: Removed non-source residue from a web capture (Hugging Face Spaces UI
# text, file-size line, commit hash, and line-number gutter) that made this
# file syntactically invalid Python.
# core/analyzer.py
# -*- coding: utf-8 -*-
#
# PROJECT: CognitiveEDA v5.0 - The QuantumLeap Intelligence Platform
#
# DESCRIPTION: The core data analysis engine. This module is responsible for all
# backend data profiling and statistical computation. It is fully
# decoupled from any UI framework.
from __future__ import annotations
import logging
from typing import Any, Dict, List, Tuple
from functools import cached_property
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from core.exceptions import DataProcessingError
class DataAnalyzer:
    """
    A sophisticated data analysis and profiling engine.

    This class encapsulates all the logic for computing statistics, metadata,
    and generating visualizations from a pandas DataFrame. It leverages
    cached properties for efficient re-computation of metadata.

    Args:
        df (pd.DataFrame): The input DataFrame for analysis.

    Raises:
        DataProcessingError: If ``df`` is not a non-empty pandas DataFrame.
    """

    # Absolute correlation above which a numeric feature pair is reported.
    HIGH_CORR_THRESHOLD: float = 0.8
    # Mean character length above which an object/category column is treated
    # as long-form text rather than a low-cardinality categorical feature.
    TEXT_LENGTH_THRESHOLD: float = 50

    def __init__(self, df: pd.DataFrame):
        if not isinstance(df, pd.DataFrame) or df.empty:
            raise DataProcessingError("Input must be a non-empty pandas DataFrame.")
        self.df = df
        # Lazy %-style args keep formatting cost out of the INFO-disabled path.
        logging.info("DataAnalyzer instantiated with DataFrame of shape: %s", self.df.shape)

    @cached_property
    def metadata(self) -> Dict[str, Any]:
        """
        Extracts and caches comprehensive metadata from the DataFrame.

        This property computes column types, data shape, memory usage, missing
        value statistics, and high-correlation pairs. The use of
        @cached_property ensures this expensive operation runs only once.

        Returns:
            A dictionary containing detailed dataset metadata.
        """
        rows, cols = self.df.shape
        numeric_cols = self.df.select_dtypes(include=np.number).columns.tolist()
        categorical_cols = self.df.select_dtypes(include=['object', 'category']).columns.tolist()
        datetime_cols = self.df.select_dtypes(include=['datetime64', 'datetimetz']).columns.tolist()

        # Identify potential long-form text columns for specialized analysis.
        # The .str accessor raises AttributeError on object/category columns
        # that contain no string values at all, so each candidate is guarded
        # individually instead of letting one bad column crash the whole scan.
        text_cols: List[str] = []
        for col in categorical_cols:
            non_null = self.df[col].dropna()
            if non_null.empty:
                continue
            try:
                mean_len = non_null.str.len().mean()
            except AttributeError:
                # No string values in this column; it cannot be long-form text.
                continue
            if mean_len > self.TEXT_LENGTH_THRESHOLD:
                text_cols.append(col)

        high_corr_pairs: List[Dict[str, Any]] = []
        if len(numeric_cols) > 1:
            corr_matrix = self.df[numeric_cols].corr().abs()
            # Mask to the strict upper triangle so each pair is reported once
            # and the self-correlation diagonal is excluded.
            upper_tri = corr_matrix.where(
                np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
            )
            high_corr_series = upper_tri.stack()
            high_corr_pairs = (
                high_corr_series[high_corr_series > self.HIGH_CORR_THRESHOLD]
                .reset_index()
                .rename(columns={'level_0': 'Feature 1', 'level_1': 'Feature 2', 0: 'Correlation'})
                .to_dict('records')
            )

        return {
            'shape': (rows, cols),
            'columns': self.df.columns.tolist(),
            'numeric_cols': numeric_cols,
            # Long-form text columns are reported separately, not as categoricals.
            'categorical_cols': [c for c in categorical_cols if c not in text_cols],
            'datetime_cols': datetime_cols,
            'text_cols': text_cols,
            'memory_usage_mb': f"{self.df.memory_usage(deep=True).sum() / 1e6:.2f}",
            'total_missing': int(self.df.isnull().sum().sum()),
            'data_quality_score': round((self.df.notna().sum().sum() / self.df.size) * 100, 2),
            'high_corr_pairs': high_corr_pairs,
        }

    def get_profiling_reports(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """
        Generates detailed profiling reports for different data types.

        Returns:
            A tuple containing DataFrames for missing values, numeric stats,
            and categorical stats. The latter two are empty DataFrames when
            the dataset has no columns of the corresponding kind.
        """
        # Missing Value Report: per-column counts and percentages, worst first.
        missing = self.df.isnull().sum()
        missing_df = pd.DataFrame({
            'Missing Values': missing,
            'Percentage (%)': (missing / len(self.df) * 100).round(2)
        }).reset_index().rename(columns={'index': 'Column'}).sort_values('Missing Values', ascending=False)

        # Numeric Stats Report: describe() with tail percentiles for outliers.
        numeric_stats_df = pd.DataFrame()
        if self.metadata['numeric_cols']:
            numeric_stats = self.df[self.metadata['numeric_cols']].describe(percentiles=[.01, .25, .5, .75, .99]).T
            numeric_stats_df = numeric_stats.round(3).reset_index().rename(columns={'index': 'Feature'})

        # Categorical Stats Report: count / unique / top / freq per column.
        cat_stats_df = pd.DataFrame()
        if self.metadata['categorical_cols']:
            cat_stats = self.df[self.metadata['categorical_cols']].describe(include=['object', 'category']).T
            cat_stats_df = cat_stats.reset_index().rename(columns={'index': 'Feature'})

        return missing_df, numeric_stats_df, cat_stats_df

    def get_overview_visuals(self) -> Tuple[go.Figure, go.Figure, go.Figure]:
        """
        Generates a suite of overview plots for a birds-eye view of the data.

        Returns:
            A tuple of Plotly figures: Data Type Composition, Missing Values,
            and Correlation Matrix (empty figure if fewer than two numeric
            columns are available).
        """
        meta = self.metadata

        dtype_counts = self.df.dtypes.astype(str).value_counts()
        fig_types = px.pie(
            values=dtype_counts.values, names=dtype_counts.index,
            title="<b>π Data Type Composition</b>", hole=0.4,
            color_discrete_sequence=px.colors.qualitative.Pastel
        )

        # Only columns that actually have missing values are plotted.
        missing_df = self.df.isnull().sum().reset_index(name='count').query('count > 0')
        fig_missing = px.bar(
            missing_df, x='index', y='count',
            title="<b>π³οΈ Missing Values Distribution</b>",
            labels={'index': 'Column Name', 'count': 'Number of Missing Values'}
        ).update_xaxes(categoryorder="total descending")

        fig_corr = go.Figure()
        if len(meta['numeric_cols']) > 1:
            corr_matrix = self.df[meta['numeric_cols']].corr(method='spearman')  # More robust to outliers
            fig_corr = px.imshow(
                corr_matrix, text_auto=".2f", aspect="auto",
                title="<b>π Spearman Correlation Matrix</b>",
                color_continuous_scale='RdBu_r', zmin=-1, zmax=1
            )

        return fig_types, fig_missing, fig_corr