Spaces:
Sleeping
Sleeping
Create core/analyzer.py
Browse files- core/analyzer.py +148 -0
core/analyzer.py
ADDED
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# core/analyzer.py
|
2 |
+
|
3 |
+
# -*- coding: utf-8 -*-
|
4 |
+
#
|
5 |
+
# PROJECT: CognitiveEDA v5.0 - The QuantumLeap Intelligence Platform
|
6 |
+
#
|
7 |
+
# DESCRIPTION: The core data analysis engine. This module is responsible for all
|
8 |
+
# backend data profiling and statistical computation. It is fully
|
9 |
+
# decoupled from any UI framework.
|
10 |
+
|
11 |
+
from __future__ import annotations
|
12 |
+
import logging
|
13 |
+
from typing import Any, Dict, List, Tuple
|
14 |
+
from functools import cached_property
|
15 |
+
|
16 |
+
import numpy as np
|
17 |
+
import pandas as pd
|
18 |
+
import plotly.express as px
|
19 |
+
import plotly.graph_objects as go
|
20 |
+
|
21 |
+
from core.exceptions import DataProcessingError
|
22 |
+
|
23 |
+
class DataAnalyzer:
    """
    Data analysis and profiling engine for a pandas DataFrame.

    This class computes dataset metadata, tabular profiling reports and
    Plotly overview figures. Metadata is computed on first access and then
    cached on the instance via ``cached_property``.

    Args:
        df (pd.DataFrame): The input DataFrame for analysis.

    Raises:
        DataProcessingError: If ``df`` is not a DataFrame or is empty.
    """

    # Mean string length above which an object/category column is reported
    # under 'text_cols' instead of 'categorical_cols'.
    _TEXT_MEAN_LENGTH_THRESHOLD = 50
    # Absolute correlation above which a numeric column pair is reported
    # under 'high_corr_pairs'.
    _HIGH_CORR_THRESHOLD = 0.8

    def __init__(self, df: pd.DataFrame):
        if not isinstance(df, pd.DataFrame) or df.empty:
            raise DataProcessingError("Input must be a non-empty pandas DataFrame.")
        self.df = df
        logging.info("DataAnalyzer instantiated with DataFrame of shape: %s", self.df.shape)

    @staticmethod
    def _is_long_text(series: pd.Series, threshold: float) -> bool:
        """Return True when the mean string length of ``series`` exceeds ``threshold``."""
        try:
            mean_length = series.dropna().str.len().mean()
        except AttributeError:
            # The ``.str`` accessor raises AttributeError when the values (or
            # the categories of a categorical) are not strings, e.g. a
            # category column of integers. Such a column is not free text.
            return False
        # An all-missing column yields NaN here; NaN > threshold is False.
        return bool(mean_length > threshold)

    @staticmethod
    def _index_to_column(frame: pd.DataFrame, label: str) -> pd.DataFrame:
        """Move the index of ``frame`` into a leading column named ``label``."""
        # Naming the axis first makes the resulting column name independent of
        # whatever name the source DataFrame's column index carried. A
        # multi-level index is left as-is and expands to one column per level.
        if frame.index.nlevels == 1:
            frame = frame.rename_axis(label)
        return frame.reset_index()

    @cached_property
    def metadata(self) -> Dict[str, Any]:
        """
        Extracts and caches metadata describing the DataFrame.

        Computes column groupings by dtype, shape, memory usage, missing-value
        totals, a completeness score and strongly correlated numeric pairs.

        Returns:
            A dictionary with the keys 'shape', 'columns', 'numeric_cols',
            'categorical_cols', 'datetime_cols', 'text_cols',
            'memory_usage_mb' (a string formatted to two decimals),
            'total_missing', 'data_quality_score' (percentage of non-null
            cells) and 'high_corr_pairs' (a list of dicts with the keys
            'Feature 1', 'Feature 2' and 'Correlation').
        """
        rows, cols = self.df.shape
        numeric_cols = self.df.select_dtypes(include=np.number).columns.tolist()
        categorical_cols = self.df.select_dtypes(include=['object', 'category']).columns.tolist()
        datetime_cols = self.df.select_dtypes(include=['datetime64', 'datetimetz']).columns.tolist()

        # Identify potential long-form text columns for specialized analysis.
        text_cols = [
            col for col in categorical_cols
            if self._is_long_text(self.df[col], self._TEXT_MEAN_LENGTH_THRESHOLD)
        ]

        high_corr_pairs: List[Dict[str, Any]] = []
        if len(numeric_cols) > 1:
            corr_matrix = self.df[numeric_cols].corr().abs()
            labels = corr_matrix.columns.tolist()
            strengths = corr_matrix.to_numpy()
            # Walk the strict upper triangle in row-major order so each
            # unordered pair is visited exactly once and self-pairs are skipped.
            for i, j in zip(*np.triu_indices(len(labels), k=1)):
                strength = strengths[i, j]
                # NaN (e.g. from a constant column) compares False and is skipped.
                if strength > self._HIGH_CORR_THRESHOLD:
                    high_corr_pairs.append({
                        'Feature 1': labels[i],
                        'Feature 2': labels[j],
                        'Correlation': float(strength),
                    })

        return {
            'shape': (rows, cols),
            'columns': self.df.columns.tolist(),
            'numeric_cols': numeric_cols,
            'categorical_cols': [c for c in categorical_cols if c not in text_cols],
            'datetime_cols': datetime_cols,
            'text_cols': text_cols,
            'memory_usage_mb': f"{self.df.memory_usage(deep=True).sum() / 1e6:.2f}",
            'total_missing': int(self.df.isnull().sum().sum()),
            'data_quality_score': round((self.df.notna().sum().sum() / self.df.size) * 100, 2),
            'high_corr_pairs': high_corr_pairs,
        }

    def get_profiling_reports(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """
        Generates detailed profiling reports for different data types.

        Returns:
            A tuple of three DataFrames: missing values per column (sorted by
            count, descending), descriptive statistics for numeric columns,
            and descriptive statistics for categorical columns. The numeric
            and categorical reports are empty DataFrames when no such columns
            exist.
        """
        # Missing Value Report
        missing = self.df.isnull().sum()
        missing_df = self._index_to_column(
            pd.DataFrame({
                'Missing Values': missing,
                'Percentage (%)': (missing / len(self.df) * 100).round(2),
            }),
            'Column',
        ).sort_values('Missing Values', ascending=False)

        # Numeric Stats Report
        numeric_stats_df = pd.DataFrame()
        if self.metadata['numeric_cols']:
            numeric_stats = self.df[self.metadata['numeric_cols']].describe(
                percentiles=[.01, .25, .5, .75, .99]
            ).T
            numeric_stats_df = self._index_to_column(numeric_stats.round(3), 'Feature')

        # Categorical Stats Report
        cat_stats_df = pd.DataFrame()
        if self.metadata['categorical_cols']:
            cat_stats = self.df[self.metadata['categorical_cols']].describe(
                include=['object', 'category']
            ).T
            cat_stats_df = self._index_to_column(cat_stats, 'Feature')

        return missing_df, numeric_stats_df, cat_stats_df

    def get_overview_visuals(self) -> "Tuple[go.Figure, go.Figure, go.Figure]":
        """
        Generates a suite of overview plots for a bird's-eye view of the data.

        Returns:
            A tuple of Plotly figures: Data Type Composition, Missing Values,
            and Correlation Matrix. The correlation figure is an empty
            ``go.Figure`` when fewer than two numeric columns exist.
        """
        # NOTE(review): the leading characters in the three titles below look
        # like mis-decoded emoji; they are kept verbatim - confirm the
        # intended glyphs against the original file.
        meta = self.metadata
        dtype_counts = self.df.dtypes.astype(str).value_counts()
        fig_types = px.pie(
            values=dtype_counts.values, names=dtype_counts.index,
            title="<b>π Data Type Composition</b>", hole=0.4,
            color_discrete_sequence=px.colors.qualitative.Pastel
        )

        missing_counts = self.df.isnull().sum()
        if missing_counts.index.nlevels == 1:
            # Guarantee the 'index' column name that the bar chart relies on,
            # even when the DataFrame's column index carries its own name.
            missing_counts = missing_counts.rename_axis('index')
        missing_df = missing_counts.reset_index(name='count').query('count > 0')
        fig_missing = px.bar(
            missing_df, x='index', y='count',
            title="<b>π³οΈ Missing Values Distribution</b>",
            labels={'index': 'Column Name', 'count': 'Number of Missing Values'}
        ).update_xaxes(categoryorder="total descending")

        fig_corr = go.Figure()
        if len(meta['numeric_cols']) > 1:
            corr_matrix = self.df[meta['numeric_cols']].corr(method='spearman')  # More robust to outliers
            fig_corr = px.imshow(
                corr_matrix, text_auto=".2f", aspect="auto",
                title="<b>π Spearman Correlation Matrix</b>",
                color_continuous_scale='RdBu_r', zmin=-1, zmax=1
            )
        return fig_types, fig_missing, fig_corr
|