mgbam commited on
Commit
2a9e2e4
·
verified ·
1 Parent(s): 9ca5a51

Create core/analyzer.py

Browse files
Files changed (1) hide show
  1. core/analyzer.py +148 -0
core/analyzer.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # core/analyzer.py
2
+
3
+ # -*- coding: utf-8 -*-
4
+ #
5
+ # PROJECT: CognitiveEDA v5.0 - The QuantumLeap Intelligence Platform
6
+ #
7
+ # DESCRIPTION: The core data analysis engine. This module is responsible for all
8
+ # backend data profiling and statistical computation. It is fully
9
+ # decoupled from any UI framework.
10
+
11
+ from __future__ import annotations
12
+ import logging
13
+ from typing import Any, Dict, List, Tuple
14
+ from functools import cached_property
15
+
16
+ import numpy as np
17
+ import pandas as pd
18
+ import plotly.express as px
19
+ import plotly.graph_objects as go
20
+
21
+ from core.exceptions import DataProcessingError
22
+
23
class DataAnalyzer:
    """
    A sophisticated data analysis and profiling engine.

    Encapsulates all logic for computing statistics, metadata, and
    visualizations from a pandas DataFrame. It is fully decoupled from any UI
    framework. The expensive metadata extraction is cached via
    ``functools.cached_property`` so it runs at most once per instance.

    Args:
        df (pd.DataFrame): The input DataFrame for analysis. Must be non-empty.

    Raises:
        DataProcessingError: If ``df`` is not a non-empty pandas DataFrame.
    """

    # Absolute correlation above which a numeric feature pair is reported.
    HIGH_CORR_THRESHOLD: float = 0.8
    # Mean character length above which a categorical column is treated as
    # long-form text and excluded from ordinary categorical profiling.
    TEXT_LENGTH_THRESHOLD: float = 50.0

    def __init__(self, df: pd.DataFrame):
        if not isinstance(df, pd.DataFrame) or df.empty:
            raise DataProcessingError("Input must be a non-empty pandas DataFrame.")
        self.df = df
        # Lazy %-style args defer formatting until the record is actually emitted.
        logging.info("DataAnalyzer instantiated with DataFrame of shape: %s", self.df.shape)

    def _is_long_text(self, col: str) -> bool:
        """
        Return True if *col* looks like a long-form free-text column.

        A column qualifies when the mean string length of its non-null values
        exceeds ``TEXT_LENGTH_THRESHOLD``. Columns that are empty after
        dropping nulls, or whose values do not support the ``.str`` accessor
        (e.g. a categorical of non-string values), are never classified as
        text — previously such columns raised AttributeError and crashed the
        whole metadata computation.
        """
        values = self.df[col].dropna()
        if values.empty:
            return False
        try:
            return bool(values.str.len().mean() > self.TEXT_LENGTH_THRESHOLD)
        except (AttributeError, TypeError):
            # Non-string payloads cannot be free text.
            return False

    @cached_property
    def metadata(self) -> Dict[str, Any]:
        """
        Extracts and caches comprehensive metadata from the DataFrame.

        Computes column-type partitions, data shape, memory usage, missing
        value statistics, and high-correlation pairs. The use of
        @cached_property ensures this expensive operation runs only once.

        Returns:
            A dictionary containing detailed dataset metadata.
        """
        rows, cols = self.df.shape
        numeric_cols = self.df.select_dtypes(include=np.number).columns.tolist()
        categorical_cols = self.df.select_dtypes(include=['object', 'category']).columns.tolist()
        datetime_cols = self.df.select_dtypes(include=['datetime64', 'datetimetz']).columns.tolist()

        # Identify potential long-form text columns for specialized analysis.
        text_cols = [col for col in categorical_cols if self._is_long_text(col)]

        high_corr_pairs: List[Dict[str, Any]] = []
        if len(numeric_cols) > 1:
            corr_matrix = self.df[numeric_cols].corr().abs()
            # Keep only the strict upper triangle so each pair appears once
            # and self-correlations (always 1.0) are excluded.
            upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
            high_corr_series = upper_tri.stack()
            high_corr_pairs = (
                high_corr_series[high_corr_series > self.HIGH_CORR_THRESHOLD]
                .reset_index()
                .rename(columns={'level_0': 'Feature 1', 'level_1': 'Feature 2', 0: 'Correlation'})
                .to_dict('records')
            )

        return {
            'shape': (rows, cols),
            'columns': self.df.columns.tolist(),
            'numeric_cols': numeric_cols,
            # Text columns are reported separately, so exclude them here.
            'categorical_cols': [c for c in categorical_cols if c not in text_cols],
            'datetime_cols': datetime_cols,
            'text_cols': text_cols,
            'memory_usage_mb': f"{self.df.memory_usage(deep=True).sum() / 1e6:.2f}",
            'total_missing': int(self.df.isnull().sum().sum()),
            'data_quality_score': round((self.df.notna().sum().sum() / self.df.size) * 100, 2),
            'high_corr_pairs': high_corr_pairs,
        }

    def get_profiling_reports(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """
        Generates detailed profiling reports for different data types.

        Returns:
            A tuple of DataFrames: (missing-value report, numeric statistics,
            categorical statistics). The latter two are empty DataFrames when
            the dataset has no columns of the corresponding type.
        """
        # Missing Value Report: per-column null counts and percentages.
        missing = self.df.isnull().sum()
        missing_df = pd.DataFrame({
            'Missing Values': missing,
            'Percentage (%)': (missing / len(self.df) * 100).round(2)
        }).reset_index().rename(columns={'index': 'Column'}).sort_values('Missing Values', ascending=False)

        # Numeric Stats Report: extended percentiles capture tail behavior.
        numeric_stats_df = pd.DataFrame()
        if self.metadata['numeric_cols']:
            numeric_stats = self.df[self.metadata['numeric_cols']].describe(percentiles=[.01, .25, .5, .75, .99]).T
            numeric_stats_df = numeric_stats.round(3).reset_index().rename(columns={'index': 'Feature'})

        # Categorical Stats Report (long-form text columns are excluded by metadata).
        cat_stats_df = pd.DataFrame()
        if self.metadata['categorical_cols']:
            cat_stats = self.df[self.metadata['categorical_cols']].describe(include=['object', 'category']).T
            cat_stats_df = cat_stats.reset_index().rename(columns={'index': 'Feature'})

        return missing_df, numeric_stats_df, cat_stats_df

    def get_overview_visuals(self) -> Tuple[go.Figure, go.Figure, go.Figure]:
        """
        Generates a suite of overview plots for a birds-eye view of the data.

        Returns:
            A tuple of Plotly figures: Data Type Composition, Missing Values,
            and Correlation Matrix. The correlation figure is an empty Figure
            when fewer than two numeric columns exist.
        """
        meta = self.metadata
        dtype_counts = self.df.dtypes.astype(str).value_counts()
        fig_types = px.pie(
            values=dtype_counts.values, names=dtype_counts.index,
            title="<b>📊 Data Type Composition</b>", hole=0.4,
            color_discrete_sequence=px.colors.qualitative.Pastel
        )

        # Only columns that actually have missing values are plotted.
        missing_df = self.df.isnull().sum().reset_index(name='count').query('count > 0')
        fig_missing = px.bar(
            missing_df, x='index', y='count',
            title="<b>🕳️ Missing Values Distribution</b>",
            labels={'index': 'Column Name', 'count': 'Number of Missing Values'}
        ).update_xaxes(categoryorder="total descending")

        fig_corr = go.Figure()
        if len(meta['numeric_cols']) > 1:
            # Spearman is rank-based and therefore more robust to outliers.
            corr_matrix = self.df[meta['numeric_cols']].corr(method='spearman')
            fig_corr = px.imshow(
                corr_matrix, text_auto=".2f", aspect="auto",
                title="<b>🔗 Spearman Correlation Matrix</b>",
                color_continuous_scale='RdBu_r', zmin=-1, zmax=1
            )
        return fig_types, fig_missing, fig_corr