# core/analyzer.py
# -*- coding: utf-8 -*-
#
# PROJECT:      CognitiveEDA v5.0 - The QuantumLeap Intelligence Platform
#
# DESCRIPTION:  The core data analysis engine. This module is responsible for all
#               backend data profiling and statistical computation. It is fully
#               decoupled from any UI framework.

from __future__ import annotations
import logging
from typing import Any, Dict, List, Tuple
from functools import cached_property

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from core.exceptions import DataProcessingError

class DataAnalyzer:
    """
    A sophisticated data analysis and profiling engine.

    This class encapsulates all the logic for computing statistics, metadata,
    and generating visualizations from a pandas DataFrame. It leverages
    cached properties so that expensive metadata computation runs only once
    per instance.

    Args:
        df (pd.DataFrame): The input DataFrame for analysis.
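
    Example (illustrative sketch; assumes ``df`` is a non-empty DataFrame):

        analyzer = DataAnalyzer(df)
        meta = analyzer.metadata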
    """
    def __init__(self, df: pd.DataFrame):
        if not isinstance(df, pd.DataFrame) or df.empty:
            raise DataProcessingError("Input must be a non-empty pandas DataFrame.")
        self.df = df
        logging.info(f"DataAnalyzer instantiated with DataFrame of shape: {self.df.shape}")

    @cached_property
    def metadata(self) -> Dict[str, Any]:
        """
        Extracts and caches comprehensive metadata from the DataFrame.

        This property computes column types, data shape, memory usage, missing
        value statistics, and high-correlation pairs. The use of
        @cached_property ensures this expensive operation runs only once.

        Returns:
            A dictionary containing detailed dataset metadata.
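
        Example (illustrative sketch; assumes ``analyzer = DataAnalyzer(df)``;
        key names match the returned dictionary):

            meta = analyzer.metadata
            print(meta['shape'], meta['data_quality_score'])
            for pair in meta['high_corr_pairs']:
                print(pair['Feature 1'], pair['Feature 2'], pair['Correlation'])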
        """
        rows, cols = self.df.shape
        numeric_cols = self.df.select_dtypes(include=np.number).columns.tolist()
        categorical_cols = self.df.select_dtypes(include=['object', 'category']).columns.tolist()
        datetime_cols = self.df.select_dtypes(include=['datetime64', 'datetimetz']).columns.tolist()

        # Identify potential long-form text columns for specialized analysis.
        # Values are cast to str defensively so object/category columns with
        # non-string entries do not break the .str accessor.
        text_cols = [
            col for col in categorical_cols
            if self.df[col].dropna().astype(str).str.len().mean() > 50
        ]
        
        high_corr_pairs = []
        if len(numeric_cols) > 1:
            corr_matrix = self.df[numeric_cols].corr().abs()
            upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
            high_corr_series = upper_tri.stack()
            high_corr_pairs = (
                high_corr_series[high_corr_series > 0.8]
                .reset_index()
                .rename(columns={'level_0': 'Feature 1', 'level_1': 'Feature 2', 0: 'Correlation'})
                .to_dict('records')
            )

        return {
            'shape': (rows, cols),
            'columns': self.df.columns.tolist(),
            'numeric_cols': numeric_cols,
            'categorical_cols': [c for c in categorical_cols if c not in text_cols],
            'datetime_cols': datetime_cols,
            'text_cols': text_cols,
            'memory_usage_mb': f"{self.df.memory_usage(deep=True).sum() / 1e6:.2f}",
            'total_missing': int(self.df.isnull().sum().sum()),
            'data_quality_score': round((self.df.notna().sum().sum() / self.df.size) * 100, 2),
            'high_corr_pairs': high_corr_pairs,
        }

    def get_profiling_reports(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """
        Generates detailed profiling reports for different data types.

        Returns:
            A tuple containing DataFrames for missing values, numeric stats,
            and categorical stats.
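
        Example (illustrative sketch; assumes ``analyzer = DataAnalyzer(df)``):

            missing_df, numeric_df, cat_df = analyzer.get_profiling_reports()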
        """
        # Missing Value Report
        missing = self.df.isnull().sum()
        missing_df = pd.DataFrame({
            'Missing Values': missing,
            'Percentage (%)': (missing / len(self.df) * 100).round(2)
        }).reset_index().rename(columns={'index': 'Column'}).sort_values('Missing Values', ascending=False)
        
        # Numeric Stats Report
        numeric_stats_df = pd.DataFrame()
        if self.metadata['numeric_cols']:
            numeric_stats = self.df[self.metadata['numeric_cols']].describe(percentiles=[.01, .25, .5, .75, .99]).T
            numeric_stats_df = numeric_stats.round(3).reset_index().rename(columns={'index': 'Feature'})

        # Categorical Stats Report
        cat_stats_df = pd.DataFrame()
        if self.metadata['categorical_cols']:
            cat_stats = self.df[self.metadata['categorical_cols']].describe(include=['object', 'category']).T
            cat_stats_df = cat_stats.reset_index().rename(columns={'index': 'Feature'})
            
        return missing_df, numeric_stats_df, cat_stats_df

    def get_overview_visuals(self) -> Tuple[go.Figure, go.Figure, go.Figure]:
        """
        Generates a suite of overview plots for a bird's-eye view of the data.

        Returns:
            A tuple of Plotly figures: Data Type Composition, Missing Values,
            and Correlation Matrix.
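
        Example (illustrative sketch; assumes an interactive Plotly environment):

            fig_types, fig_missing, fig_corr = analyzer.get_overview_visuals()
            fig_corr.show()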
        """
        meta = self.metadata
        dtype_counts = self.df.dtypes.astype(str).value_counts()
        fig_types = px.pie(
            values=dtype_counts.values, names=dtype_counts.index,
            title="<b>πŸ“Š Data Type Composition</b>", hole=0.4,
            color_discrete_sequence=px.colors.qualitative.Pastel
        )

        missing_counts = self.df.isnull().sum()
        missing_df = missing_counts[missing_counts > 0].reset_index(name='count')
        fig_missing = px.bar(
            missing_df, x='index', y='count',
            title="<b>πŸ•³οΈ Missing Values Distribution</b>",
            labels={'index': 'Column Name', 'count': 'Number of Missing Values'}
        ).update_xaxes(categoryorder="total descending")

        fig_corr = go.Figure()
        if len(meta['numeric_cols']) > 1:
            corr_matrix = self.df[meta['numeric_cols']].corr(method='spearman') # More robust to outliers
            fig_corr = px.imshow(
                corr_matrix, text_auto=".2f", aspect="auto",
                title="<b>πŸ”— Spearman Correlation Matrix</b>",
                color_continuous_scale='RdBu_r', zmin=-1, zmax=1
            )
        return fig_types, fig_missing, fig_corr
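

# ---------------------------------------------------------------------------
# Minimal usage sketch, not part of the analysis engine itself. The DataFrame
# below is synthetic and purely illustrative; column names and values are
# invented for the demo. Assumes pandas, numpy, and plotly are installed and
# that core.exceptions is importable (as it already is for this module).
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    rng = np.random.default_rng(42)
    demo_df = pd.DataFrame({
        "price": rng.normal(100.0, 15.0, 200),        # numeric feature
        "quantity": rng.integers(1, 50, 200),         # numeric feature
        "segment": rng.choice(["A", "B", "C"], 200),  # categorical feature
    })

    analyzer = DataAnalyzer(demo_df)
    print("Shape:", analyzer.metadata["shape"])
    print("Data quality score:", analyzer.metadata["data_quality_score"])

    missing_df, numeric_df, cat_df = analyzer.get_profiling_reports()
    print(numeric_df.head())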