Spaces:

whackthejacker
/

DataHubHub

Running

File size: 7,779 Bytes

43b66f1

import streamlit as st
import pandas as pd
import numpy as np
import json
from utils.dataset_utils import check_column_completeness, detect_outliers

def _detect_type_issue(col, series, sample_size=100, threshold=0.8):
    """
    Heuristically decide whether a text column really holds numbers or dates.

    Up to ``sample_size`` non-null values are converted with
    ``pd.to_numeric`` and then ``pd.to_datetime`` (both with
    ``errors='coerce'``). The first conversion that succeeds for strictly
    more than ``threshold`` of the sample wins; numeric is tried first.

    Args:
        col: Column label, copied into the returned record.
        series: The column's values (pandas Series).
        sample_size: Maximum number of non-null values to inspect.
        threshold: Fraction of the sample that must convert (exclusive).

    Returns:
        A dict with 'Column', 'Current Type', 'Suggested Type' and 'Issue'
        keys, or None when no better type is suggested.
    """
    sample = series.dropna().head(sample_size)
    if len(sample) == 0:
        return None

    candidates = (
        (pd.to_numeric, 'numeric',
         'Column contains mostly numeric values but is stored as text'),
        (pd.to_datetime, 'datetime',
         'Column contains mostly dates but is stored as text'),
    )
    for converter, suggested_type, issue in candidates:
        try:
            convertible = converter(sample, errors='coerce').notna().sum()
        except Exception:
            # Best-effort heuristic: a conversion failure only means "no
            # suggestion from this converter". Unlike a bare ``except:``,
            # KeyboardInterrupt and SystemExit are not swallowed.
            continue
        if convertible / len(sample) > threshold:
            return {
                'Column': col,
                'Current Type': 'object',
                'Suggested Type': suggested_type,
                'Issue': issue
            }
    return None


def _render_quality_issues(dataset, duplicate_rows, duplicate_percentage):
    """
    Render the "Data Quality Issues" tab content.

    Shows missing values per column, duplicate rows, suspected data type
    problems, and the result of ``check_column_completeness``.

    Args:
        dataset: The dataset being validated (pandas DataFrame).
        duplicate_rows: Number of duplicated rows in ``dataset``.
        duplicate_percentage: ``duplicate_rows`` as a percentage of all rows.
    """
    st.markdown("### Data Quality Issues")

    # Missing values, restricted to the columns that actually have some
    missing_by_col = dataset.isna().sum()
    missing_by_col = missing_by_col[missing_by_col > 0]

    if not missing_by_col.empty:
        st.markdown("#### Missing Values by Column")
        # A non-empty result implies at least one row, so the division is safe
        missing_df = pd.DataFrame({
            'Column': missing_by_col.index,
            'Missing Count': missing_by_col.values,
            'Percentage': (missing_by_col.values / dataset.shape[0] * 100).round(2)
        })
        # Traffic-light status: below 5% good, below 20% warning, else critical
        missing_df['Status'] = missing_df['Percentage'].apply(
            lambda x: "🟢 Good" if x < 5 else ("🟠 Warning" if x < 20 else "🔴 Critical")
        )

        st.dataframe(
            missing_df.style.format({
                'Percentage': '{:.2f}%'
            }).background_gradient(subset=['Percentage'], cmap='Reds'),
            use_container_width=True
        )
    else:
        st.success("No missing values found in the dataset!")

    # Duplicate rows
    if duplicate_rows > 0:
        st.markdown("#### Duplicate Rows")
        st.warning(f"Found {duplicate_rows} duplicate rows ({duplicate_percentage:.2f}% of the dataset)")

        # Only the repeated occurrences are listed; the first one is kept out
        if st.checkbox("Show duplicates"):
            st.dataframe(dataset[dataset.duplicated(keep='first')], use_container_width=True)
    else:
        st.success("No duplicate rows found in the dataset!")

    # Text columns that look like numbers or dates
    st.markdown("#### Column Data Types")
    type_issues = []
    for col in dataset.columns:
        if dataset[col].dtype == 'object':
            issue = _detect_type_issue(col, dataset[col])
            if issue is not None:
                type_issues.append(issue)

    if type_issues:
        st.dataframe(pd.DataFrame(type_issues), use_container_width=True)
    else:
        st.success("No data type issues detected!")

    # Column completeness, delegated to the shared dataset utility
    st.markdown("#### Column Completeness Check")
    completeness_results = check_column_completeness(dataset)
    if completeness_results:
        st.dataframe(pd.DataFrame(completeness_results), use_container_width=True)
    else:
        st.success("All columns have good completeness!")


def _render_anomaly_detection(dataset):
    """
    Render the "Anomaly Detection" tab content.

    Lets the user pick a numeric column, runs ``detect_outliers`` on it and
    shows the outlier count, the detection bounds, a box plot and,
    optionally, the outlier rows.

    Args:
        dataset: The dataset being validated (pandas DataFrame).
    """
    st.markdown("### Anomaly Detection")

    numeric_cols = dataset.select_dtypes(include=[np.number]).columns.tolist()
    if not numeric_cols:
        st.warning("No numeric columns found for outlier detection.")
        return

    selected_num_col = st.selectbox("Select column to check for outliers", numeric_cols)

    # NOTE(review): `outliers` is used below as a collection of index labels
    # (`index.isin` / `.loc`) - confirm against utils.dataset_utils.
    outliers, lower_bound, upper_bound = detect_outliers(dataset[selected_num_col])

    # Guard against an empty frame that still has numeric columns
    total_rows = len(dataset)
    outlier_percentage = (len(outliers) / total_rows) * 100 if total_rows > 0 else 0

    st.markdown(f"#### Outliers in column: {selected_num_col}")
    st.metric("Outliers Detected", f"{len(outliers)} ({outlier_percentage:.2f}%)")

    st.markdown(f"""
    **Bounds for outlier detection:**
    - Lower bound: {lower_bound:.4f}
    - Upper bound: {upper_bound:.4f}
    """)

    if len(outliers) == 0:
        st.success(f"No outliers detected in {selected_num_col}!")
        return

    # Imported lazily so plotly is only loaded when a chart is drawn
    import plotly.express as px

    # Copy only the plotted column instead of the whole dataset, then add
    # the flag used to colour the box plot
    plot_df = dataset[[selected_num_col]].copy()
    plot_df['is_outlier'] = plot_df.index.isin(outliers)

    fig = px.box(
        plot_df,
        y=selected_num_col,
        color='is_outlier',
        color_discrete_map={True: "#FF5757", False: "#2563EB"},
        title=f"Outliers in {selected_num_col}",
        labels={"is_outlier": "Is Outlier"}
    )
    st.plotly_chart(fig, use_container_width=True)

    # Option to show outliers in table
    if st.checkbox("Show outlier data"):
        st.dataframe(dataset.loc[outliers], use_container_width=True)


def render_dataset_validation(dataset, dataset_type):
    """
    Renders validation checks for the dataset.

    Displays four summary metrics (completeness, missing values, duplicate
    rows, quality score) followed by two tabs: "Data Quality Issues" and
    "Anomaly Detection". Shows a warning and returns early when no dataset
    is given.

    Args:
        dataset: The dataset to validate (pandas DataFrame)
        dataset_type: The type of dataset (csv, json, etc.). Not used by
            this function at present; kept for interface compatibility.
    """
    if dataset is None:
        st.warning("No dataset to validate.")
        return

    st.markdown("<h3>Dataset Validation</h3>", unsafe_allow_html=True)

    # Data quality metrics
    col1, col2, col3, col4 = st.columns(4)

    # Percentages fall back to 0 for empty frames to avoid dividing by zero
    row_count = dataset.shape[0]
    total_cells = row_count * dataset.shape[1]
    missing_cells = dataset.isna().sum().sum()
    missing_percentage = (missing_cells / total_cells) * 100 if total_cells > 0 else 0
    duplicate_rows = dataset.duplicated().sum()
    duplicate_percentage = (duplicate_rows / row_count) * 100 if row_count > 0 else 0

    with col1:
        st.metric("Completeness", f"{100 - missing_percentage:.2f}%")
    with col2:
        st.metric("Missing Values", f"{missing_cells:,} ({missing_percentage:.2f}%)")
    with col3:
        st.metric("Duplicate Rows", f"{duplicate_rows:,} ({duplicate_percentage:.2f}%)")
    with col4:
        # Quality score: 100 minus both penalty percentages, clamped to 0-100
        quality_score = 100 - (missing_percentage + duplicate_percentage)
        quality_score = max(0, min(100, quality_score))
        st.metric("Quality Score", f"{quality_score:.2f}/100")

    # Tabs for different validation aspects
    tab1, tab2 = st.tabs(["Data Quality Issues", "Anomaly Detection"])

    with tab1:
        _render_quality_issues(dataset, duplicate_rows, duplicate_percentage)

    with tab2:
        _render_anomaly_detection(dataset)