import streamlit as st
import pandas as pd
import numpy as np
import json
from utils.dataset_utils import check_column_completeness, detect_outliers
def _suggest_type_conversions(dataset):
    """Scan object-dtype columns for values stored as text that look numeric or datetime.

    Samples up to 100 non-null values per column; when more than 80% of the
    sample converts cleanly, a suggestion is recorded. The numeric check runs
    first and short-circuits the datetime check, because numeric-looking text
    would otherwise also satisfy the datetime parser.

    Args:
        dataset: pandas DataFrame to scan.

    Returns:
        list[dict]: one dict per flagged column with keys
        'Column', 'Current Type', 'Suggested Type', 'Issue'.
    """
    suggestions = []
    for col in dataset.columns:
        if dataset[col].dtype != 'object':
            continue
        sample = dataset[col].dropna().head(100)
        if sample.empty:
            continue
        # Narrow except clauses (was a bare `except:`): only conversion
        # failures should be tolerated, not e.g. KeyboardInterrupt.
        try:
            numeric_ratio = pd.to_numeric(sample, errors='coerce').notna().mean()
        except (ValueError, TypeError):
            numeric_ratio = 0.0
        if numeric_ratio > 0.8:  # more than 80% of the sample parses as numbers
            suggestions.append({
                'Column': col,
                'Current Type': 'object',
                'Suggested Type': 'numeric',
                'Issue': 'Column contains mostly numeric values but is stored as text'
            })
            continue  # don't also flag it as datetime
        try:
            datetime_ratio = pd.to_datetime(sample, errors='coerce').notna().mean()
        except (ValueError, TypeError):
            continue
        if datetime_ratio > 0.8:  # more than 80% of the sample parses as dates
            suggestions.append({
                'Column': col,
                'Current Type': 'object',
                'Suggested Type': 'datetime',
                'Issue': 'Column contains mostly dates but is stored as text'
            })
    return suggestions


def _render_quality_metrics(dataset):
    """Render the four headline quality metrics for *dataset*.

    Returns:
        tuple: (duplicate_rows, duplicate_percentage) — recomputed here once
        and reused by the issues tab so the two views cannot disagree.
    """
    n_rows, n_cols = dataset.shape
    total_cells = n_rows * n_cols
    missing_cells = dataset.isna().sum().sum()
    missing_percentage = (missing_cells / total_cells) * 100 if total_cells > 0 else 0
    duplicate_rows = dataset.duplicated().sum()
    duplicate_percentage = (duplicate_rows / n_rows) * 100 if n_rows > 0 else 0

    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("Completeness", f"{100 - missing_percentage:.2f}%")
    with col2:
        st.metric("Missing Values", f"{missing_cells:,} ({missing_percentage:.2f}%)")
    with col3:
        st.metric("Duplicate Rows", f"{duplicate_rows:,} ({duplicate_percentage:.2f}%)")
    with col4:
        # Simple 0-100 score that penalizes missing cells and duplicate rows.
        quality_score = 100 - (missing_percentage + duplicate_percentage)
        quality_score = max(0, min(100, quality_score))  # clamp to [0, 100]
        st.metric("Quality Score", f"{quality_score:.2f}/100")
    return duplicate_rows, duplicate_percentage


def _render_quality_issues_tab(dataset, duplicate_rows, duplicate_percentage):
    """Render the "Data Quality Issues" tab: missing values, duplicates,
    dtype suggestions, and the project-level column completeness check."""
    st.markdown("### Data Quality Issues")

    # Missing values, broken down per column (only columns that have any).
    missing_by_col = dataset.isna().sum()
    missing_by_col = missing_by_col[missing_by_col > 0]
    if not missing_by_col.empty:
        st.markdown("#### Missing Values by Column")
        missing_df = pd.DataFrame({
            'Column': missing_by_col.index,
            'Missing Count': missing_by_col.values,
            'Percentage': (missing_by_col.values / dataset.shape[0] * 100).round(2)
        })
        # Traffic-light status thresholds: <5% good, <20% warning, else critical.
        missing_df['Status'] = missing_df['Percentage'].apply(
            lambda x: "🟢 Good" if x < 5 else ("🟠 Warning" if x < 20 else "🔴 Critical")
        )
        st.dataframe(
            missing_df.style.format({
                'Percentage': '{:.2f}%'
            }).background_gradient(subset=['Percentage'], cmap='Reds'),
            use_container_width=True
        )
    else:
        st.success("No missing values found in the dataset!")

    # Duplicate rows (stats computed once in _render_quality_metrics).
    if duplicate_rows > 0:
        st.markdown("#### Duplicate Rows")
        st.warning(f"Found {duplicate_rows} duplicate rows ({duplicate_percentage:.2f}% of the dataset)")
        if st.checkbox("Show duplicates"):
            # keep='first' lists every occurrence after the first one.
            st.dataframe(dataset[dataset.duplicated(keep='first')], use_container_width=True)
    else:
        st.success("No duplicate rows found in the dataset!")

    # Text columns that should probably be numeric/datetime.
    st.markdown("#### Column Data Types")
    type_issues = _suggest_type_conversions(dataset)
    if type_issues:
        st.dataframe(pd.DataFrame(type_issues), use_container_width=True)
    else:
        st.success("No data type issues detected!")

    # Project-defined completeness rules from utils.dataset_utils.
    st.markdown("#### Column Completeness Check")
    completeness_results = check_column_completeness(dataset)
    if completeness_results:
        st.dataframe(pd.DataFrame(completeness_results), use_container_width=True)
    else:
        st.success("All columns have good completeness!")


def _render_anomaly_tab(dataset):
    """Render the "Anomaly Detection" tab: outlier stats and box plot for a
    user-selected numeric column (bounds come from utils.detect_outliers)."""
    st.markdown("### Anomaly Detection")
    numeric_cols = dataset.select_dtypes(include=[np.number]).columns.tolist()
    if not numeric_cols:
        st.warning("No numeric columns found for outlier detection.")
        return

    selected_num_col = st.selectbox("Select column to check for outliers", numeric_cols)
    outliers, lower_bound, upper_bound = detect_outliers(dataset[selected_num_col])
    # Guard the division: a zero-row frame would otherwise raise ZeroDivisionError.
    outlier_percentage = (len(outliers) / len(dataset)) * 100 if len(dataset) > 0 else 0

    st.markdown(f"#### Outliers in column: {selected_num_col}")
    st.metric("Outliers Detected", f"{len(outliers)} ({outlier_percentage:.2f}%)")
    st.markdown(f"""
    **Bounds for outlier detection:**
    - Lower bound: {lower_bound:.4f}
    - Upper bound: {upper_bound:.4f}
    """)

    if len(outliers) > 0:
        # Local import: plotly is only needed when there is something to plot.
        import plotly.express as px
        # Extra column drives the two-color outlier highlighting in the plot.
        temp_df = dataset.copy()
        temp_df['is_outlier'] = temp_df.index.isin(outliers)
        fig = px.box(
            temp_df,
            y=selected_num_col,
            color='is_outlier',
            color_discrete_map={True: "#FF5757", False: "#2563EB"},
            title=f"Outliers in {selected_num_col}",
            labels={"is_outlier": "Is Outlier"}
        )
        st.plotly_chart(fig, use_container_width=True)
        if st.checkbox("Show outlier data"):
            st.dataframe(dataset.loc[outliers], use_container_width=True)
    else:
        st.success(f"No outliers detected in {selected_num_col}!")


def render_dataset_validation(dataset, dataset_type):
    """
    Renders validation checks for the dataset.

    Args:
        dataset: The dataset to validate (pandas DataFrame)
        dataset_type: The type of dataset (csv, json, etc.). Currently unused
            here; kept for interface compatibility with callers.
    """
    if dataset is None:
        st.warning("No dataset to validate.")
        return

    st.markdown("<h3>Dataset Validation</h3>", unsafe_allow_html=True)

    # Headline metrics; duplicate stats are reused by the issues tab below.
    duplicate_rows, duplicate_percentage = _render_quality_metrics(dataset)

    tab1, tab2 = st.tabs(["Data Quality Issues", "Anomaly Detection"])
    with tab1:
        _render_quality_issues_tab(dataset, duplicate_rows, duplicate_percentage)
    with tab2:
        _render_anomaly_tab(dataset)