import pandas as pd
import numpy as np
import gradio as gr
import sqlite3
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime
import logging

logging.basicConfig(level=logging.INFO)

class DataQualitySystem:
    def __init__(self):
        self.db_name = 'data_quality.db'
        self.setup_database()

    def setup_database(self):
        """Create the SQLite tables for quality metrics and user feedback if they do not exist."""
        conn = sqlite3.connect(self.db_name)
        cursor = conn.cursor()

        cursor.execute('''
            CREATE TABLE IF NOT EXISTS quality_metrics
            (timestamp TEXT, metric TEXT, value REAL)
        ''')

        cursor.execute('''
            CREATE TABLE IF NOT EXISTS user_feedback
            (timestamp TEXT, row_index INTEGER, feedback TEXT)
        ''')

        conn.commit()
        conn.close()

    def load_and_process_data(self, file):
        """Load an uploaded CSV/XLSX file, run quality checks, and flag anomalies."""
        try:
            # Gradio's File component may hand back an object with a .name attribute
            # or a plain path string, depending on the Gradio version.
            file_path = file.name if hasattr(file, 'name') else str(file)
            if file_path.endswith('.csv'):
                df = pd.read_csv(file_path)
            elif file_path.endswith('.xlsx'):
                df = pd.read_excel(file_path)
            else:
                return None, "Unsupported file format. Please use CSV or XLSX."

            # Basic quality metrics: row count, nulls, duplicates, per-column spread.
            metrics = self.initial_data_checks(df)

            # Flag anomalous rows with Isolation Forest.
            df_with_anomalies = self.detect_anomalies(df)

            # Persist the metrics so quality can be tracked across uploads.
            self.store_quality_metrics(metrics)

            return df_with_anomalies, "Data processed successfully!"
        except Exception as e:
            logging.error(f"Error processing file: {e}")
            return None, f"Error processing file: {e}"

    def initial_data_checks(self, df):
        """Compute basic quality metrics: row count, missing values, duplicates, and spread."""
        metrics = {
            'total_rows': len(df),
            'null_values': int(df.isnull().sum().sum()),
            'duplicate_entries': int(df.duplicated().sum()),
        }

        # Record the standard deviation of each numeric column as a rough spread measure.
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            metrics[f'std_dev_{col}'] = df[col].std()

        return metrics

    def detect_anomalies(self, df):
        """Flag anomalous rows with an Isolation Forest over the numeric columns.
        Adds 'anomaly' (-1 = anomalous, 1 = normal) and 'anomaly_score' (lower = more anomalous)."""
        numeric_df = df.select_dtypes(include=[np.number])
        if len(numeric_df.columns) > 0:
            # Isolation Forest cannot handle NaNs, so impute with the column mean
            # (and 0 for all-NaN columns) before scaling; adjust as needed.
            numeric_df = numeric_df.fillna(numeric_df.mean()).fillna(0)
            scaler = StandardScaler()
            scaled_data = scaler.fit_transform(numeric_df)
            model = IsolationForest(contamination=0.1, random_state=42)
            df['anomaly'] = model.fit_predict(scaled_data)
            df['anomaly_score'] = model.score_samples(scaled_data)
        return df

    def store_quality_metrics(self, metrics):
        """Persist the computed metrics to SQLite, one row per metric, with a shared timestamp."""
        conn = sqlite3.connect(self.db_name)
        cursor = conn.cursor()
        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        for metric, value in metrics.items():
            cursor.execute(
                'INSERT INTO quality_metrics (timestamp, metric, value) VALUES (?, ?, ?)',
                (timestamp, metric, float(value))
            )

        conn.commit()
        conn.close()

    def save_feedback(self, index, feedback):
        """Store user feedback about a specific row in SQLite."""
        conn = sqlite3.connect(self.db_name)
        cursor = conn.cursor()
        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        cursor.execute(
            'INSERT INTO user_feedback (timestamp, row_index, feedback) VALUES (?, ?, ?)',
            # gr.Number returns a float, so cast to int to match the INTEGER column.
            (timestamp, int(index) if index is not None else None, feedback)
        )

        conn.commit()
        conn.close()
        return "Feedback saved successfully!"

    def generate_report(self, df):
        """Build the three report figures: summary statistics, anomaly scores, and missing values."""
        if df is None:
            return None, None, None

        # Summary statistics for numeric columns, rendered as a Plotly table.
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        summary_stats = df[numeric_cols].describe()
        summary_fig = go.Figure(data=[
            go.Table(
                header=dict(values=['Statistic'] + list(summary_stats.columns)),
                cells=dict(values=[summary_stats.index] +
                           [summary_stats[col].tolist() for col in summary_stats.columns])
            )
        ])

        # Distribution of anomaly scores, if anomaly detection ran.
        if 'anomaly_score' in df.columns:
            anomaly_fig = px.histogram(df, x='anomaly_score',
                                       title='Distribution of Anomaly Scores')
        else:
            anomaly_fig = None

        # Missing-value counts per column.
        missing_data = df.isnull().sum()
        missing_fig = px.bar(x=missing_data.index, y=missing_data.values,
                             labels={'x': 'Column', 'y': 'Missing values'},
                             title='Missing Values by Column')

        return summary_fig, anomaly_fig, missing_fig
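
# Hypothetical helper, not part of the original interface: quality metrics are written on
# every upload but never read back above, so a sketch like this (assuming the same
# data_quality.db file) is one way to inspect how they evolve across uploads.
def load_metric_history(db_name='data_quality.db'):
    """Return all stored quality metrics as a DataFrame, oldest first."""
    conn = sqlite3.connect(db_name)
    try:
        return pd.read_sql_query(
            'SELECT timestamp, metric, value FROM quality_metrics ORDER BY timestamp',
            conn
        )
    finally:
        conn.close()
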
def create_gradio_interface():
    system = DataQualitySystem()

    def process_file(file):
        df, message = system.load_and_process_data(file)
        if df is not None:
            summary_fig, anomaly_fig, missing_fig = system.generate_report(df)
            return message, summary_fig, anomaly_fig, missing_fig
        return message, None, None, None

    def submit_feedback(index, feedback):
        return system.save_feedback(index, feedback)

    with gr.Blocks() as app:
        gr.Markdown("# Data Quality Assurance System")

        with gr.Row():
            file_input = gr.File(label="Upload Data File (CSV or XLSX)")

        with gr.Row():
            process_btn = gr.Button("Process Data")

        output_message = gr.Textbox(label="Status")

        with gr.Tabs():
            with gr.TabItem("Summary Statistics"):
                summary_plot = gr.Plot()
            with gr.TabItem("Anomaly Distribution"):
                anomaly_plot = gr.Plot()
            with gr.TabItem("Missing Values"):
                missing_plot = gr.Plot()

        with gr.Row():
            feedback_index = gr.Number(label="Row Index")
            feedback_text = gr.Textbox(label="Feedback")
            feedback_btn = gr.Button("Submit Feedback")

        process_btn.click(
            process_file,
            inputs=[file_input],
            outputs=[output_message, summary_plot, anomaly_plot, missing_plot]
        )

        feedback_btn.click(
            submit_feedback,
            inputs=[feedback_index, feedback_text],
            outputs=[output_message]
        )

    return app

if __name__ == "__main__":
    app = create_gradio_interface()
    app.launch()
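
# A quick, UI-free way to exercise the pipeline (illustrative only: "sample.csv" is a
# hypothetical local file, and SimpleNamespace stands in for Gradio's upload object):
#
#     from types import SimpleNamespace
#     dq = DataQualitySystem()
#     df, msg = dq.load_and_process_data(SimpleNamespace(name="sample.csv"))
#     print(msg)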