"""Gradio app that profiles an uploaded CSV/XLSX file for data quality.

The app computes basic quality metrics, flags anomalous rows with an
Isolation Forest, persists metrics and user feedback in a local SQLite
database, and renders summary / anomaly / missing-value plots.
"""

import logging
import os
import sqlite3
from contextlib import closing
from datetime import datetime

import gradio as gr
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

# Set up logging
logging.basicConfig(level=logging.INFO)


class DataQualitySystem:
    """Load tabular data, compute quality metrics and persist them in SQLite."""

    def __init__(self):
        """Set the database file name and make sure the tables exist."""
        self.db_name = 'data_quality.db'
        self.setup_database()

    def setup_database(self):
        """Create the ``quality_metrics`` and ``user_feedback`` tables if missing."""
        # ``closing`` guarantees the connection is released even when a
        # statement fails; the inner ``conn`` context commits or rolls back.
        with closing(sqlite3.connect(self.db_name)) as conn, conn:
            conn.execute('''
                CREATE TABLE IF NOT EXISTS quality_metrics
                (timestamp TEXT, metric TEXT, value REAL)
            ''')
            conn.execute('''
                CREATE TABLE IF NOT EXISTS user_feedback
                (timestamp TEXT, row_index INTEGER, feedback TEXT)
            ''')

    def load_and_process_data(self, file):
        """Read an uploaded file, run the checks and store the metrics.

        Args:
            file: Either a path string or an object exposing the path as
                ``.name`` (what the Gradio file component hands over).

        Returns:
            A ``(dataframe, message)`` tuple. ``dataframe`` is ``None``
            whenever processing failed; ``message`` explains the outcome.
        """
        if file is None:
            return None, "No file provided. Please upload a CSV or XLSX file."

        try:
            # Accept both upload objects (``.name``) and plain path strings.
            file_path = getattr(file, 'name', file)
            # Compare the extension case-insensitively so ``DATA.CSV`` works.
            extension = os.path.splitext(str(file_path))[1].lower()

            if extension == '.csv':
                df = pd.read_csv(file_path)
            elif extension == '.xlsx':
                df = pd.read_excel(file_path)
            else:
                return None, "Unsupported file format. Please use CSV or XLSX."

            # Initial data checks (computed before the anomaly columns are
            # added so they do not leak into the stored metrics).
            metrics = self.initial_data_checks(df)

            # Anomaly detection
            df_with_anomalies = self.detect_anomalies(df)

            # Store quality metrics
            self.store_quality_metrics(metrics)

            return df_with_anomalies, "Data processed successfully!"
        except Exception as e:
            # UI boundary: report the failure to the user instead of crashing.
            logging.exception(f"Error processing file: {str(e)}")
            return None, f"Error processing file: {str(e)}"

    def initial_data_checks(self, df):
        """Return a dict of basic quality metrics for ``df``.

        The dict holds ``total_rows``, ``null_values``, ``duplicate_entries``
        and one ``std_dev_<column>`` entry per numeric column.
        """
        metrics = {
            'total_rows': len(df),
            'null_values': df.isnull().sum().sum(),
            'duplicate_entries': df.duplicated().sum(),
        }

        # Calculate standard deviation for numeric columns
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            metrics[f'std_dev_{col}'] = df[col].std()

        return metrics

    def detect_anomalies(self, df):
        """Add ``anomaly`` and ``anomaly_score`` columns using an Isolation Forest.

        The model is fitted on the standardized numeric columns. ``df`` is
        modified in place and also returned. When there is nothing numeric
        to fit on (no numeric columns, only all-null numeric columns, or no
        rows) the frame is returned without the two extra columns.
        """
        features = df.select_dtypes(include=[np.number])
        # Columns that are entirely null carry no signal and have no median.
        features = features.dropna(axis=1, how='all')
        if features.empty:
            return df

        # Missing values are expected in data under quality review; impute
        # them with the column median for modelling only, so the model input
        # is NaN-free while the original values in ``df`` stay untouched.
        features = features.fillna(features.median())

        scaled_data = StandardScaler().fit_transform(features)
        model = IsolationForest(contamination=0.1, random_state=42)
        df['anomaly'] = model.fit_predict(scaled_data)
        df['anomaly_score'] = model.score_samples(scaled_data)

        return df

    def store_quality_metrics(self, metrics):
        """Insert every metric into ``quality_metrics`` under one timestamp."""
        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        rows = [
            (timestamp, metric, float(value))
            for metric, value in metrics.items()
        ]
        with closing(sqlite3.connect(self.db_name)) as conn, conn:
            conn.executemany(
                'INSERT INTO quality_metrics (timestamp, metric, value) VALUES (?, ?, ?)',
                rows,
            )

    def save_feedback(self, index, feedback):
        """Insert one feedback row into ``user_feedback`` and return a status text."""
        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        with closing(sqlite3.connect(self.db_name)) as conn, conn:
            conn.execute(
                'INSERT INTO user_feedback (timestamp, row_index, feedback) VALUES (?, ?, ?)',
                (timestamp, index, feedback),
            )
        return "Feedback saved successfully!"

    def generate_report(self, df):
        """Build the three report figures for ``df``.

        Returns:
            ``(summary_fig, anomaly_fig, missing_fig)``. All three are
            ``None`` when ``df`` is ``None``; ``anomaly_fig`` is ``None``
            when ``df`` has no ``anomaly_score`` column.
        """
        if df is None:
            return None, None, None

        # Create summary statistics plot
        numeric_df = df.select_dtypes(include=[np.number])
        if numeric_df.shape[1] > 0:
            summary_stats = numeric_df.describe()
            header_values = ['Statistic'] + list(summary_stats.columns)
            # One list per table column: the statistic names first, then the
            # values of each described column.
            cell_values = (
                [list(summary_stats.index)]
                + summary_stats.to_numpy().T.tolist()
            )
        else:
            # ``describe()`` raises on a frame without columns; show an
            # empty table instead of failing the whole report.
            header_values = ['Statistic']
            cell_values = [[]]

        summary_fig = go.Figure(data=[
            go.Table(
                header=dict(values=header_values),
                cells=dict(values=cell_values),
            )
        ])

        # Create anomaly distribution plot
        if 'anomaly_score' in df.columns:
            anomaly_fig = px.histogram(df, x='anomaly_score',
                                       title='Distribution of Anomaly Scores')
        else:
            anomaly_fig = None

        # Create missing values plot
        missing_data = df.isnull().sum()
        missing_fig = px.bar(x=missing_data.index, y=missing_data.values,
                             title='Missing Values by Column')

        return summary_fig, anomaly_fig, missing_fig


def create_gradio_interface():
    """Build and return the Gradio Blocks app wired to a DataQualitySystem."""
    system = DataQualitySystem()

    def process_file(file):
        """Process the upload and return ``(status, summary, anomaly, missing)``."""
        df, message = system.load_and_process_data(file)
        if df is None:
            return message, None, None, None
        try:
            summary_fig, anomaly_fig, missing_fig = system.generate_report(df)
        except Exception as e:
            # UI boundary: surface plotting failures in the status box.
            logging.exception("Error generating report")
            return f"Error generating report: {str(e)}", None, None, None
        return message, summary_fig, anomaly_fig, missing_fig

    def submit_feedback(index, feedback):
        """Persist the feedback and return the status text."""
        return system.save_feedback(index, feedback)

    # Create the interface
    with gr.Blocks() as app:
        gr.Markdown("# Data Quality Assurance System")

        with gr.Row():
            file_input = gr.File(label="Upload Data File (CSV or XLSX)")

        with gr.Row():
            process_btn = gr.Button("Process Data")

        output_message = gr.Textbox(label="Status")

        with gr.Tabs():
            with gr.TabItem("Summary Statistics"):
                summary_plot = gr.Plot()
            with gr.TabItem("Anomaly Distribution"):
                anomaly_plot = gr.Plot()
            with gr.TabItem("Missing Values"):
                missing_plot = gr.Plot()

        with gr.Row():
            feedback_index = gr.Number(label="Row Index")
            feedback_text = gr.Textbox(label="Feedback")
            feedback_btn = gr.Button("Submit Feedback")

        # Set up event handlers
        process_btn.click(
            process_file,
            inputs=[file_input],
            outputs=[output_message, summary_plot, anomaly_plot, missing_plot]
        )

        feedback_btn.click(
            submit_feedback,
            inputs=[feedback_index, feedback_text],
            outputs=[output_message]
        )

    return app


# Launch the interface
if __name__ == "__main__":
    app = create_gradio_interface()
    app.launch()