Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
import gradio as gr
|
4 |
+
import sqlite3
|
5 |
+
import os
|
6 |
+
from sklearn.ensemble import IsolationForest
|
7 |
+
from sklearn.preprocessing import StandardScaler
|
8 |
+
import plotly.express as px
|
9 |
+
import plotly.graph_objects as go
|
10 |
+
from datetime import datetime
|
11 |
+
import logging
|
12 |
+
|
13 |
+
# Configure the root logger so the module-level logging.error(...) calls
# below are emitted (INFO and above) to stderr.
logging.basicConfig(level=logging.INFO)
|
15 |
+
|
16 |
+
class DataQualitySystem:
    """Data-quality pipeline for uploaded tabular files.

    Loads CSV/XLSX data, computes basic quality metrics, flags outliers with
    an IsolationForest, and persists metrics and user feedback to a local
    SQLite database.
    """

    def __init__(self):
        # SQLite file is created in the current working directory on first use.
        self.db_name = 'data_quality.db'
        self.setup_database()

    def setup_database(self):
        """Create the metrics and feedback tables if they do not exist yet."""
        conn = sqlite3.connect(self.db_name)
        try:
            cursor = conn.cursor()
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS quality_metrics
                (timestamp TEXT, metric TEXT, value REAL)
            ''')
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS user_feedback
                (timestamp TEXT, row_index INTEGER, feedback TEXT)
            ''')
            conn.commit()
        finally:
            # Close even when an execute raises, so the handle is not leaked.
            conn.close()

    def load_and_process_data(self, file):
        """Read an uploaded file, run quality checks, and store the metrics.

        Parameters
        ----------
        file : object with a ``.name`` attribute (Gradio file wrapper).

        Returns
        -------
        (DataFrame, str) on success; (None, str) on failure or bad format.
        """
        try:
            file_path = file.name
            # Compare extensions case-insensitively so '.CSV' / '.XLSX'
            # uploads are accepted too (previously rejected).
            lower_path = file_path.lower()
            if lower_path.endswith('.csv'):
                df = pd.read_csv(file_path)
            elif lower_path.endswith('.xlsx'):
                df = pd.read_excel(file_path)
            else:
                return None, "Unsupported file format. Please use CSV or XLSX."

            # Initial data checks
            metrics = self.initial_data_checks(df)

            # Anomaly detection (adds 'anomaly'/'anomaly_score' columns)
            df_with_anomalies = self.detect_anomalies(df)

            # Store metrics
            self.store_quality_metrics(metrics)

            return df_with_anomalies, "Data processed successfully!"

        except Exception as e:
            # Broad catch is deliberate: any parse/DB failure must surface
            # as a status message in the UI instead of crashing the app.
            logging.error(f"Error processing file: {str(e)}")
            return None, f"Error processing file: {str(e)}"

    def initial_data_checks(self, df):
        """Return basic quality metrics for *df*.

        Keys: 'total_rows', 'null_values' (total null cells),
        'duplicate_entries' (duplicate rows), and one 'std_dev_<col>'
        entry per numeric column.
        """
        metrics = {
            'total_rows': len(df),
            'null_values': df.isnull().sum().sum(),
            'duplicate_entries': df.duplicated().sum(),
        }

        # Standard deviation only makes sense for numeric columns.
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            metrics[f'std_dev_{col}'] = df[col].std()

        return metrics

    def detect_anomalies(self, df):
        """Flag outlier rows using an IsolationForest over numeric columns.

        Adds 'anomaly' (-1 outlier / 1 inlier) and 'anomaly_score' columns
        to *df* in place and returns it. If there are no numeric columns,
        *df* is returned unchanged.
        """
        numeric_df = df.select_dtypes(include=[np.number])
        if len(numeric_df.columns) > 0:
            scaler = StandardScaler()
            scaled_data = scaler.fit_transform(numeric_df)
            # contamination=0.1 assumes ~10% outliers; fixed seed keeps the
            # flags reproducible across runs.
            model = IsolationForest(contamination=0.1, random_state=42)
            df['anomaly'] = model.fit_predict(scaled_data)
            df['anomaly_score'] = model.score_samples(scaled_data)
        return df

    def store_quality_metrics(self, metrics):
        """Insert each metric/value pair with one shared timestamp."""
        conn = sqlite3.connect(self.db_name)
        try:
            cursor = conn.cursor()
            timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

            for metric, value in metrics.items():
                cursor.execute(
                    'INSERT INTO quality_metrics (timestamp, metric, value) VALUES (?, ?, ?)',
                    (timestamp, metric, float(value))
                )

            conn.commit()
        finally:
            # Guarantee the connection is released on error paths too.
            conn.close()

    def save_feedback(self, index, feedback):
        """Persist one row of user feedback; returns a UI status string."""
        conn = sqlite3.connect(self.db_name)
        try:
            cursor = conn.cursor()
            timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

            cursor.execute(
                'INSERT INTO user_feedback (timestamp, row_index, feedback) VALUES (?, ?, ?)',
                (timestamp, index, feedback)
            )

            conn.commit()
        finally:
            conn.close()
        return "Feedback saved successfully!"

    def generate_report(self, df):
        """Build the three report figures for *df*.

        Returns (summary table, anomaly-score histogram, missing-values bar
        chart) as Plotly figures; (None, None, None) when *df* is None, and
        the histogram is None when no anomaly scores are present.
        """
        if df is None:
            return None, None, None

        # Summary-statistics table over numeric columns only.
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        summary_stats = df[numeric_cols].describe()
        summary_fig = go.Figure(data=[
            go.Table(
                header=dict(values=['Statistic'] + list(summary_stats.columns)),
                cells=dict(values=[summary_stats.index] + [summary_stats[col] for col in summary_stats.columns])
            )
        ])

        # Anomaly distribution plot (only if detect_anomalies ran).
        if 'anomaly_score' in df.columns:
            anomaly_fig = px.histogram(df, x='anomaly_score',
                                       title='Distribution of Anomaly Scores')
        else:
            anomaly_fig = None

        # Missing values per column.
        missing_data = df.isnull().sum()
        missing_fig = px.bar(x=missing_data.index, y=missing_data.values,
                             title='Missing Values by Column')

        return summary_fig, anomaly_fig, missing_fig
|
142 |
+
|
143 |
+
def create_gradio_interface():
    """Build and return the Gradio Blocks UI wired to a DataQualitySystem."""
    system = DataQualitySystem()

    def process_file(file):
        # Run the full pipeline; report figures are generated only on success.
        df, message = system.load_and_process_data(file)
        if df is not None:
            summary_fig, anomaly_fig, missing_fig = system.generate_report(df)
            return message, summary_fig, anomaly_fig, missing_fig
        return message, None, None, None

    def submit_feedback(index, feedback):
        return system.save_feedback(index, feedback)

    # Create the interface
    with gr.Blocks() as app:
        # BUG FIX: the original call split the header string across two lines
        # with stray text outside the literal, which is a SyntaxError and
        # prevented the module from importing at all.
        gr.Markdown("# Data Quality Assurance System")

        with gr.Row():
            file_input = gr.File(label="Upload Data File (CSV or XLSX)")

        with gr.Row():
            process_btn = gr.Button("Process Data")

        output_message = gr.Textbox(label="Status")

        with gr.Tabs():
            with gr.TabItem("Summary Statistics"):
                summary_plot = gr.Plot()
            with gr.TabItem("Anomaly Distribution"):
                anomaly_plot = gr.Plot()
            with gr.TabItem("Missing Values"):
                missing_plot = gr.Plot()

        with gr.Row():
            feedback_index = gr.Number(label="Row Index")
            feedback_text = gr.Textbox(label="Feedback")
            feedback_btn = gr.Button("Submit Feedback")

        # Set up event handlers
        process_btn.click(
            process_file,
            inputs=[file_input],
            outputs=[output_message, summary_plot, anomaly_plot, missing_plot]
        )

        feedback_btn.click(
            submit_feedback,
            inputs=[feedback_index, feedback_text],
            outputs=[output_message]
        )

    return app
|
196 |
+
|
197 |
+
# Script entry point: build the UI and start the local Gradio server.
if __name__ == "__main__":
    interface = create_gradio_interface()
    interface.launch()
|