Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
import gradio as gr
|
4 |
+
import sqlite3
|
5 |
+
import os
|
6 |
+
from sklearn.ensemble import IsolationForest
|
7 |
+
from sklearn.preprocessing import StandardScaler
|
8 |
+
import plotly.express as px
|
9 |
+
import plotly.graph_objects as go
|
10 |
+
from datetime import datetime
|
11 |
+
import logging
|
12 |
+
|
13 |
+
# Configure the root logger so the module-level logging.error(...) calls
# below are emitted (INFO and above) to stderr.
logging.basicConfig(level=logging.INFO)
|
15 |
+
|
16 |
+
class DataQualitySystem:
    """Data-quality pipeline for uploaded tabular files.

    Loads CSV/XLSX data, computes basic quality metrics, flags outliers with
    an IsolationForest, and persists metrics and user feedback to a local
    SQLite database.
    """

    def __init__(self):
        # SQLite file is created in the current working directory on first use.
        self.db_name = 'data_quality.db'
        self.setup_database()

    def setup_database(self):
        """Create the metrics and feedback tables if they do not exist yet."""
        conn = sqlite3.connect(self.db_name)
        try:
            cursor = conn.cursor()
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS quality_metrics
                (timestamp TEXT, metric TEXT, value REAL)
            ''')
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS user_feedback
                (timestamp TEXT, row_index INTEGER, feedback TEXT)
            ''')
            conn.commit()
        finally:
            # Close even when an execute raises, so the handle is not leaked.
            conn.close()

    def load_and_process_data(self, file):
        """Read an uploaded file, run quality checks, and store the metrics.

        Parameters
        ----------
        file : object with a ``.name`` attribute (Gradio file wrapper).

        Returns
        -------
        (DataFrame, str) on success; (None, str) on failure or bad format.
        """
        try:
            file_path = file.name
            # Compare extensions case-insensitively so '.CSV' / '.XLSX'
            # uploads are accepted too (previously rejected).
            lower_path = file_path.lower()
            if lower_path.endswith('.csv'):
                df = pd.read_csv(file_path)
            elif lower_path.endswith('.xlsx'):
                df = pd.read_excel(file_path)
            else:
                return None, "Unsupported file format. Please use CSV or XLSX."

            # Initial data checks
            metrics = self.initial_data_checks(df)

            # Anomaly detection (adds 'anomaly'/'anomaly_score' columns)
            df_with_anomalies = self.detect_anomalies(df)

            # Store metrics
            self.store_quality_metrics(metrics)

            return df_with_anomalies, "Data processed successfully!"

        except Exception as e:
            # Broad catch is deliberate: any parse/DB failure must surface
            # as a status message in the UI instead of crashing the app.
            logging.error(f"Error processing file: {str(e)}")
            return None, f"Error processing file: {str(e)}"

    def initial_data_checks(self, df):
        """Return basic quality metrics for *df*.

        Keys: 'total_rows', 'null_values' (total null cells),
        'duplicate_entries' (duplicate rows), and one 'std_dev_<col>'
        entry per numeric column.
        """
        metrics = {
            'total_rows': len(df),
            'null_values': df.isnull().sum().sum(),
            'duplicate_entries': df.duplicated().sum(),
        }

        # Standard deviation only makes sense for numeric columns.
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            metrics[f'std_dev_{col}'] = df[col].std()

        return metrics

    def detect_anomalies(self, df):
        """Flag outlier rows using an IsolationForest over numeric columns.

        Adds 'anomaly' (-1 outlier / 1 inlier) and 'anomaly_score' columns
        to *df* in place and returns it. If there are no numeric columns,
        *df* is returned unchanged.
        """
        numeric_df = df.select_dtypes(include=[np.number])
        if len(numeric_df.columns) > 0:
            scaler = StandardScaler()
            scaled_data = scaler.fit_transform(numeric_df)
            # contamination=0.1 assumes ~10% outliers; fixed seed keeps the
            # flags reproducible across runs.
            model = IsolationForest(contamination=0.1, random_state=42)
            df['anomaly'] = model.fit_predict(scaled_data)
            df['anomaly_score'] = model.score_samples(scaled_data)
        return df

    def store_quality_metrics(self, metrics):
        """Insert each metric/value pair with one shared timestamp."""
        conn = sqlite3.connect(self.db_name)
        try:
            cursor = conn.cursor()
            timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

            for metric, value in metrics.items():
                cursor.execute(
                    'INSERT INTO quality_metrics (timestamp, metric, value) VALUES (?, ?, ?)',
                    (timestamp, metric, float(value))
                )

            conn.commit()
        finally:
            # Guarantee the connection is released on error paths too.
            conn.close()

    def save_feedback(self, index, feedback):
        """Persist one row of user feedback; returns a UI status string."""
        conn = sqlite3.connect(self.db_name)
        try:
            cursor = conn.cursor()
            timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

            cursor.execute(
                'INSERT INTO user_feedback (timestamp, row_index, feedback) VALUES (?, ?, ?)',
                (timestamp, index, feedback)
            )

            conn.commit()
        finally:
            conn.close()
        return "Feedback saved successfully!"

    def generate_report(self, df):
        """Build the three report figures for *df*.

        Returns (summary table, anomaly-score histogram, missing-values bar
        chart) as Plotly figures; (None, None, None) when *df* is None, and
        the histogram is None when no anomaly scores are present.
        """
        if df is None:
            return None, None, None

        # Summary-statistics table over numeric columns only.
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        summary_stats = df[numeric_cols].describe()
        summary_fig = go.Figure(data=[
            go.Table(
                header=dict(values=['Statistic'] + list(summary_stats.columns)),
                cells=dict(values=[summary_stats.index] + [summary_stats[col] for col in summary_stats.columns])
            )
        ])

        # Anomaly distribution plot (only if detect_anomalies ran).
        if 'anomaly_score' in df.columns:
            anomaly_fig = px.histogram(df, x='anomaly_score',
                                       title='Distribution of Anomaly Scores')
        else:
            anomaly_fig = None

        # Missing values per column.
        missing_data = df.isnull().sum()
        missing_fig = px.bar(x=missing_data.index, y=missing_data.values,
                             title='Missing Values by Column')

        return summary_fig, anomaly_fig, missing_fig
|
142 |
+
|
143 |
+
def create_gradio_interface():
    """Build and return the Gradio Blocks UI wired to a DataQualitySystem."""
    system = DataQualitySystem()

    def process_file(file):
        # Run the full pipeline; report figures are generated only on success.
        df, message = system.load_and_process_data(file)
        if df is not None:
            summary_fig, anomaly_fig, missing_fig = system.generate_report(df)
            return message, summary_fig, anomaly_fig, missing_fig
        return message, None, None, None

    def submit_feedback(index, feedback):
        return system.save_feedback(index, feedback)

    # Create the interface
    with gr.Blocks() as app:
        # BUG FIX: the original call split the header string across two lines
        # with stray text outside the literal, which is a SyntaxError and
        # prevented the module from importing at all.
        gr.Markdown("# Data Quality Assurance System")

        with gr.Row():
            file_input = gr.File(label="Upload Data File (CSV or XLSX)")

        with gr.Row():
            process_btn = gr.Button("Process Data")

        output_message = gr.Textbox(label="Status")

        with gr.Tabs():
            with gr.TabItem("Summary Statistics"):
                summary_plot = gr.Plot()
            with gr.TabItem("Anomaly Distribution"):
                anomaly_plot = gr.Plot()
            with gr.TabItem("Missing Values"):
                missing_plot = gr.Plot()

        with gr.Row():
            feedback_index = gr.Number(label="Row Index")
            feedback_text = gr.Textbox(label="Feedback")
            feedback_btn = gr.Button("Submit Feedback")

        # Set up event handlers
        process_btn.click(
            process_file,
            inputs=[file_input],
            outputs=[output_message, summary_plot, anomaly_plot, missing_plot]
        )

        feedback_btn.click(
            submit_feedback,
            inputs=[feedback_index, feedback_text],
            outputs=[output_message]
        )

    return app
|
196 |
+
|
197 |
+
# Script entry point: build the UI and start the local Gradio server.
if __name__ == "__main__":
    interface = create_gradio_interface()
    interface.launch()
|