Anupam251272 committed on
Commit
4dca928
·
verified ·
1 Parent(s): f2b76a8

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +200 -0
app.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import gradio as gr
4
+ import sqlite3
5
+ import os
6
+ from sklearn.ensemble import IsolationForest
7
+ from sklearn.preprocessing import StandardScaler
8
+ import plotly.express as px
9
+ import plotly.graph_objects as go
10
+ from datetime import datetime
11
+ import logging
12
+
13
# Configure the root logger once at import time so INFO-and-above records are emitted.
logging.basicConfig(level=logging.INFO)
15
+
16
class DataQualitySystem:
    """Run data-quality checks on uploaded tabular files and persist the results.

    Quality metrics and user feedback are written to the SQLite database
    whose path is stored in ``self.db_name``.
    """

    def __init__(self):
        self.db_name = 'data_quality.db'
        self.setup_database()

    def _execute(self, statements):
        """Run ``(sql, params)`` pairs in one transaction and always close the connection."""
        conn = sqlite3.connect(self.db_name)
        try:
            cursor = conn.cursor()
            for sql, params in statements:
                cursor.execute(sql, params)
            conn.commit()
        finally:
            # Closing without a commit discards partial work, so a failure
            # midway never leaves half-written rows or an open handle behind.
            conn.close()

    def setup_database(self):
        """Create the ``quality_metrics`` and ``user_feedback`` tables if missing."""
        self._execute([
            (
                '''
                CREATE TABLE IF NOT EXISTS quality_metrics
                (timestamp TEXT, metric TEXT, value REAL)
                ''',
                (),
            ),
            (
                '''
                CREATE TABLE IF NOT EXISTS user_feedback
                (timestamp TEXT, row_index INTEGER, feedback TEXT)
                ''',
                (),
            ),
        ])

    def load_and_process_data(self, file):
        """Load a CSV/XLSX upload, compute metrics, flag anomalies and store metrics.

        Args:
            file: Either a filesystem path (``str``) or an object exposing the
                path through a ``name`` attribute (e.g. a temp-file wrapper).

        Returns:
            ``(dataframe, message)``. ``dataframe`` is ``None`` whenever the
            file is missing, has an unsupported extension, or processing fails;
            ``message`` then explains why.
        """
        if file is None:
            return None, "No file provided. Please upload a CSV or XLSX file."

        try:
            # Uploads may arrive as a plain path string or as a file-like
            # wrapper, depending on the UI component configuration.
            file_path = file if isinstance(file, str) else file.name
            extension = os.path.splitext(file_path)[1].lower()

            if extension == '.csv':
                df = pd.read_csv(file_path)
            elif extension == '.xlsx':
                df = pd.read_excel(file_path)
            else:
                return None, "Unsupported file format. Please use CSV or XLSX."

            # Metrics are computed before anomaly columns are appended so the
            # stored values describe the uploaded data only.
            metrics = self.initial_data_checks(df)
            df_with_anomalies = self.detect_anomalies(df)
            self.store_quality_metrics(metrics)

            return df_with_anomalies, "Data processed successfully!"

        except Exception as e:
            # UI boundary: report the failure to the user instead of crashing.
            logging.error(f"Error processing file: {str(e)}")
            return None, f"Error processing file: {str(e)}"

    def initial_data_checks(self, df):
        """Return a dict of basic quality metrics for ``df``.

        Keys: ``total_rows``, ``null_values``, ``duplicate_entries`` and one
        ``std_dev_<column>`` entry per numeric column.
        """
        metrics = {
            'total_rows': len(df),
            'null_values': df.isnull().sum().sum(),
            'duplicate_entries': df.duplicated().sum(),
        }

        numeric_cols = df.select_dtypes(include=[np.number]).columns
        metrics.update({f'std_dev_{col}': df[col].std() for col in numeric_cols})

        return metrics

    def detect_anomalies(self, df):
        """Add ``anomaly`` and ``anomaly_score`` columns using an Isolation Forest.

        The model is fitted on the standardized numeric columns. ``df`` is
        modified in place and also returned. When there are no numeric columns
        or no rows, ``df`` is returned unchanged.
        """
        numeric_df = df.select_dtypes(include=[np.number])
        if numeric_df.shape[1] == 0 or numeric_df.shape[0] == 0:
            return df

        # IsolationForest rejects NaN/inf, yet missing values are exactly what
        # this tool is expected to see. Impute with the column median (0.0 for
        # all-missing columns) for modelling only; the data in ``df`` itself
        # is left untouched.
        features = numeric_df.replace([np.inf, -np.inf], np.nan)
        features = features.fillna(features.median()).fillna(0.0)

        scaled_data = StandardScaler().fit_transform(features)
        model = IsolationForest(contamination=0.1, random_state=42)
        df['anomaly'] = model.fit_predict(scaled_data)
        df['anomaly_score'] = model.score_samples(scaled_data)
        return df

    def store_quality_metrics(self, metrics):
        """Insert every ``metric -> value`` pair with a shared timestamp."""
        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        self._execute([
            (
                'INSERT INTO quality_metrics (timestamp, metric, value) VALUES (?, ?, ?)',
                (timestamp, metric, float(value)),
            )
            for metric, value in metrics.items()
        ])

    def save_feedback(self, index, feedback):
        """Persist user feedback for a row and return a status message.

        Args:
            index: Row number the feedback refers to. Numeric widgets may
                deliver a float; it is converted with ``int()`` (fractional
                part truncated).
            feedback: Free-text feedback.

        Returns:
            A human-readable status string (success or validation message).
        """
        if index is None:
            return "Please provide a row index."
        try:
            row_index = int(index)
        except (TypeError, ValueError, OverflowError):
            return "Row index must be a whole number."

        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        self._execute([
            (
                'INSERT INTO user_feedback (timestamp, row_index, feedback) VALUES (?, ?, ?)',
                (timestamp, row_index, feedback),
            ),
        ])
        return "Feedback saved successfully!"

    def generate_report(self, df):
        """Build the three report figures for ``df``.

        Returns:
            ``(summary_fig, anomaly_fig, missing_fig)``. All three are ``None``
            when ``df`` is ``None``. ``summary_fig`` is ``None`` when ``df`` has
            no numeric columns, and ``anomaly_fig`` is ``None`` when ``df`` has
            no ``anomaly_score`` column.
        """
        if df is None:
            return None, None, None

        # describe() raises on a frame without columns, so skip the summary
        # table entirely when there is nothing numeric to summarize.
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) > 0:
            summary_stats = df[numeric_cols].describe()
            summary_fig = go.Figure(data=[
                go.Table(
                    header=dict(values=['Statistic'] + list(summary_stats.columns)),
                    cells=dict(
                        values=[summary_stats.index]
                        + [summary_stats[col] for col in summary_stats.columns]
                    ),
                )
            ])
        else:
            summary_fig = None

        if 'anomaly_score' in df.columns:
            anomaly_fig = px.histogram(df, x='anomaly_score',
                                       title='Distribution of Anomaly Scores')
        else:
            anomaly_fig = None

        missing_data = df.isnull().sum()
        missing_fig = px.bar(x=missing_data.index, y=missing_data.values,
                             title='Missing Values by Column')

        return summary_fig, anomaly_fig, missing_fig
142
+
143
def create_gradio_interface():
    """Build and return the Gradio Blocks app for the data-quality system.

    The UI offers a file upload, a "Process Data" button that fills three
    plot tabs (summary statistics, anomaly distribution, missing values),
    and a small form for submitting per-row feedback. Status messages from
    both actions are shown in the same "Status" textbox.
    """
    system = DataQualitySystem()

    def process_file(file):
        """Process the upload and return ``(message, summary, anomaly, missing)``."""
        df, message = system.load_and_process_data(file)
        if df is None:
            # Loading failed: surface the message and clear all plots.
            return message, None, None, None
        summary_fig, anomaly_fig, missing_fig = system.generate_report(df)
        return message, summary_fig, anomaly_fig, missing_fig

    def submit_feedback(index, feedback):
        """Store feedback for a row and return the status message."""
        return system.save_feedback(index, feedback)

    with gr.Blocks() as app:
        # The author credit was left outside the string literal in the
        # original, which made the module unparseable; it is now part of the
        # Markdown text.
        gr.Markdown(
            "# Data Quality Assurance System\n\n"
            "A.Joshi 91-8847374924"
        )

        with gr.Row():
            file_input = gr.File(label="Upload Data File (CSV or XLSX)")

        with gr.Row():
            process_btn = gr.Button("Process Data")

        output_message = gr.Textbox(label="Status")

        with gr.Tabs():
            with gr.TabItem("Summary Statistics"):
                summary_plot = gr.Plot()
            with gr.TabItem("Anomaly Distribution"):
                anomaly_plot = gr.Plot()
            with gr.TabItem("Missing Values"):
                missing_plot = gr.Plot()

        with gr.Row():
            feedback_index = gr.Number(label="Row Index")
            feedback_text = gr.Textbox(label="Feedback")
            feedback_btn = gr.Button("Submit Feedback")

        # Wire the buttons to their handlers.
        process_btn.click(
            process_file,
            inputs=[file_input],
            outputs=[output_message, summary_plot, anomaly_plot, missing_plot],
        )

        feedback_btn.click(
            submit_feedback,
            inputs=[feedback_index, feedback_text],
            outputs=[output_message],
        )

    return app
196
+
197
# Script entry point: build the UI and start the Gradio server.
if __name__ == "__main__":
    app = create_gradio_interface()
    app.launch()