pgurazada1 committed on
Commit
3137083
·
verified ·
1 Parent(s): 0c5aeb8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +97 -26
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import time
2
  import math
3
 
 
4
  import pandas as pd
5
  import matplotlib.pyplot as plt
6
  import seaborn as sns
@@ -14,6 +15,31 @@ from sklearn.metrics import classification_report
14
  LOGS_DATASET_URI = 'pgurazada1/machine-failure-mlops-demo-logs'
15
 
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  def get_data():
18
  """
19
  Connect to the HuggingFace dataset where the logs are stored.
@@ -24,38 +50,19 @@ def get_data():
24
 
25
  return sample_df
26
 
27
- def load_training_data():
28
- dataset = fetch_openml(data_id=42890, as_frame=True, parser="auto")
29
- data_df = dataset.data
30
 
31
- target = 'Machine failure'
32
- numeric_features = [
33
- 'Air temperature [K]',
34
- 'Process temperature [K]',
35
- 'Rotational speed [rpm]',
36
- 'Torque [Nm]',
37
- 'Tool wear [min]'
38
- ]
39
-
40
- categorical_features = ['Type']
41
-
42
- X = data_df[numeric_features + categorical_features]
43
- y = data_df[target]
44
-
45
- Xtrain, Xtest, ytrain, ytest = train_test_split(
46
- X, y,
47
- test_size=0.2,
48
- random_state=42
49
- )
50
-
51
- return Xtrain, ytrain
52
-
53
  def check_model_drift():
 
 
 
 
 
54
  sample_df = get_data()
55
  p_pos_label_training_data = 0.03475
56
  training_data_size = 8000
57
 
58
  n_0 = sample_df.prediction.value_counts()[0]
 
59
  try:
60
  n_1 = sample_df.prediction.value_counts()[1]
61
  except Exception as e:
@@ -67,11 +74,68 @@ def check_model_drift():
67
  p_diff = abs(p_pos_label_training_data - p_pos_label_sample_logs)
68
 
69
  if p_diff > 2 * math.sqrt(variance):
70
- return "Model Drift Detected! Check logs!"
71
  else:
72
  return "No Model Drift!"
73
 
 
 
 
 
 
 
 
74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  with gr.Blocks() as demo:
76
  gr.Markdown("# Real-time Monitoring Dashboard")
77
 
@@ -81,4 +145,11 @@ with gr.Blocks() as demo:
81
  with gr.Column():
82
  gr.Textbox(check_model_drift, every=5, label="Model Drift Status")
83
 
 
 
 
 
 
 
 
84
  demo.queue().launch()
 
1
  import time
2
  import math
3
 
4
+ import numpy as np
5
  import pandas as pd
6
  import matplotlib.pyplot as plt
7
  import seaborn as sns
 
15
  LOGS_DATASET_URI = 'pgurazada1/machine-failure-mlops-demo-logs'
16
 
17
 
18
+ # Load and cache training data
19
+
20
+ dataset = fetch_openml(data_id=42890, as_frame=True, parser="auto")
21
+ data_df = dataset.data
22
+
23
+ target = 'Machine failure'
24
+ numeric_features = [
25
+ 'Air temperature [K]',
26
+ 'Process temperature [K]',
27
+ 'Rotational speed [rpm]',
28
+ 'Torque [Nm]',
29
+ 'Tool wear [min]'
30
+ ]
31
+
32
+ categorical_features = ['Type']
33
+
34
+ X = data_df[numeric_features + categorical_features]
35
+ y = data_df[target]
36
+
37
+ Xtrain, Xtest, ytrain, ytest = train_test_split(
38
+ X, y,
39
+ test_size=0.2,
40
+ random_state=42
41
+ )
42
+
43
  def get_data():
44
  """
45
  Connect to the HuggingFace dataset where the logs are stored.
 
50
 
51
  return sample_df
52
 
 
 
 
53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  def check_model_drift():
55
+ """
56
+ Check proportion of machine failure as compared to
57
+ its proportion in training data. If the deviation is more than
58
+ 2 standard deviations, flag a model drift.
59
+ """
60
  sample_df = get_data()
61
  p_pos_label_training_data = 0.03475
62
  training_data_size = 8000
63
 
64
  n_0 = sample_df.prediction.value_counts()[0]
65
+
66
  try:
67
  n_1 = sample_df.prediction.value_counts()[1]
68
  except Exception as e:
 
74
  p_diff = abs(p_pos_label_training_data - p_pos_label_sample_logs)
75
 
76
  if p_diff > 2 * math.sqrt(variance):
77
+ return "Model Drift Detected! Check Logs!"
78
  else:
79
  return "No Model Drift!"
80
 
81
+
82
def psi(actual_proportions, expected_proportions):
    """
    Compute the Population Stability Index (PSI) between two distributions.

    PSI = sum((actual - expected) * ln(actual / expected)), taken over
    categories (or bins). Both inputs must list the proportions in the same
    category order.

    Args:
        actual_proportions: array-like of observed (live) proportions.
        expected_proportions: array-like of reference (training) proportions.

    Returns:
        The PSI as a float; 0.0 when the two distributions are identical.

    Raises:
        ValueError: if the two inputs do not have the same shape.
    """
    actual = np.asarray(actual_proportions, dtype=float)
    expected = np.asarray(expected_proportions, dtype=float)

    if actual.shape != expected.shape:
        raise ValueError(
            "actual_proportions and expected_proportions must have the same "
            f"shape, got {actual.shape} and {expected.shape}"
        )

    # Floor the proportions so that an empty category does not produce
    # log(0) or a division by zero (which would yield inf/NaN).
    floor = 1e-6
    actual = np.clip(actual, floor, None)
    expected = np.clip(expected, floor, None)

    psi_values = (actual - expected) * np.log(actual / expected)

    return float(np.sum(psi_values))
88
 
89
+
90
def check_data_drift():
    """
    Compare training data features with the live (logged) features.

    Numeric and categorical features are dealt with separately:

    * Numeric features: drift is flagged when the mean of the live sample
      differs from the training mean by more than 2 training standard
      deviations.
    * Categorical features: drift is flagged when the Population Stability
      Index (PSI) between the live and training category proportions is
      greater than 0.1.

    Returns:
        dict mapping each feature name to its drift status message.
    """
    sample_df = get_data()
    data_drift_status = {}

    numeric_features = [
        'Air temperature [K]',
        'Process temperature [K]',
        'Rotational speed [rpm]',
        'Torque [Nm]',
        'Tool wear [min]'
    ]

    categorical_features = ['Type']

    # Numeric features

    for feature in numeric_features:
        mean_feature_training_data = Xtrain[feature].mean()
        std_feature_training_data = Xtrain[feature].std()

        mean_feature_sample_logs = sample_df[feature].mean()

        mean_diff = abs(mean_feature_training_data - mean_feature_sample_logs)

        if mean_diff > 2 * std_feature_training_data:
            data_drift_status[feature] = "Data Drift Detected! Check Logs!"
        else:
            data_drift_status[feature] = "No Data Drift!"

    # Categorical features

    for feature in categorical_features:
        live_counts = sample_df[feature].value_counts(normalize=True)
        training_counts = Xtrain[feature].value_counts(normalize=True)

        if live_counts.empty:
            # No live observations to compare; mirrors the numeric branch,
            # where a NaN live mean never exceeds the threshold.
            data_drift_status[feature] = "No Data Drift!"
            continue

        # value_counts() orders by frequency, not by label, so the raw
        # .values of the two distributions are not comparable position by
        # position. Align both on the union of category labels, using 0 for
        # a category that is absent on one side.
        live_counts.index = live_counts.index.astype(str)
        training_counts.index = training_counts.index.astype(str)
        categories = training_counts.index.union(live_counts.index)

        live_proportions = live_counts.reindex(
            categories, fill_value=0.0
        ).to_numpy(dtype=float)
        training_proportions = training_counts.reindex(
            categories, fill_value=0.0
        ).to_numpy(dtype=float)

        # Floor the proportions so an absent category cannot produce
        # log(0) or a division by zero inside the PSI computation.
        live_proportions = np.clip(live_proportions, 1e-6, None)
        training_proportions = np.clip(training_proportions, 1e-6, None)

        psi_value = psi(live_proportions, training_proportions)

        if psi_value > 0.1:
            data_drift_status[feature] = "Data Drift Detected! Check Logs!"
        else:
            data_drift_status[feature] = "No Data Drift!"

    # NOTE(review): this dict is passed to gr.DataFrame by the dashboard;
    # confirm that the component renders a plain feature -> status mapping.
    return data_drift_status
137
+
138
+
139
  with gr.Blocks() as demo:
140
  gr.Markdown("# Real-time Monitoring Dashboard")
141
 
 
145
  with gr.Column():
146
  gr.Textbox(check_model_drift, every=5, label="Model Drift Status")
147
 
148
+ gr.Markdown("Data drift detection (every 5 seconds)")
149
+
150
+ with gr.Row():
151
+ with gr.Column():
152
+ gr.DataFrame(check_data_drift, every=5, label="Data Drift Status")
153
+
154
+
155
  demo.queue().launch()