import time
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gradio as gr
from datasets import load_dataset
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
LOGS_DATASET_URI = 'pgurazada1/machine-failure-mlops-demo-logs'
# Load and cache training data
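# Note: OpenML data_id=42890 corresponds to the AI4I 2020 Predictive Maintenance
# dataset (machine-failure records with the temperature/speed/torque/wear
# features used below).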
dataset = fetch_openml(data_id=42890, as_frame=True, parser="auto")
data_df = dataset.data
target = 'Machine failure'
numeric_features = [
    'Air temperature [K]',
    'Process temperature [K]',
    'Rotational speed [rpm]',
    'Torque [Nm]',
    'Tool wear [min]'
]
categorical_features = ['Type']

X = data_df[numeric_features + categorical_features]
y = data_df[target]

Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)
def get_data():
    """
    Connect to the Hugging Face dataset where the logs are stored and
    pull a random sample of 100 logged records into a dataframe.
    """
    data = load_dataset(LOGS_DATASET_URI)
    sample_df = data['train'].to_pandas().sample(100)
    return sample_df
def check_model_drift():
    """
    Compare the proportion of machine failures in the live predictions with
    its proportion in the training data. If the deviation is more than
    2 standard deviations, flag model drift.
    """
    sample_df = get_data()
    p_pos_label_training_data = 0.03475
    training_data_size = 8000

    # value_counts() may be missing a label entirely, so fall back to 0
    prediction_counts = sample_df.prediction.value_counts()
    n_0 = prediction_counts.get(0, 0)
    n_1 = prediction_counts.get(1, 0)

    p_pos_label_sample_logs = n_1 / (n_0 + n_1)
    variance = (p_pos_label_training_data * (1 - p_pos_label_training_data)) / training_data_size
    p_diff = abs(p_pos_label_training_data - p_pos_label_sample_logs)

    if p_diff > 2 * math.sqrt(variance):
        return "Model Drift Detected! Check Logs!"
    else:
        return "No Model Drift!"
def psi(actual_proportions, expected_proportions):
    psi_values = (actual_proportions - expected_proportions) * \
                 np.log(actual_proportions / expected_proportions)
    return np.sum(psi_values)
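# The 0.1 cut-off used below follows the common PSI rule of thumb (a
# convention, not something fixed by this code): values below 0.1 are usually
# read as no meaningful shift, 0.1 to 0.25 as moderate shift, and above 0.25
# as a large shift.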
def check_data_drift():
    """
    Compare training data features with the live features. If the deviation is
    more than 2 standard deviations, flag data drift.
    Numeric features and categorical features are dealt with separately.
    """
    sample_df = get_data()
    data_drift_status = {}

    numeric_features = [
        'Air temperature [K]',
        'Process temperature [K]',
        'Rotational speed [rpm]',
        'Torque [Nm]',
        'Tool wear [min]'
    ]
    categorical_features = ['Type']

    # Numeric features: flag drift when the live mean moves more than
    # 2 training standard deviations away from the training mean
    for feature in numeric_features:
        mean_feature_training_data = Xtrain[feature].mean()
        std_feature_training_data = Xtrain[feature].std()
        mean_feature_sample_logs = sample_df[feature].mean()
        mean_diff = abs(mean_feature_training_data - mean_feature_sample_logs)
        if mean_diff > 2 * std_feature_training_data:
            data_drift_status[feature] = ["Data Drift Detected! Check Logs!"]
        else:
            data_drift_status[feature] = ["No Data Drift!"]

    # Categorical feature 'Type': align proportions by category before
    # computing PSI; a tiny floor avoids log(0) for categories absent
    # from the sampled logs
    training_proportions = Xtrain['Type'].value_counts(normalize=True).sort_index()
    live_proportions = (
        sample_df['Type']
        .value_counts(normalize=True)
        .reindex(training_proportions.index, fill_value=1e-6)
    )
    psi_value = psi(live_proportions.values, training_proportions.values)
    if psi_value > 0.1:
        data_drift_status['Type'] = ["Data Drift Detected! Check Logs!"]
    else:
        data_drift_status['Type'] = ["No Data Drift!"]

    return pd.DataFrame.from_dict(data_drift_status)
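# Quick local sanity check (illustrative; run interactively instead of
# launching the dashboard):
#   print(check_model_drift())
#   print(check_data_drift())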
with gr.Blocks() as demo:
    gr.Markdown("# Real-time Monitoring Dashboard")

    gr.Markdown("Model drift detection (every 5 seconds)")
    with gr.Row():
        with gr.Column():
            gr.Textbox(check_model_drift, every=5, label="Model Drift Status")

    gr.Markdown("Data drift detection (every 5 seconds)")
    with gr.Row():
        with gr.Column():
            gr.DataFrame(check_data_drift, every=5, label="Data Drift Status")

demo.queue().launch()