Lrosado committed on
Commit
11cc2ca
·
verified ·
1 Parent(s): 7578b4d

Upload 3 files

Browse files
Files changed (3) hide show
  1. model.joblib +3 -0
  2. requirements.txt +2 -0
  3. train.py +78 -0
model.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eff91d8c9e178b13ce8e11c8b9d00c5ea272bbf333835f8aac831117b68775f8
3
+ size 41359
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ scikit-learn==1.2.2
2
+ numpy==1.26.4
train.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Train a LogisticRegression pipeline on the scrubbed ITSM export.

Reads ``scrubbed_itsm_export.csv``, builds a preprocessing +
logistic-regression pipeline, tunes the regularization strength with a
small randomized search, evaluates on a held-out split, and serializes
the best estimator to ``model.joblib``.
"""

import sklearn  # noqa: F401 -- kept so a broken sklearn install fails fast
import joblib
import pandas as pd

from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


def main(csv_path='scrubbed_itsm_export.csv', model_path='model.joblib'):
    """Train, evaluate, and persist the ITSM model.

    Parameters
    ----------
    csv_path : str
        Path of the exported ITSM CSV to train on (default matches the
        original script's hard-coded filename).
    model_path : str
        Destination for the serialized best estimator.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted best estimator (also written to ``model_path``).
    """
    data_df = pd.read_csv(csv_path)

    # NOTE(review): 'Networkdays' reads like a count/duration, yet it is fed
    # to LogisticRegression, which treats each distinct value as a discrete
    # class label -- confirm classification (not regression) is intended.
    target = 'Networkdays'
    numerical_features = ['Priority', 'SLA Breached']
    categorical_features = ['SNOW', 'Assigned to', 'CI', 'Symptom', 'Symptom Detail']

    print("Creating data subsets")

    # Reuse `target` instead of repeating the literal column name.
    X = data_df.drop(target, axis=1)
    y = data_df[target]

    Xtrain, Xtest, ytrain, ytest = train_test_split(
        X, y,
        test_size=0.2,
        random_state=42
    )

    # Numeric columns: median imputation, then standard scaling.
    numerical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Categorical columns: mode imputation, then one-hot encoding.
    # handle_unknown='ignore' keeps inference from crashing on categories
    # that were never seen during training.
    categorical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = make_column_transformer(
        (numerical_pipeline, numerical_features),
        (categorical_pipeline, categorical_features)
    )

    # Fix: max_iter raised from the default (100) -- lbfgs frequently fails
    # to converge on wide one-hot-encoded matrices and emits
    # ConvergenceWarning, silently degrading the model.
    model_logistic_regression = LogisticRegression(n_jobs=-1, max_iter=1000)

    print("Estimating Best Model Pipeline")

    model_pipeline = make_pipeline(
        preprocessor,
        model_logistic_regression
    )

    # make_pipeline lower-cases the estimator class name, hence the
    # 'logisticregression__' parameter prefix.
    param_distribution = {
        "logisticregression__C": [0.001, 0.01, 0.1, 0.5, 1, 5, 10]
    }

    rand_search_cv = RandomizedSearchCV(
        model_pipeline,
        param_distribution,
        n_iter=3,
        cv=3,
        random_state=42
    )

    rand_search_cv.fit(Xtrain, ytrain)

    print("Logging Metrics")
    # best_score_ is the mean CROSS-VALIDATION score of the best candidate.
    print(f"Accuracy: {rand_search_cv.best_score_}")

    # Fix: the held-out split and the imported metrics were previously
    # unused -- evaluate the tuned model on data it never trained on.
    ypred = rand_search_cv.predict(Xtest)
    print(f"Test accuracy: {accuracy_score(ytest, ypred)}")
    print(classification_report(ytest, ypred))

    print("Serializing Model")

    joblib.dump(rand_search_cv.best_estimator_, model_path)
    return rand_search_cv.best_estimator_


if __name__ == "__main__":
    main()