Lrosado committed on
Commit
11cc2ca
·
verified ·
1 Parent(s): 7578b4d

Upload 3 files

Browse files
Files changed (3) hide show
  1. model.joblib +3 -0
  2. requirements.txt +2 -0
  3. train.py +78 -0
model.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eff91d8c9e178b13ce8e11c8b9d00c5ea272bbf333835f8aac831117b68775f8
3
+ size 41359
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ scikit-learn==1.2.2
2
+ numpy==1.26.4
train.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Train a LogisticRegression pipeline on the scrubbed ITSM export.

Reads ``scrubbed_itsm_export.csv``, builds a preprocessing +
logistic-regression pipeline, tunes the regularization strength with a
small randomized search, evaluates on a held-out split, and serializes
the best estimator to ``model.joblib``.
"""

import sklearn  # noqa: F401 -- kept so a broken sklearn install fails fast
import joblib
import pandas as pd

from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


def main(csv_path='scrubbed_itsm_export.csv', model_path='model.joblib'):
    """Train, evaluate, and persist the ITSM model.

    Parameters
    ----------
    csv_path : str
        Path of the exported ITSM CSV to train on (default matches the
        original script's hard-coded filename).
    model_path : str
        Destination for the serialized best estimator.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted best estimator (also written to ``model_path``).
    """
    data_df = pd.read_csv(csv_path)

    # NOTE(review): 'Networkdays' reads like a count/duration, yet it is fed
    # to LogisticRegression, which treats each distinct value as a discrete
    # class label -- confirm classification (not regression) is intended.
    target = 'Networkdays'
    numerical_features = ['Priority', 'SLA Breached']
    categorical_features = ['SNOW', 'Assigned to', 'CI', 'Symptom', 'Symptom Detail']

    print("Creating data subsets")

    # Reuse `target` instead of repeating the literal column name.
    X = data_df.drop(target, axis=1)
    y = data_df[target]

    Xtrain, Xtest, ytrain, ytest = train_test_split(
        X, y,
        test_size=0.2,
        random_state=42
    )

    # Numeric columns: median imputation, then standard scaling.
    numerical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Categorical columns: mode imputation, then one-hot encoding.
    # handle_unknown='ignore' keeps inference from crashing on categories
    # that were never seen during training.
    categorical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = make_column_transformer(
        (numerical_pipeline, numerical_features),
        (categorical_pipeline, categorical_features)
    )

    # Fix: max_iter raised from the default (100) -- lbfgs frequently fails
    # to converge on wide one-hot-encoded matrices and emits
    # ConvergenceWarning, silently degrading the model.
    model_logistic_regression = LogisticRegression(n_jobs=-1, max_iter=1000)

    print("Estimating Best Model Pipeline")

    model_pipeline = make_pipeline(
        preprocessor,
        model_logistic_regression
    )

    # make_pipeline lower-cases the estimator class name, hence the
    # 'logisticregression__' parameter prefix.
    param_distribution = {
        "logisticregression__C": [0.001, 0.01, 0.1, 0.5, 1, 5, 10]
    }

    rand_search_cv = RandomizedSearchCV(
        model_pipeline,
        param_distribution,
        n_iter=3,
        cv=3,
        random_state=42
    )

    rand_search_cv.fit(Xtrain, ytrain)

    print("Logging Metrics")
    # best_score_ is the mean CROSS-VALIDATION score of the best candidate.
    print(f"Accuracy: {rand_search_cv.best_score_}")

    # Fix: the held-out split and the imported metrics were previously
    # unused -- evaluate the tuned model on data it never trained on.
    ypred = rand_search_cv.predict(Xtest)
    print(f"Test accuracy: {accuracy_score(ytest, ypred)}")
    print(classification_report(ytest, ypred))

    print("Serializing Model")

    joblib.dump(rand_search_cv.best_estimator_, model_path)
    return rand_search_cv.best_estimator_


if __name__ == "__main__":
    main()