Hope-Liang committed on
Commit
217da35
·
1 Parent(s): 5a056d9
Files changed (3) hide show
  1. app.py +36 -0
  2. preprocessor_pipeline.py +172 -0
  3. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from sodapy import Socrata
4
+ import hopsworks
5
+ import joblib
6
+ import xgboost as xgb
7
+
8
+
9
# Streamlit page: download the SF incident reports, preprocess them, and show
# the registered model's category prediction for the 100 most recent incidents.
import os

st.set_page_config(layout="wide")
st.title('Latest SF incident category prediction')

# SECURITY: credentials must never live in source control. They are read from
# the environment (e.g. deployment secrets) instead. Any value left unset is
# passed as None, in which case the client makes unauthenticated requests.
# The credentials that were previously committed here should be revoked.
client = Socrata(
    "data.sfgov.org",
    os.environ.get("SOCRATA_APP_TOKEN"),
    username=os.environ.get("SOCRATA_USERNAME"),
    password=os.environ.get("SOCRATA_PASSWORD"),
)
results = client.get("wg3w-h783", limit=800000)
results_df = pd.DataFrame.from_records(results)

from preprocessor_pipeline import preprocessing_incident
results_df_preprocessed = preprocessing_incident(results_df)
results_df_preprocessed.incident_datetime = pd.to_datetime(results_df_preprocessed.incident_datetime)
results_df_preprocessed.sort_values(by='incident_datetime', ascending=False, inplace=True)
# Keep the 100 newest rows; .copy() makes the slice an independent frame so
# adding the prediction column below does not write into a view.
results_df_preprocessed = results_df_preprocessed[:100].copy()

project = hopsworks.login()
fs = project.get_feature_store()
mr = project.get_model_registry()
model = mr.get_model("incident_modal", version=1)
model_dir = model.download()
model = joblib.load(model_dir + "/incident_model.pkl")

# Build the feature matrix as a separate frame. The original dropped these
# columns in place on the very frame that was displayed afterwards, so the
# timestamp and the true category vanished from the table.
batch_data = results_df_preprocessed.drop(columns=['incident_datetime', 'incident_category'])
y_pred = model.predict(batch_data)

# Show the predictions next to the data they were made for; previously y_pred
# was computed and then discarded.
df = results_df_preprocessed
df['predicted_category'] = y_pred

st.write(df)
st.button("Re-run")
preprocessor_pipeline.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sklearn.preprocessing import OneHotEncoder
4
+ from sklearn.compose import ColumnTransformer
5
+ from sklearn.impute import SimpleImputer
6
+ from sklearn.pipeline import make_pipeline
7
+ from sklearn.feature_extraction.text import _VectorizerMixin
8
+ from sklearn.feature_selection._base import SelectorMixin
9
+ from sklearn.pipeline import Pipeline
10
+
11
# Raw category spellings / sub-categories and the canonical label each one
# is folded into.
_CATEGORY_ALIASES = {
    "Human Trafficking (A), Commercial Sex Acts": "Human Trafficking",
    "Human Trafficking (B), Involuntary Servitude": "Human Trafficking",
    "Human Trafficking, Commercial Sex Acts": "Human Trafficking",
    "Weapons Offence": "Weapons Offense",
    "Drug Violation": "Drug Offense",
    "Motor Vehicle Theft?": "Motor Vehicle Theft",
    "Suspicious Occ": "Suspicious",
    "Rape": "Sex Offense",
}


def merge_category(x):
    """Normalise a raw incident category label.

    Labels listed in the alias table are replaced by their canonical
    label; every other value is returned unchanged.
    """
    return _CATEGORY_ALIASES.get(x, x)
30
+
31
# Coarse label -> the fine-grained labels that are collapsed into it.
_COARSE_CATEGORY_GROUPS = {
    "Other": (
        "Gambling",
        "Homicide",
        "Human Trafficking",
        "Liquor Laws",
        "Other Miscellaneous",
        "Prostitution",
        "Case Closure",
        "Courtesy Report",
        "Fire Report",
        "Suicide",
    ),
    "Weapons Offense": (
        "Weapons Carrying Etc",
    ),
    "Other Offenses": (
        "Offences Against The Family And Children",
        "Sex Offense",
        "Disorderly Conduct",
    ),
    "Financial Offense": (
        "Embezzlement",
        "Forgery And Counterfeiting",
        "Fraud",
        "Lost Property",
        "Stolen Property",
    ),
    "Traffic and Vehicle Offense": (
        "Motor Vehicle Theft",
        "Recovered Vehicle",
        "Traffic Collision",
        "Traffic Violation Arrest",
        "Vehicle Impounded",
        "Vehicle Misplaced",
        "Civil Sidewalks",
    ),
    "Theft and Robbery": (
        "Burglary",
        "Larceny Theft",
        "Robbery",
    ),
    "Assault": (
        "Arson",
    ),
    "Malicious Mischief": (
        "Vandalism",
    ),
    "Suspicious": (
        "Miscellaneous Investigation",
    ),
}

# Inverted view used for the actual lookup: fine-grained label -> coarse label.
_COARSE_CATEGORY_BY_LABEL = {
    fine_label: coarse_label
    for coarse_label, fine_labels in _COARSE_CATEGORY_GROUPS.items()
    for fine_label in fine_labels
}


def merge_category_2(x):
    """Collapse a category label into its coarse group.

    The mapping is applied once (it is not chained); labels that belong
    to no group are returned unchanged.
    """
    return _COARSE_CATEGORY_BY_LABEL.get(x, x)
98
+
99
def get_feature_out(estimator, feature_in):
    """Return the names of the features produced by a fitted estimator.

    Args:
        estimator: A fitted transformer (or any other object, e.g. the
            strings a ColumnTransformer uses for special steps).
        feature_in: Names of the features fed into ``estimator``.

    Returns:
        The output feature names:
        * vectorizers: their vocabulary terms, each prefixed with ``vec_``;
        * estimators exposing ``get_feature_names_out`` (current
          scikit-learn API) or the legacy ``get_feature_names``: whatever
          that method returns for ``feature_in``;
        * feature selectors without either method: the entries of
          ``feature_in`` kept by ``get_support()``;
        * anything else: ``feature_in`` unchanged.
    """
    # The modern API is tried first: recent scikit-learn releases no longer
    # provide `get_feature_names`, and probing only for the legacy method made
    # encoders fall through to the "names unchanged" branch below.
    names_out = getattr(estimator, 'get_feature_names_out', None)
    legacy_names = getattr(estimator, 'get_feature_names', None)

    if names_out is not None or legacy_names is not None:
        if isinstance(estimator, _VectorizerMixin):
            # handling all vectorizers: their names come from the learned
            # vocabulary, not from the input column names
            vocabulary = names_out() if names_out is not None else legacy_names()
            return [f'vec_{f}' for f in vocabulary]
        if names_out is not None:
            return names_out(feature_in)
        return legacy_names(feature_in)

    if isinstance(estimator, SelectorMixin):
        return np.array(feature_in)[estimator.get_support()]

    return feature_in
111
+
112
+
113
def get_ct_feature_names(ct):
    """Collect the output feature names of a fitted ColumnTransformer.

    Walks ``ct.transformers_`` in order. For a Pipeline the names are
    propagated step by step through ``get_feature_out``; for a single
    estimator ``get_feature_out`` is called once. A ``remainder`` entry
    contributes names only when it is ``'passthrough'``, in which case its
    column selection is resolved against the transformer's input names.

    Args:
        ct: A fitted ColumnTransformer.

    Returns:
        A flat list with the output feature names.
    """
    output_features = []

    for name, estimator, features in ct.transformers_:
        if name != 'remainder':
            if isinstance(estimator, Pipeline):
                current_features = features
                for step in estimator:
                    current_features = get_feature_out(step, current_features)
                features_out = current_features
            else:
                features_out = get_feature_out(estimator, features)
            output_features.extend(features_out)
        elif estimator == 'passthrough':
            # Prefer the public attribute; `_feature_names_in` is a private
            # detail and is only kept as a fallback for releases that lack
            # `feature_names_in_`.
            input_names = getattr(ct, 'feature_names_in_', None)
            if input_names is None:
                input_names = ct._feature_names_in
            output_features.extend(input_names[features])

    return output_features
133
+
134
def preprocessing_incident(incident_df):
    """Turn raw incident-report records into a model-ready feature table.

    Steps:
      1. drop the columns that are not used and every row with a null value;
      2. derive ``incident_month``, ``incident_year`` and ``incident_hour``
         from ``incident_datetime``;
      3. merge the ``incident_category`` labels (``merge_category`` followed
         by ``merge_category_2``);
      4. one-hot encode ``incident_day_of_week``, ``report_type_code`` and
         ``police_district``;
      5. convert the feature columns to numeric, ``incident_datetime`` to
         datetime, and rename ``police_district_Out of SF``.

    The input frame is not modified; all work happens on new frames.

    NOTE(review): the encoder is fitted on the data passed in, so the set of
    one-hot columns depends on the categories present in ``incident_df`` --
    confirm this matches the columns the downstream model expects.

    Args:
        incident_df: DataFrame with the raw incident records.

    Returns:
        A new DataFrame with the one-hot columns followed by
        ``incident_datetime``, ``incident_category``, ``latitude``,
        ``longitude``, ``incident_month``, ``incident_year`` and
        ``incident_hour``.
    """
    # step 1: dropping irrelevant columns and null values
    irrelevant_columns = [
        'incident_date', 'incident_time', 'incident_year', 'report_datetime', 'row_id', 'incident_id', 'incident_number',
        'report_type_description', 'filed_online', 'incident_code', 'incident_subcategory',
        'incident_description', 'resolution', 'cad_number', 'intersection', 'cnn', 'analysis_neighborhood',
        'supervisor_district', 'point', ':@computed_region_jwn9_ihcz', ':@computed_region_26cr_cadq',
        ':@computed_region_qgnn_b9vv', ':@computed_region_nqbw_i6c3', ':@computed_region_h4ep_8xdi',
        ':@computed_region_n4xg_c4py', ':@computed_region_jg9y_a9du',
    ]
    # Non-inplace calls keep the caller's frame untouched. errors='ignore'
    # because a column that is already absent needs no dropping.
    incident_df = incident_df.drop(columns=irrelevant_columns, errors='ignore').dropna()

    # step 2 + 3: create new columns and merge labels.
    # The timestamp column is parsed once and reused for all derived columns.
    incident_time = pd.to_datetime(incident_df["incident_datetime"])
    merged_category = incident_df['incident_category'].apply(merge_category).apply(merge_category_2)
    incident_df = incident_df.assign(
        incident_month=incident_time.dt.month,
        incident_year=incident_time.dt.year,
        incident_hour=incident_time.dt.hour,
        incident_category=merged_category,
    )

    # step 4: onehot encoding using column Transformer Settings
    try:
        # Current scikit-learn spelling of the dense-output switch.
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        # Older releases only know the previous keyword name.
        encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

    t = [('ohe-cat', encoder, ['incident_day_of_week', 'report_type_code', 'police_district']),
         # Nulls were dropped in step 1, so this imputer only carries the
         # listed columns through to the output.
         ('do_nothing', SimpleImputer(strategy='most_frequent'), ['incident_datetime', 'incident_category', 'latitude', 'longitude', 'incident_month', 'incident_year', 'incident_hour']),
         ]
    pre_processor = ColumnTransformer(transformers=t, remainder='drop')
    incident_df_processed = pre_processor.fit_transform(X=incident_df)
    # Get column names
    columns = get_ct_feature_names(pre_processor)
    incident_df_processed = pd.DataFrame(incident_df_processed, columns=columns)

    # step 5: change column types and names
    numeric_columns = incident_df_processed.columns.drop(['incident_datetime', 'incident_category'])
    incident_df_processed[numeric_columns] = incident_df_processed[numeric_columns].apply(pd.to_numeric)
    # Vectorised conversion of the whole column instead of one call per row.
    incident_df_processed['incident_datetime'] = pd.to_datetime(incident_df_processed['incident_datetime'])
    incident_df_processed.rename(columns={"police_district_Out of SF": "police_district_OutOfSF"}, inplace=True)

    return incident_df_processed
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ hopsworks
2
+ joblib
3
+ scikit-learn
4
+ sodapy
5
+ pandas
6
+ xgboost