Spaces:

HopeLiang
/

hugging-face-project

Runtime error

App Files Files Community

Hope-Liang committed on Jan 9, 2023

Commit

217da35

1 Parent(s): 5a056d9

update

Browse files

Files changed (3) hide show

app.py +36 -0
preprocessor_pipeline.py +172 -0
requirements.txt +6 -0

app.py ADDED Viewed

	@@ -0,0 +1,36 @@

+import streamlit as st
+import pandas as pd
+from sodapy import Socrata
+import hopsworks
+import joblib
+import xgboost as xgb
+st.set_page_config(layout="wide")
+st.title('Latest SF incident category prediction')
+client = Socrata("data.sfgov.org", "gZmg4iarmENBTk1Vzsb94bnse", username="[email protected]", password="Xw990504")
+results = client.get("wg3w-h783", limit=800000)
+results_df = pd.DataFrame.from_records(results)
+from preprocessor_pipeline import preprocessing_incident
+results_df_preprocessed = preprocessing_incident(results_df)
+results_df_preprocessed.incident_datetime=pd.to_datetime(results_df_preprocessed.incident_datetime)
+results_df_preprocessed.sort_values(by='incident_datetime', ascending = False, inplace = True)
+results_df_preprocessed=results_df_preprocessed[:100]
+project = hopsworks.login()
+fs = project.get_feature_store()
+mr = project.get_model_registry()
+model = mr.get_model("incident_modal", version=1)
+model_dir = model.download()
+model = joblib.load(model_dir + "/incident_model.pkl")
+batch_data = results_df_preprocessed
+batch_data.drop(columns=['incident_datetime','incident_category'], inplace=True)
+y_pred = model.predict(batch_data)
+df = results_df_preprocessed
+st.write(df)
+st.button("Re-run")

preprocessor_pipeline.py ADDED Viewed

	@@ -0,0 +1,172 @@

+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import OneHotEncoder
+from sklearn.compose import ColumnTransformer
+from sklearn.impute import SimpleImputer
+from sklearn.pipeline import make_pipeline
+from sklearn.feature_extraction.text import _VectorizerMixin
+from sklearn.feature_selection._base import SelectorMixin
+from sklearn.pipeline import Pipeline
+def merge_category(x):
+    if x == "Human Trafficking (A), Commercial Sex Acts":
+        return "Human Trafficking"
+    elif x == "Human Trafficking (B), Involuntary Servitude":
+        return "Human Trafficking"
+    elif x == "Human Trafficking, Commercial Sex Acts":
+        return "Human Trafficking"
+    elif x == "Weapons Offence":
+        return "Weapons Offense"
+    elif x == "Drug Violation":
+        return "Drug Offense"
+    elif x == "Motor Vehicle Theft?":
+        return "Motor Vehicle Theft"
+    elif x == "Suspicious Occ":
+        return "Suspicious"
+    elif x == "Rape":
+        return "Sex Offense"
+    else:
+        return x
+def merge_category_2(x):
+    if x == "Gambling":
+        return "Other"
+    elif x == "Homicide":
+        return "Other"
+    elif x == "Human Trafficking":
+        return "Other"
+    elif x == "Liquor Laws":
+        return "Other"
+    elif x == "Other Miscellaneous":
+        return "Other"
+    elif x == "Weapons Carrying Etc":
+        return "Weapons Offense"
+    elif x == "Offences Against The Family And Children":
+        return "Other Offenses"
+    elif x == "Sex Offense":
+        return "Other Offenses"
+    elif x == "Prostitution":
+        return "Other"
+    elif x == "Case Closure":
+        return "Other"
+    elif x == "Courtesy Report":
+        return "Other"
+    elif x == "Fire Report":
+        return "Other"
+    elif x == "Suicide":
+        return "Other"
+    elif x == "Embezzlement":
+        return "Financial Offense"
+    elif x == "Forgery And Counterfeiting":
+        return "Financial Offense"
+    elif x == "Fraud":
+        return "Financial Offense"
+    elif x == "Lost Property":
+        return "Financial Offense"
+    elif x == "Stolen Property":
+        return "Financial Offense"
+    elif x == "Motor Vehicle Theft":
+        return "Traffic and Vehicle Offense"
+    elif x == "Recovered Vehicle":
+        return "Traffic and Vehicle Offense"
+    elif x == "Traffic Collision":
+        return "Traffic and Vehicle Offense"
+    elif x == "Traffic Violation Arrest":
+        return "Traffic and Vehicle Offense"
+    elif x == "Vehicle Impounded":
+        return "Traffic and Vehicle Offense"
+    elif x == "Vehicle Misplaced":
+        return "Traffic and Vehicle Offense"
+    elif x == "Civil Sidewalks":
+        return "Traffic and Vehicle Offense"
+    elif x == "Burglary":
+        return "Theft and Robbery"
+    elif x == "Larceny Theft":
+        return "Theft and Robbery"
+    elif x == "Robbery":
+        return "Theft and Robbery"
+    elif x == "Arson":
+        return "Assault"
+    elif x == "Disorderly Conduct":
+        return "Other Offenses"
+    elif x == "Vandalism":
+        return "Malicious Mischief"
+    elif x == "Miscellaneous Investigation":
+        return "Suspicious"
+    else:
+        return x
+def get_feature_out(estimator, feature_in):
+    if hasattr(estimator, 'get_feature_names'):
+        if isinstance(estimator, _VectorizerMixin):
+            # handling all vectorizers
+            return [f'vec_{f}' \
+                    for f in estimator.get_feature_names()]
+        else:
+            return estimator.get_feature_names(feature_in)
+    elif isinstance(estimator, SelectorMixin):
+        return np.array(feature_in)[estimator.get_support()]
+    else:
+        return feature_in
+def get_ct_feature_names(ct):
+    # handles all estimators, pipelines inside ColumnTransfomer
+    # doesn't work when remainder =='passthrough'
+    # which requires the input column names.
+    output_features = []
+    for name, estimator, features in ct.transformers_:
+        if name != 'remainder':
+            if isinstance(estimator, Pipeline):
+                current_features = features
+                for step in estimator:
+                    current_features = get_feature_out(step, current_features)
+                features_out = current_features
+            else:
+                features_out = get_feature_out(estimator, features)
+            output_features.extend(features_out)
+        elif estimator == 'passthrough':
+            output_features.extend(ct._feature_names_in[features])
+    return output_features
+def preprocessing_incident(incident_df):
+    # step 1: dropping irrelavent columns and null values
+    incident_df.drop(columns=['incident_date','incident_time','incident_year','report_datetime','row_id','incident_id','incident_number',
+                         'report_type_description','filed_online','incident_code','incident_subcategory',
+                         'incident_description','resolution','cad_number','intersection','cnn','analysis_neighborhood',
+                         'supervisor_district','point',':@computed_region_jwn9_ihcz',':@computed_region_26cr_cadq',
+                         ':@computed_region_qgnn_b9vv',':@computed_region_nqbw_i6c3',':@computed_region_h4ep_8xdi',
+                         ':@computed_region_n4xg_c4py',':@computed_region_jg9y_a9du'], inplace=True)
+    incident_df.dropna(inplace=True)
+    # step 2: create new columns
+    incident_df['incident_month']=pd.to_datetime(incident_df["incident_datetime"]).dt.month
+    incident_df['incident_year']=pd.to_datetime(incident_df["incident_datetime"]).dt.year
+    incident_df['incident_hour']=pd.to_datetime(incident_df["incident_datetime"]).dt.hour
+    #incident_df['incident_dayofweek']=pd.to_datetime(incident_df["incident_datetime"]).dt.dayofweek
+    # step 3: merging labels
+    incident_df['incident_category']=incident_df['incident_category'].apply(merge_category)
+    incident_df['incident_category']=incident_df['incident_category'].apply(merge_category_2)
+    # step 4: onehot encoding using column Transformer Settings
+    t = [('ohe-cat', OneHotEncoder(sparse=False, handle_unknown='ignore'), ['incident_day_of_week', 'report_type_code','police_district']),
+         ('do_nothing', SimpleImputer(strategy='most_frequent'), ['incident_datetime', 'incident_category', 'latitude', 'longitude', 'incident_month', 'incident_year', 'incident_hour']),
+         ]
+    pre_processor = ColumnTransformer(transformers=t, remainder='drop')
+    incident_df_processed = pre_processor.fit_transform(X=incident_df)
+    # Get column names
+    columns = get_ct_feature_names(pre_processor)
+    incident_df_processed = pd.DataFrame(incident_df_processed, columns=columns)
+    # step 5: change column types and names
+    numeric_columns = incident_df_processed.columns.drop(['incident_datetime','incident_category'])
+    incident_df_processed[numeric_columns] = incident_df_processed[numeric_columns].apply(pd.to_numeric)
+    incident_df_processed['incident_datetime'] = incident_df_processed['incident_datetime'].apply(pd.to_datetime)
+    incident_df_processed.rename(columns={"police_district_Out of SF": "police_district_OutOfSF"},inplace=True)
+    return incident_df_processed

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+hopsworks
+joblib
+scikit-learn
+sodapy
+pandas
+xgboost