Spaces:

Nathanotal
/

titanic

Runtime error

App Files Files Community

Nathanotal committed on Nov 16, 2022

Commit

b56bc50

1 Parent(s): 316c194

mer

Browse files

Files changed (2) hide show

app.py +120 -20
requirements.txt +3 -0

app.py CHANGED Viewed

@@ -3,10 +3,107 @@ import numpy as np
 from PIL import Image
 import requests
 import pandas as pd
 import hopsworks
 import joblib
 project = hopsworks.login()
 fs = project.get_feature_store()
@@ -23,25 +120,26 @@ df = pd
 # featureLabels = features.columns
 featureLabels = ["Pclass", "Name", "Sex", "Age", "SibSp",
                  "Parch", "Ticket", "Fare", "Cabin", "Embarked"]
-def titanic(Pclass, Sex, Age, SibSp, Parch, Fare, Cabin, Embarked):
-    input_list = []
-    sexToFeature = {
-        "male": 0,
-        "female": 1,
-    }
-    # Convert inputs to features
-    input_list.append(Pclass)  # Todo: Convert to feature
-    input_list.append(sexToFeature.get(Sex))  # Todo: Convert to feature
-    input_list.append(Age)  # !
-    input_list.append(SibSp)  # Todo: Convert to feature
-    input_list.append(Parch)  # Todo: Convert to feature
-    input_list.append(Fare)  # Todo: Convert to feature
-    input_list.append(Cabin)  # Todo: Convert to feature
-    input_list.append(Embarked)  # Todo: Convert to feature
     # 'res' is a list of predictions returned as the label.
     res = model.predict(np.asarray(input_list).reshape(1, -1))
@@ -70,10 +168,12 @@ def titanic(Pclass, Sex, Age, SibSp, Parch, Fare, Cabin, Embarked):
     return img
-inputs = []
-numericalInputs = ["Age", "SibSp", "Parch", "Fare", "Pclass"]
-worthlessInputs = ["Name", "Ticket"]
-categoricalInputs = ["Sex", "Embarked", "Cabin"]
 featureLabels = ["Pclass", "Name", "Sex", "Age", "SibSp",
                  "Parch", "Ticket", "Fare", "Cabin", "Embarked"]
@@ -86,7 +186,7 @@ for feature in featureLabels:
         # inputs.append(gr.Inputs.Textbox(default='text', label=feature))
     elif feature in categoricalInputs:
         inputs.append(gr.inputs.Dropdown(
-            choices=["a", "b"], default="a", label=feature))
     else:
         raise Exception(f'Feature: "{feature}" not found')

 from PIL import Image
 import requests
 import pandas as pd
+import matplotlib.pyplot as plt
+import numpy as np
+from sklearn import preprocessing
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import RandomForestRegressor
 import hopsworks
 import joblib
def initialize_data(df):
    """Strip unused columns and incomplete rows from a Titanic passenger frame.

    The frame is modified in place and the same object is returned.

    Columns of the raw data set:
        Survived: label
        Pclass:   ticket class (1 = 1st, 2 = 2nd, 3 = 3rd)
        Name:     name of the passenger
        Sex:      male/female
        Age:      age in years
        SibSp:    no. of siblings / spouses aboard the Titanic
        Parch:    no. of parents / children aboard the Titanic
        Ticket:   ticket number
        Fare:     passenger fare
        Cabin:    cabin number
        Embarked: port of embarkation (C = Cherbourg, Q = Queenstown,
                  S = Southampton)

    Args:
        df: pandas DataFrame holding some or all of the columns above. It
            must contain an "Embarked" column.

    Returns:
        The same DataFrame without "PassengerId", "Name", "Ticket" and
        "Cabin", and without rows whose "Embarked" value is missing.

    Raises:
        KeyError: if ``df`` has no "Embarked" column.
    """
    # PassengerId is only an identifier, and Name / Ticket / Cabin are not used
    # as features (Cabin is mostly missing in the raw data).
    # errors="ignore" matters: the prediction path builds its frame from the UI
    # fields only, so these columns do not exist there and a strict drop would
    # raise KeyError on every request.
    df.drop(
        columns=["PassengerId", "Name", "Ticket", "Cabin"],
        errors="ignore",
        inplace=True,
    )

    # Only a couple of rows lack Embarked, so those rows are simply removed.
    df.dropna(subset=["Embarked"], inplace=True)

    # NOTE: missing Age values are NOT imputed here, and no Title / Surname /
    # cabin-prefix features are derived; those ideas were only sketched in
    # commented-out code.
    return df
def prepare_for_write(df):
    """Encode the categorical columns as integers and lower-case the headers.

    The frame is modified in place and the same object is returned.

    Encoding:
        Sex:      "male" -> 0, "female" -> 1
        Embarked: "S" -> 0, "C" -> 1, "Q" -> 2

    Args:
        df: pandas DataFrame with "Sex" and "Embarked" columns holding the
            string values listed above.

    Returns:
        The same DataFrame with integer-coded "sex" and "embarked" columns
        and every column name lower-cased.

    Raises:
        ValueError: if a "Sex" or "Embarked" cell holds any other value
            (including a missing value).
        KeyError: if either column is absent.
    """
    sex_codes = {"male": 0, "female": 1}
    embarked_codes = {"S": 0, "C": 1, "Q": 2}

    def _encode(value, codes, label):
        # One table lookup per cell; anything outside the table is rejected.
        try:
            return codes[value]
        except (KeyError, TypeError):
            # The f-string formats any value type. Concatenating with "+"
            # would itself fail with TypeError for NaN or numbers and hide
            # the real problem.
            raise ValueError(f"Unsupported {label} value: {value}") from None

    df["Sex"] = df["Sex"].apply(lambda v: _encode(v, sex_codes, "sex"))
    df["Embarked"] = df["Embarked"].apply(
        lambda v: _encode(v, embarked_codes, "embarked"))

    # Column names are normalised to lower case before the frame is handed on.
    df.columns = df.columns.str.lower()
    return df
 project = hopsworks.login()
 fs = project.get_feature_store()
 # featureLabels = features.columns
 featureLabels = ["Pclass", "Name", "Sex", "Age", "SibSp",
                  "Parch", "Ticket", "Fare", "Cabin", "Embarked"]
+inputs = []  # Gradio input components; appended to by the loop over featureLabels
+numericalInputs = ["Age", "SibSp", "Parch", "Fare"]  # numeric features (their widget branch is not visible in this hunk)
+# Maybe move cabin to categorical
+worthlessInputs = ["Name", "Ticket", "Cabin"]  # the same columns initialize_data drops
+categoricalInputs = ["Sex", "Embarked", "Pclass"]  # rendered as gr.inputs.Dropdown widgets
+columnHeaders = ["Pclass", "Sex", "Age", "SibSp",
+                 "Parch", "Fare", "Embarked"]  # column order of the one-row DataFrame built in titanic()
+def titanic(Pclass, Sex, Age, SibSp, Parch, Fare, Embarked):
+    # Create a dataframe from the input values
+    input_variables = pd.DataFrame(
+        [[Pclass, Sex, Age, SibSp, Parch, Fare, Embarked]], columns=columnHeaders)
+    df = initialize_data(input_variables)
+    df = prepare_for_write(df)
+    # Save first row as a numpy array
+    input_list = df.iloc[0].to_numpy()
     # 'res' is a list of predictions returned as the label.
     res = model.predict(np.asarray(input_list).reshape(1, -1))
     return img
# Dropdown choices for each categorical feature, looked up by feature label.
catToInput = {
    # Must stay in sync with the values prepare_for_write knows how to encode.
    "Sex": ["male", "female"],
    "Embarked": ["S", "C", "Q"],
    # Ticket class is 1-based (1 = 1st, 2 = 2nd, 3 = 3rd) and is passed to the
    # model without remapping, so the choices must be 1..3 rather than 0..2.
    "Pclass": [1, 2, 3],
}
 featureLabels = ["Pclass", "Name", "Sex", "Age", "SibSp",
                  "Parch", "Ticket", "Fare", "Cabin", "Embarked"]
         # inputs.append(gr.Inputs.Textbox(default='text', label=feature))
     elif feature in categoricalInputs:
         inputs.append(gr.inputs.Dropdown(
+            choices=catToInput.get(feature), default="a", label=feature))
     else:
         raise Exception(f'Feature: "{feature}" not found')

requirements.txt CHANGED Viewed

@@ -2,3 +2,6 @@ hopsworks
 joblib
 scikit-learn
 gradio

 joblib
 scikit-learn
 gradio
+numpy
+pandas
+requests