|
import seaborn as sns |
|
import pandas as pd |
|
import os |
|
from sklearn.linear_model import LogisticRegression |
|
|
|
def create_new_dataframe(data, column_names): |
|
new_data = {} |
|
|
|
for column in column_names: |
|
if column in data.columns: |
|
new_data[column] = data[column] |
|
else: |
|
new_data[column] = pd.Series(0, index=data.index) |
|
|
|
new_dataframe = pd.DataFrame(new_data) |
|
return new_dataframe |
|
|
|
|
|
df = pd.read_csv("train.csv") |
|
num_examples = df.shape[0] |
|
df = df.sample(frac = 1, random_state=1) |
|
train_data = df[0:int(0.8*num_examples)] |
|
val_data = df[int(0.8*num_examples)+1:] |
|
|
|
train_data[["Deck", "Cabin_num", "Side"]] = train_data["Cabin"].str.split("/", expand=True) |
|
train_data = train_data.drop('Cabin', axis=1) |
|
|
|
val_data[["Deck", "Cabin_num", "Side"]] = val_data["Cabin"].str.split("/", expand=True) |
|
val_data = val_data.drop('Cabin', axis=1) |
|
|
|
TargetY = train_data["Transported"] |
|
TargetY_test = val_data["Transported"] |
|
|
|
|
|
|
|
selectColumns = ["HomePlanet", "CryoSleep", "Destination", "VIP", "Deck", "Side"] |
|
ResourceX = pd.get_dummies(train_data[selectColumns]) |
|
ResourceX_test = pd.get_dummies(val_data[selectColumns]) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
print(f"Train Accuracy: {train_accuracy}") |
|
print(f"Validation Accuracy: {val_accuracy}") |
|
|
|
test_data = pd.read_csv('test.csv') |
|
test_data[["Deck", "Cabin_num", "Side"]] = test_data["Cabin"].str.split("/", expand=True) |
|
test_data = test_data.drop('Cabin', axis=1) |
|
|
|
test_X = pd.get_dummies(test_data[selectColumns]) |
|
test_X.insert(loc = 17, |
|
column = 'Deck_T', |
|
value = 0) |
|
|
|
test_preds = model.predict(test_X) |
|
|
|
|
|
output = pd.DataFrame({'PassengerId': test_data.PassengerId, |
|
'Transported': test_preds}) |
|
output.to_csv('submission.csv', index=False) |