house-pricing-v1 / utils.py
RMHalak's picture
Update utils.py
344f42c verified
# create new features
def create_new_features(df):
    """Derive the age, renovation and basement features from raw listing data.

    Reads the ``date`` (datetime), ``yr_built``, ``yr_renovated`` and
    ``sqft_basement`` columns and returns a NEW frame in which:

    * ``date`` is removed,
    * ``house_age`` is the sale year minus ``yr_built``,
    * ``years_since_renovation`` is the sale year minus ``yr_renovated``,
    * ``has_basement`` is 1 where ``sqft_basement`` is positive, else 0,
    * ``yr_renovated`` values of 0 are replaced by ``yr_built``.

    The frame passed in is left untouched.
    """
    # Keep the sale year in a local Series instead of writing a temporary
    # column: the original temporary column leaked into the caller's frame.
    year_sold = df['date'].dt.year
    out = df.drop(columns=['date'])

    out['house_age'] = year_sold - out['yr_built']
    # Computed BEFORE the 0 placeholder in `yr_renovated` is replaced below,
    # so rows with yr_renovated == 0 get the full sale year here.
    # NOTE(review): preserved on purpose because changing it would shift the
    # feature's values -- confirm this matches how the model was trained.
    out['years_since_renovation'] = year_sold - out['yr_renovated']

    out['has_basement'] = (out['sqft_basement'] > 0).astype('int64')

    never_renovated = out['yr_renovated'] == 0
    out.loc[never_renovated, 'yr_renovated'] = out.loc[never_renovated, 'yr_built']
    return out
def normalize(df):
    """Min-max scale the numerical feature columns of ``df`` in place.

    The per-column minimum and maximum are loaded from ``./min_dict.json``
    and ``./max_dict.json`` (resolved against the current working directory).
    Each listed column is rewritten as ``(x - min) / (max - min)``; other
    columns are not touched.

    Returns the same ``df`` object that was passed in.

    Raises:
        FileNotFoundError: if either JSON file is missing.
        KeyError: if a listed column is absent from ``df`` or from either
            dictionary.
    """
    import json

    with open("./min_dict.json", "r", encoding="utf-8") as f:
        min_dict = json.load(f)
    with open("./max_dict.json", "r", encoding="utf-8") as f:
        max_dict = json.load(f)

    numerical_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront',
                          'view', 'condition', 'sqft_above', 'sqft_basement',
                          'yr_built', 'yr_renovated', 'house_age', 'years_since_renovation']

    for col in numerical_features:
        lowest = min_dict[col]
        span = max_dict[col] - lowest
        if span == 0:
            # A constant feature would otherwise divide by zero; with a unit
            # span the column becomes its offset from the stored minimum.
            span = 1
        # Whole-column arithmetic instead of a per-element Python lambda.
        df[col] = (df[col] - lowest) / span
    return df
def bucketize(df):
    """Round the square-footage columns of ``df`` down to a multiple of 25.

    ``sqft_living``, ``sqft_lot``, ``sqft_above`` and ``sqft_basement`` are
    each replaced, in place, by ``(value // size) * size`` using the bucket
    size configured below. Other columns are not touched.

    Returns the same ``df`` object that was passed in.
    """
    bucket_sizes = {
        'sqft_living': 25,
        'sqft_lot': 25,
        'sqft_above': 25,
        'sqft_basement': 25,
    }
    for col, size in bucket_sizes.items():
        # Whole-column floor division instead of a per-element Python lambda.
        df[col] = (df[col] // size) * size
    return df
def init_new_pred():
    """Build a blank prediction record.

    Returns a dict holding every numeric / engineered feature and every
    one-hot ``city_*`` indicator set to 0, followed by a ``date`` entry
    fixed to 2014-07-10.
    """
    import pandas as pd

    base_features = (
        'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
        'waterfront', 'view', 'condition', 'sqft_above', 'sqft_basement',
        'yr_built', 'yr_renovated', 'house_age', 'years_since_renovation',
        'has_basement',
    )
    # City names behind the one-hot columns; each becomes a 'city_<name>' key.
    cities = (
        'Algona', 'Auburn', 'Beaux Arts Village', 'Bellevue',
        'Black Diamond', 'Bothell', 'Burien', 'Carnation', 'Clyde Hill',
        'Covington', 'Des Moines', 'Duvall', 'Enumclaw', 'Fall City',
        'Federal Way', 'Inglewood-Finn Hill', 'Issaquah', 'Kenmore', 'Kent',
        'Kirkland', 'Lake Forest Park', 'Maple Valley', 'Medina',
        'Mercer Island', 'Milton', 'Newcastle', 'Normandy Park',
        'North Bend', 'Pacific', 'Preston', 'Ravensdale', 'Redmond',
        'Renton', 'Sammamish', 'SeaTac', 'Seattle', 'Shoreline',
        'Skykomish', 'Snoqualmie', 'Snoqualmie Pass', 'Tukwila', 'Vashon',
        'Woodinville', 'Yarrow Point',
    )

    record = dict.fromkeys(base_features, 0)
    record.update(dict.fromkeys(('city_' + name for name in cities), 0))

    # Do not change this date.
    # NOTE(review): presumably tied to the period the model was trained on -- confirm.
    record['date'] = pd.to_datetime('2014-07-10')
    return record