# Source: Hugging Face Spaces - Nathanotal / stockholmHousingValuation
# Space status when captured: Runtime error
# File size: 14,822 Bytes

import gradio as gr
import numpy as np
from PIL import Image
import requests
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import joblib
import hopsworks
from tqdm import tqdm
import xgboost as xgb
from geopy.geocoders import Nominatim
from datetime import date
from datetime import timedelta
from autogluon.tabular import TabularPredictor
import shutil

# Login to hopsworks and get the feature store

# Column order of one apartment record. The full semicolon-separated layout is
# streetName;number;sqm;rooms;soldDate;monthlyFee;monthlyCost;floor;yearBuilt;brf;agency;lat;lon;gdp;unemployment;interestRate
# but gdp, unemployment and interestRate are not part of the initial columns.
columnHeaders = [
    'streetName',
    'number',
    'sqm',
    'rooms',
    'soldDate',
    'monthlyFee',
    'monthlyCost',
    'floor',
    'yearBuilt',
    'brf',
    'agency',
    'lat',
    'lon',
]

# (min, max) range per numerical feature, extracted from the data.
# Used both for normalisation and for validating user input.
featureToMinMax = dict(
    sqm=(10, 800),
    rooms=(1, 20),
    monthlyFee=(0, 60000),
    monthlyCost=(0, 20000),
    floor=(-3, 35),
    yearBuilt=(1850, 2023),
    lat=(58.8, 60.2),
    lon=(17.5, 19.1),
    gdp=(505.1, 630.14),
    unemployment=(6.36, 8.66),
    interestRate=(-0.5, 2.64),
    number=(0, 300),
    soldDate=(2010, 2025),
)

# Human-readable label for each user-supplied feature.
featureToName = dict(
    number='Street number',
    sqm='Size of the apartment in square meters',
    rooms='Number of rooms',
    monthlyFee='Monthly fee',
    monthlyCost='Monthly operating cost',
    floor='Floor',
    yearBuilt='Year built',
    streetName='Name of street',
)

# Agencies that are compared against each other in the AutoGluon prediction.
topAgencies = [
    'Fastighetsbyrån',
    'Notar',
    'Svensk Fastighetsförmedling',
    'HusmanHagberg',
    'Länsförsäkringar Fastighetsförmedling',
    'Erik Olsson',
    'SkandiaMäklarna',
    'Svenska Mäklarhuset',
    'Bjurfors',
    'Mäklarhuset',
    'BOSTHLM',
    'Innerstadsspecialisten',
    'MOHV',
    'Mäklarringen',
    'Historiska Hem',
    'Södermäklarna',
    'Karlsson & Uddare',
    'UNIK Fastighetsförmedling',
    'Edward & Partners',
    'Widerlöv',
]

def downloadAutogluonModel():
    """Download the AutoGluon model from Hopsworks and load it.

    Logs in to Hopsworks, downloads version 5 of the registry model
    "ag_model_20230109", moves the downloaded folder with moveFolder() and
    loads a TabularPredictor from "AutogluonModels/ag_model_20230109".

    Returns:
        The loaded TabularPredictor.
    """
    registry = hopsworks.login().get_model_registry()
    registryEntry = registry.get_model("ag_model_20230109", version=5)
    downloadedPath = registryEntry.download()
    print(downloadedPath)

    # The predictor is loaded from a fixed relative path, so the download
    # has to be moved there first.
    moveFolder(downloadedPath)
    return TabularPredictor.load("AutogluonModels/ag_model_20230109")


def moveFolder(temp_ag_folder_path):
    """Move the downloaded AutoGluon model folder to "AutogluonModels/".

    Args:
        temp_ag_folder_path: Path of the folder to move.
    """
    shutil.move(temp_ag_folder_path, "AutogluonModels/")

def downloadModel():
    """Download the XGBoost model from Hopsworks and load it.

    Logs in to Hopsworks, downloads version 5 of the registry model
    "xgboost_model" and unpickles "xgboost_model.pkl" from the download
    location with joblib.

    Returns:
        The object stored in "xgboost_model.pkl".
    """
    registry = hopsworks.login().get_model_registry()
    downloadDir = registry.get_model("xgboost_model", version=5).download()
    return joblib.load(downloadDir + "/xgboost_model.pkl")

def getAddressInfo(streetName, number):
    """Look up the coordinates of an address.

    The street name is passed through cleanAddress() before geocoding.

    Args:
        streetName: Name of the street.
        number: Street number.

    Returns:
        A (lat, lon) tuple, or (None, None) when the geocoder raises
        AddressNotFound.
    """
    cleanedStreet = cleanAddress(streetName)
    try:
        coordinates = getCoordinatesFromAddress(cleanedStreet, number)
    except AddressNotFound:
        coordinates = (None, None)
    return coordinates

# Adds the financial data to the apartment data
def populateApartmentData(aptDf):
    """Add the financial columns gdp, unemployment and interestRate to aptDf.

    Each column is looked up from a semicolon-separated CSV under ./data,
    interpolated with interpolateTime() and matched on the row's 'soldDate'
    via getValueFromTime().

    Args:
        aptDf: DataFrame with a 'soldDate' column. It is modified in place.

    Returns:
        The same DataFrame with the three columns added.
    """
    print('Populating with financial data...')
    csvByColumn = {
        'gdp': './data/historicalGDP.csv',
        'unemployment': './data/historicalUnemployment.csv',
        'interestRate': './data/historicalInterest.csv',
    }

    # Read every file before touching aptDf, then interpolate, then assign.
    rawTables = {column: pd.read_csv(path, sep=';') for column, path in csvByColumn.items()}
    monthlyTables = {column: interpolateTime(table) for column, table in rawTables.items()}

    for column, table in monthlyTables.items():
        aptDf[column] = aptDf['soldDate'].apply(getValueFromTime, args=(table,))
    return aptDf
    
def interpolateTime(df):
    """Resample a dated table to month starts and fill gaps by interpolation.

    Args:
        df: DataFrame with a 'date' column; that column is converted to
            datetimes in place on the passed frame.

    Returns:
        A new DataFrame indexed by month-start dates ('MS' resampling, mean
        per month, time-based interpolation), passed through fixChange().
    """
    df['date'] = pd.to_datetime(df['date'])
    monthly = df.set_index('date').resample('MS').mean().interpolate(method='time')
    return fixChange(monthly)

def getValueFromTime(datetime, dataDf):
    """Return the 'value' entry of dataDf for the month of the given date.

    The date is snapped to the first day of its month (time of day dropped)
    and looked up in the index of dataDf. If that month is missing, the
    following month is tried. If that is missing too, the date is printed
    and 0 is returned, so callers always get a number instead of None.

    Args:
        datetime: Anything pd.to_datetime() accepts (e.g. 'YYYY-MM-DD').
        dataDf: DataFrame indexed by dates with a 'value' column.

    Returns:
        The matching value, or 0 when neither month is present.
    """
    monthStart = pd.to_datetime(datetime).normalize().replace(day=1)
    # DateOffset takes care of the December -> January (next year) rollover.
    candidates = (monthStart, monthStart + pd.DateOffset(months=1))

    for candidate in candidates:
        try:
            return dataDf.loc[candidate, 'value']
        except KeyError:
            continue

    # Not covered by the data: report the date and fall back to 0.
    print(datetime)
    return 0

def fixChange(df):
    """Add a 'change' column holding the row-to-row difference of 'value'.

    The first row has no predecessor, so its change is set to 0.

    Args:
        df: DataFrame with a 'value' column. It is modified in place.

    Returns:
        The same DataFrame.
    """
    df['change'] = df['value'].diff().fillna(0)
    return df

def cleanAddress(x):
    """Remove hyphens and invisible characters from a street name.

    Deleted characters: '-', zero width space (U+200B), non-breaking space
    (U+00A0), non-breaking hyphen (U+2011), soft hyphen (U+00AD) and zero
    width non-joiner (U+200C). Leading and trailing whitespace is stripped
    afterwards.

    Args:
        x: The street name.

    Returns:
        The cleaned street name.
    """
    unwanted = str.maketrans('', '', '-\u200b\u00a0\u2011\xad\u200c')
    # str.strip() returns a new string, so its result must be the one returned.
    return x.translate(unwanted).strip()

class AddressNotFound(Exception):
    """Raised when the geocoder returns no match for an address."""

def getCoordinatesFromAddress(streetName, number):
    """Geocode a Stockholm address with the Nominatim instance configured below.

    Args:
        streetName: Name of the street.
        number: Street number; converted via float -> int -> str. A value of
            0 means "no number" and only the street is searched.

    Returns:
        (lat, lon), each rounded to 6 decimals (accuracy of <1 meter).

    Raises:
        AddressNotFound: If the geocoder returns no result.
    """
    HOST_ADDRESS = '165.227.162.37'
    HOST_PORT = '8080'
    # NOTE(review): this value looks like an e-mail obfuscation placeholder
    # rather than a real address - confirm what the user agent should be.
    EMAIL = '[email protected]'
    geocoder = Nominatim(
        user_agent=EMAIL,
        domain=f'{HOST_ADDRESS}:{HOST_PORT}',
        scheme='http',
        timeout=10,
    )

    houseNumber = str(int(float(number)))
    if houseNumber == '0':
        query = f'{streetName}, Stockholm'
    else:
        query = f'{streetName} {houseNumber}, Stockholm'

    match = geocoder.geocode(query)
    if match is None:
        raise AddressNotFound

    return round(match.latitude, 6), round(match.longitude, 6)

def dateToFloat(date):
    """Encode a 'YYYY-MM-DD[ time]' date as year + month / 12 + day / 365.

    Args:
        date: A value whose str() form is 'YYYY-MM-DD', optionally followed
            by a space and a time part (which is ignored).

    Returns:
        The date as a float.
    """
    yearText, monthText, dayText = str(date).split('-')
    # Drop a trailing time component such as ' 00:00:00'.
    dayText = dayText.partition(' ')[0]
    return int(yearText) + int(monthText) / 12 + int(dayText) / 365

def normalize(x, minVal, maxVal, feature):
    """Scale x linearly from [minVal, maxVal] to [0, 1], clamping outliers.

    Args:
        x: Value to scale; converted with float().
        minVal: Value mapped to 0.
        maxVal: Value mapped to 1.
        feature: Name of the feature (not used in the calculation).

    Returns:
        The scaled value; 0 if it falls below the range, 1 if above.
    """
    # Not fantastic
    scaled = (float(x) - minVal) / (maxVal - minVal)
    if scaled < 0:
        return 0
    if scaled > 1:
        return 1
    return scaled

def normalizeData(df):
    """Normalise every feature listed in featureToMinMax to a value in [0, 1].

    'soldDate' is first converted to a float with dateToFloat(); every
    feature is then scaled with normalize() using its (min, max) range.

    Args:
        df: DataFrame containing all features in featureToMinMax. It is
            modified in place.

    Returns:
        The same DataFrame.
    """
    print('Normalizing data...')
    for feature, (lower, upper) in tqdm(featureToMinMax.items()):
        if feature == 'soldDate':
            df[feature] = df[feature].apply(dateToFloat)

        df[feature] = df[feature].apply(normalize, args=(lower, upper, feature))

    return df

def parsePrice(price, isDifference=False):
    """Convert a normalised price back to SEK and format it for display.

    An absolute price is de-normalised as price * (MAX - MIN) + MIN. A
    difference between two normalised prices must only be scaled: adding
    MIN to it would shift the result by the minimum price. Pass
    isDifference=True for such values.

    Args:
        price: Normalised price (or price difference); converted with float().
        isDifference: When True, treat price as a difference and do not add
            the minimum price. Defaults to False (absolute price).

    Returns:
        A string such as '1.000.000 SEK' (fractions of a SEK are truncated).
    """
    featureToMinMaxPrice = {
        'price': (1.5e5, 7e7)
    }
    MIN, MAX = featureToMinMaxPrice['price']

    amount = float(price) * (MAX - MIN)
    if not isDifference:
        amount = amount + MIN
    return f'{addDotsToPrice(int(amount))} SEK'

def addDotsToPrice(price):
    """Insert '.' as thousands separator: 1000000 -> '1.000.000'.

    Args:
        price: An int (a leading '-' is kept without a separator after it).

    Returns:
        The formatted string.
    """
    text = str(price)
    lastIndex = len(text) - 1
    pieces = []
    for position, char in enumerate(text):
        pieces.append(char)
        # A separator follows a character when the number of characters from
        # it to the end is one more than a multiple of three.
        charsFromHere = len(text) - position
        if charsFromHere % 3 == 1 and position != lastIndex and char != '-':
            pieces.append('.')
    return ''.join(pieces)
        
        

def xgbFix(df):
    """Cast the columns of df to the dtypes used for the XGBoost model.

    streetName, brf and agency become 'category'; all remaining model
    features become float.

    Args:
        df: DataFrame containing all listed columns. It is modified in place.

    Returns:
        The same DataFrame.
    """
    categoricalColumns = ["streetName", "brf", "agency"]
    floatColumns = [
        "number", "sqm", "rooms", "monthlyFee",
        "monthlyCost", "floor", "yearBuilt", "gdp", "unemployment",
        "interestRate", "lat", "lon", "soldDate",
    ]

    df[categoricalColumns] = df[categoricalColumns].astype("category")
    df[floatColumns] = df[floatColumns].astype(float)
    return df


# Load both models once at import time; xgboostPred() reads `model` and
# autoPred() reads `autoModel` as module-level globals.
# Each helper performs its own Hopsworks login and registry download.
model = downloadModel()
autoModel = downloadAutogluonModel()

def xgboostPred(df):
    """Predict one value per row of df with the module-level XGBoost model.

    The categorical columns streetName, brf and agency are dropped first;
    each remaining row is passed to model.predict() as a 1 x n array.

    Args:
        df: DataFrame with the model features plus the three categorical
            columns.

    Returns:
        A list with the first prediction for every row, in row order.
    """
    numericDf = df.drop(['streetName', 'brf', 'agency'], axis=1)

    # This is not done in a good way: the model is called once per row.
    return [
        model.predict(np.asarray(row.to_numpy()).reshape(1, -1))[0]
        for _, row in numericDf.iterrows()
    ]

def addExtraAgencyFun(df, agencies=None):
    """Append one copy of the first row of df per agency.

    Each copy is identical to the first row except for its 'agency' value,
    so that predictions for the same apartment can be compared across
    agencies.

    The copies are taken with positional indexing, which keeps the column
    dtypes of df, and the 'agency' column is assigned as a whole column
    instead of element by element through chained indexing (which is not
    guaranteed to write through and does nothing under pandas
    Copy-on-Write).

    Args:
        df: DataFrame with at least one row and an 'agency' column. It is
            not modified.
        agencies: Agency names to add; defaults to the module-level
            topAgencies.

    Returns:
        A new DataFrame: the rows of df followed by one row per agency,
        with a fresh 0..n-1 index.
    """
    if agencies is None:
        agencies = topAgencies
    agencies = list(agencies)

    # Repeat row 0 once per agency; .iloc with a list returns a new frame.
    agencyRows = df.iloc[[0] * len(agencies)].reset_index(drop=True)
    agencyRows['agency'] = agencies

    return pd.concat([df, agencyRows], ignore_index=True)

def autoPred(df):
    """Predict prices with the AutoGluon model and rank the top agencies.

    addExtraAgencyFun() appends one extra row per entry of topAgencies, so
    the last len(topAgencies) predictions belong to the agencies and the
    ones before them to the original rows of df.

    Args:
        df: DataFrame with the model features (one row per sale date).

    Returns:
        A tuple (predictions for the original rows, text describing the
        five agencies with the highest predicted price).
    """
    # Derived from the list itself so the split stays correct if the list
    # of agencies changes.
    agencyCount = len(topAgencies)

    predictions = autoModel.predict(addExtraAgencyFun(df)).tolist()
    agencyResults = predictions[-agencyCount:]
    res = predictions[:-agencyCount]

    agencyToResult = dict(zip(topAgencies, agencyResults))
    for agency, result in agencyToResult.items():
        print(agency, str(result))

    meanPrice = sum(agencyResults) / len(agencyResults)

    # Sort ascending, then take the last five in descending order.
    rankedAgencies = sorted(agencyToResult.items(), key=lambda item: item[1])
    topFive = rankedAgencies[-5:]
    topFive.reverse()

    agencyString = parseAgencyResult(topFive, meanPrice)

    return res, agencyString

def parseAgencyResult(top3, meanPrice):
    """Build the text that lists the best agencies and their predicted prices.

    Args:
        top3: Iterable of (agency, predicted price) pairs, best first. The
            heading says "Top 5", so five pairs are expected despite the
            parameter name.
        meanPrice: Mean predicted price over all agencies.

    Returns:
        A multi-line string with one line per agency.
    """
    parts = [
        'To get the most money for your apartment, you should sell it with the help of one of these agencies:\n',
        'Top 5:\n',
    ]
    for agency, result in top3:
        # NOTE(review): parsePrice() de-normalises an absolute price, i.e. it
        # adds the minimum price; applied to a difference this shifts the
        # "above mean" figure by that minimum - confirm and correct.
        diff = result - meanPrice
        parts.append(f'{agency}: {parsePrice(result)} ({parsePrice(diff)} above mean)\n')

    return ''.join(parts)

def isValidInput(streetName, number, sqm, rooms, monthlyFee, monthlyCost, floor, yearBuilt):
    """Validate the user input before it is sent to the models.

    The street name must be non-empty (whitespace only counts as empty) and
    free of digits. Every numerical value must be present and lie within
    its range in featureToMinMax.

    Returns:
        None when everything is valid, otherwise a message describing the
        first problem found.
    """
    if not streetName or not streetName.strip():
        return 'Street name is empty'
    # If Street name contains numbers it should fail
    if any(char.isdigit() for char in streetName):
        return 'Only letters are allowed in street name'

    numericValues = {
        'number': number,
        'sqm': sqm,
        'rooms': rooms,
        'monthlyFee': monthlyFee,
        'monthlyCost': monthlyCost,
        'floor': floor,
        'yearBuilt': yearBuilt,
    }
    for name, val in numericValues.items():
        label = featureToName.get(name)
        # A missing value would otherwise fail with a TypeError on the
        # range comparison below.
        if val is None:
            return f'{label} is missing'
        lower, upper = featureToMinMax[name]
        if val < lower:
            return f'{label} is too low'
        if val > upper:
            return f'{label} is too high'

    return None

def getDates():
    """Return the sale dates to predict for, mapped to a description.

    Returns:
        A dict from 'YYYY-MM-DD' string to explanation, in this order:
        today, in a year (+365 days), last year (-365 days), three years ago
        (-3 * 365 days) and the fixed date 2022-02-24.
    """
    today = date.today()
    isoFormat = "%Y-%m-%d"
    beforeUkraineWar = '2022-02-24'

    labelledDates = [
        (today.strftime(isoFormat), 'today'),
        # ((today + timedelta(days=30)).strftime(isoFormat), 'in a month'),
        ((today + timedelta(days=365)).strftime(isoFormat), 'in a year'),
        ((today - timedelta(days=365)).strftime(isoFormat), 'last year'),
        ((today - timedelta(days=365*3)).strftime(isoFormat), 'three years ago'),
        (beforeUkraineWar, 'before Russia invaded Ukraine'),
    ]

    dateToExplanation = {}
    for dateString, explanation in labelledDates:
        dateToExplanation[dateString] = explanation
    return dateToExplanation


def sthlm(streetName, number, sqm, rooms, monthlyFee, monthlyCost, floor, yearBuilt, auto):
    """Predict the price of an apartment for the dates given by getDates().

    The input is validated, the address is geocoded, one record per sale
    date is built and enriched with financial data, everything is
    normalised and then sent to either the AutoGluon or the XGBoost model.

    Args:
        streetName: Name of the street.
        number, sqm, rooms, monthlyFee, monthlyCost, floor, yearBuilt:
            Numerical apartment features.
        auto: Use the AutoGluon model when truthy, otherwise XGBoost.

    Returns:
        A 4-tuple of strings: (main prediction, comparison with the other
        dates, agency information, error message). On invalid input or an
        unknown address the tuple is ('0', '', '', <error message>).
    """
    inputErrors = isValidInput(streetName, number, sqm, rooms, monthlyFee, monthlyCost, floor, yearBuilt)
    if inputErrors is not None:
        return '0', '', '', inputErrors

    lat, lon = getAddressInfo(streetName, number)
    if lat is None or lon is None:
        return '0', '', '', 'Address not found in the OpenStreetMap dataset (Nominatim), please try another address'

    agency = 'Notar' # Make fun if categorical works
    brf = 'BRF Kartboken 1' # TODO: remove
    dates = getDates()

    # One record per sale date. The frame is built in a single step:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    rows = [
        [streetName, number, sqm, rooms, soldDate, monthlyFee, monthlyCost, floor, yearBuilt, brf, agency, lat, lon]
        for soldDate in dates
    ]
    input_variables = pd.DataFrame(rows, columns=columnHeaders)

    df = populateApartmentData(input_variables)
    df = normalizeData(df)

    agencyInfo = 'Please use AutoGluon instead of XGBoost to get information about agencies'
    if auto:
        pricePred, agencyInfo = autoPred(df)
    else:
        df = xgbFix(df)
        pricePred = xgboostPred(df)

    # The first date is the main prediction; all others are compared to it.
    mainPred = None
    mainExplanation = None
    result = []
    for i, (explanation, pred) in enumerate(zip(dates.values(), pricePred)):
        if i == 0:
            mainExplanation = explanation
            mainPred = pred
            continue

        # NOTE(review): parsePrice() de-normalises an absolute price, i.e. it
        # adds the minimum price; applied to `diff` this shifts the bracketed
        # figure by that minimum - confirm and correct.
        diff = pred - mainPred
        if diff > 0:
            result.append(f'If sold {explanation} it would have been worth more: {parsePrice(pred)} (+{parsePrice(diff)})')
        else:
            result.append(f'If sold {explanation} it would have been worth less: {parsePrice(pred)} ({parsePrice(diff)})')

    return f'Predicted price of the apartment {mainExplanation}: {parsePrice(mainPred)}', '\n'.join(result), agencyInfo, ''



# Numerical features the user fills in; the street name gets its own text box.
numericalInputs = ['number', 'sqm','rooms', 'monthlyFee','monthlyCost','floor','yearBuilt']
inputs = [gr.inputs.Textbox(lines=1, label='streetName')]

# One number field per numerical feature, labelled with its allowed range.
for feature in numericalInputs:
    minVal, maxVal = featureToMinMax[feature]
    theLabel = f'{featureToName.get(feature)} (min: {minVal}, max: {maxVal})'
    inputs.append(gr.inputs.Number(default=0, label=theLabel))

# Switch between the XGBoost (default) and the AutoGluon model.
inputs.append(gr.inputs.Checkbox(label='Use AutoGluon instead of XGBoost', default=False))

# Output widgets, in the order of the values returned by sthlm().
resultOutputs = [
    gr.outputs.Label(label='Price if sold today'),
    gr.outputs.Textbox(label='If sold at a different time'),
    gr.outputs.Textbox(label='Best agencies to use'),
    gr.outputs.Textbox(label='Error', type='error'),
]

# Build the interface and start serving it.
demo = gr.Interface(
    fn=sthlm,
    inputs=inputs,
    outputs=resultOutputs,
    title="Stockholm Housing Valuation",
    description="Predict the price of an apartment in Stockholm\nTo get information about which agency to use, please select AutoGluon",
    allow_flagging="never",
)

demo.launch()