File size: 14,822 Bytes
725441e
 
 
 
 
 
 
 
 
 
f75cd82
c1e3254
3253628
 
3124988
af62000
725441e
 
 
263ad67
f77e0d0
725441e
3253628
 
 
 
 
 
 
 
 
 
 
 
 
 
 
725441e
870119b
 
 
 
 
 
 
 
 
 
 
1c8a8bf
 
3124988
 
83c39e6
3124988
6dd4540
3124988
 
af62000
3124988
af62000
3124988
 
 
af62000
3124988
 
 
 
 
 
725441e
 
83c39e6
725441e
678f4b7
725441e
 
 
 
 
 
c1e3254
3253628
 
 
 
725441e
5e80c1b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
625c426
 
5e80c1b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2f28d13
 
 
 
 
 
 
 
 
 
 
 
 
 
3253628
 
 
2f28d13
 
 
 
3253628
2f28d13
 
 
 
 
 
 
 
 
 
 
 
3253628
2f28d13
 
 
 
 
 
725441e
 
 
 
 
 
bada12c
 
725441e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0c7275f
3253628
de35eef
 
3253628
 
20c271f
 
3124988
09153df
 
 
 
 
fa48446
 
870119b
b1e45d9
09153df
 
 
 
0c7275f
8d6856a
263ad67
8d6856a
fc0233a
8d6856a
 
 
 
 
 
725441e
b1e45d9
725441e
b1e45d9
725441e
de8d0db
72698d6
263ad67
72698d6
 
 
dfddd70
 
 
 
 
3253628
dfddd70
72698d6
1c8a8bf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b1e45d9
1c8a8bf
b1e45d9
3124988
c8018ef
 
 
1c8a8bf
 
 
 
 
 
8766483
 
3124988
1c8a8bf
 
8766483
f8ed147
8766483
1c8a8bf
f8ed147
1c8a8bf
 
 
f8ed147
83c39e6
f8ed147
1c8a8bf
 
 
 
 
72698d6
bcf937e
 
 
 
4a487bf
 
 
 
bcf937e
 
 
 
 
 
870119b
bcf937e
870119b
bcf937e
 
 
3253628
 
a13ff21
3253628
 
 
 
 
 
 
a13ff21
3253628
 
 
 
 
 
 
 
 
263ad67
bcf937e
 
a3e1d9e
725441e
3253628
 
a3e1d9e
3253628
 
 
 
725441e
3253628
725441e
3253628
 
 
 
 
5e80c1b
 
 
 
3253628
3124988
a3e1d9e
3124988
1c8a8bf
3124988
b1e45d9
3124988
 
dbc0b49
202b26f
3124988
b62a97d
3253628
 
3124988
b62a97d
3124988
 
 
 
b1e45d9
3124988
b1e45d9
1c8a8bf
 
3253628
83c39e6
725441e
 
 
 
 
 
3124988
870119b
725441e
 
6d8bcbe
3253628
 
3124988
294f6a8
6d8bcbe
3124988
12299c2
6d8bcbe
725441e
83c39e6
3124988
725441e
 
 
a3e1d9e
725441e
 
3124988
725441e
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
import gradio as gr
import numpy as np
from PIL import Image
import requests
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import joblib
import hopsworks
from tqdm import tqdm
import xgboost as xgb
from geopy.geocoders import Nominatim
from datetime import date
from datetime import timedelta
from autogluon.tabular import TabularPredictor
import shutil

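# NOTE: the Hopsworks login is performed inside downloadModel() and downloadAutogluonModel() below (model registry only); nothing here opens a feature store.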

# Full schema: streetName;number;sqm;rooms;soldDate;monthlyFee;monthlyCost;floor;yearBuilt;brf;agency;lat;lon;gdp;unemployment;interestRate
# Only the first thirteen columns are listed here; 'gdp', 'unemployment' and
# 'interestRate' are added afterwards by populateApartmentData().
columnHeaders = [
    'streetName',
    'number',
    'sqm',
    'rooms',
    'soldDate',
    'monthlyFee',
    'monthlyCost',
    'floor',
    'yearBuilt',
    'brf',
    'agency',
    'lat',
    'lon',
]

# (min, max) bounds per feature, extracted from the data. They drive both the
# 0..1 scaling in normalizeData() and the range checks in isValidInput().
featureToMinMax = dict(
    sqm=(10, 800),
    rooms=(1, 20),
    monthlyFee=(0, 60000),
    monthlyCost=(0, 20000),
    floor=(-3, 35),
    yearBuilt=(1850, 2023),
    lat=(58.8, 60.2),
    lon=(17.5, 19.1),
    gdp=(505.1, 630.14),
    unemployment=(6.36, 8.66),
    interestRate=(-0.5, 2.64),
    number=(0, 300),
    soldDate=(2010, 2025),
)

# Human-readable label for each user-supplied feature; used in the form labels
# and in the validation messages.
featureToName = dict(
    number='Street number',
    sqm='Size of the apartment in square meters',
    rooms='Number of rooms',
    monthlyFee='Monthly fee',
    monthlyCost='Monthly operating cost',
    floor='Floor',
    yearBuilt='Year built',
    streetName='Name of street',
)

# Agencies that autoPred() compares against each other; addExtraAgencyFun()
# adds one synthetic row per entry, in this order.
topAgencies = [
    'Fastighetsbyrån',
    'Notar',
    'Svensk Fastighetsförmedling',
    'HusmanHagberg',
    'Länsförsäkringar Fastighetsförmedling',
    'Erik Olsson',
    'SkandiaMäklarna',
    'Svenska Mäklarhuset',
    'Bjurfors',
    'Mäklarhuset',
    'BOSTHLM',
    'Innerstadsspecialisten',
    'MOHV',
    'Mäklarringen',
    'Historiska Hem',
    'Södermäklarna',
    'Karlsson & Uddare',
    'UNIK Fastighetsförmedling',
    'Edward & Partners',
    'Widerlöv',
]

def downloadAutogluonModel():
    """Download the AutoGluon model from Hopsworks and load it.

    Logs in to Hopsworks, downloads version 5 of ``ag_model_20230109`` from
    the model registry, prints the download path, hands that path to
    ``moveFolder`` and finally loads the predictor from
    ``AutogluonModels/ag_model_20230109``.

    Returns:
        The object returned by ``TabularPredictor.load``.
    """
    registry = hopsworks.login().get_model_registry()
    downloadedPath = registry.get_model("ag_model_20230109", version=5).download()
    print(downloadedPath)

    # NOTE(review): the load path below assumes the move leaves the model at
    # AutogluonModels/ag_model_20230109 - confirm against the layout of the
    # downloaded folder.
    moveFolder(downloadedPath)
    return TabularPredictor.load("AutogluonModels/ag_model_20230109")


def moveFolder(temp_ag_folder_path):
    """Move the downloaded AutoGluon model folder to ``AutogluonModels/``.

    Thin wrapper around ``shutil.move``: when ``AutogluonModels/`` already
    exists as a directory the folder is moved *into* it, keeping its own name.

    Args:
        temp_ag_folder_path: Path of the folder to move.
    """
    shutil.move(temp_ag_folder_path, "AutogluonModels/")

def downloadModel():
    """Download the XGBoost model from Hopsworks and load it.

    Logs in to Hopsworks, downloads version 5 of ``xgboost_model`` from the
    model registry and loads ``xgboost_model.pkl`` from the downloaded folder.

    Returns:
        The object stored in ``xgboost_model.pkl``.
    """
    registry = hopsworks.login().get_model_registry()
    model_path = registry.get_model("xgboost_model", version=5).download()

    # NOTE(review): joblib.load unpickles the file, so the registry content
    # must be trusted.
    return joblib.load(model_path + "/xgboost_model.pkl")

def getAddressInfo(streetName, number):
    """Geocode an address, reporting an unknown address as ``(None, None)``.

    Args:
        streetName: Street name as typed by the user; cleaned with
            ``cleanAddress`` before the lookup.
        number: Street number, passed through to the geocoder helper.

    Returns:
        Whatever ``getCoordinatesFromAddress`` returns (a ``(lat, lon)``
        pair), or ``(None, None)`` when it raises ``AddressNotFound``.
    """
    cleanedStreet = cleanAddress(streetName)
    try:
        coordinates = getCoordinatesFromAddress(cleanedStreet, number)
    except AddressNotFound:
        coordinates = (None, None)
    return coordinates

def populateApartmentData(aptDf):
    """Add the financial columns to the apartment data.

    Reads the three ';'-separated history files under ``./data``, runs each
    through ``interpolateTime`` and, for every row, looks up the value that
    belongs to its ``soldDate`` with ``getValueFromTime``.

    Args:
        aptDf: Apartment frame with a ``soldDate`` column. It is modified in
            place: ``gdp``, ``unemployment`` and ``interestRate`` are added.

    Returns:
        The same ``aptDf`` object.
    """
    print('Populating with financial data...')
    sources = (
        ('gdp', './data/historicalGDP.csv'),
        ('unemployment', './data/historicalUnemployment.csv'),
        ('interestRate', './data/historicalInterest.csv'),
    )
    # Load every series before touching aptDf, so a missing file leaves the
    # frame unmodified.
    history = {
        column: interpolateTime(pd.read_csv(path, sep=';'))
        for column, path in sources
    }
    for column, series in history.items():
        aptDf[column] = aptDf['soldDate'].apply(getValueFromTime, args=(series,))
    return aptDf
    
def interpolateTime(df):
    """Turn a dated series into a gap-free month-start series.

    The ``date`` column is parsed (in place on ``df``) and used as index, the
    data is resampled to month start ('MS') with the mean of each month, and
    missing months are filled by time-based interpolation. The result is
    passed through ``fixChange`` before it is returned.

    Args:
        df: Frame with a ``date`` column; ``fixChange`` additionally reads a
            ``value`` column.

    Returns:
        The frame returned by ``fixChange``.
    """
    df['date'] = pd.to_datetime(df['date'])
    monthly = df.set_index('date').resample('MS').mean()
    return fixChange(monthly.interpolate(method='time'))

def getValueFromTime(datetime, dataDf):
    """Return the monthly ``value`` from ``dataDf`` that applies to ``datetime``.

    The date is snapped to the first day of its month (``interpolateTime``
    resamples its output to month start) and looked up in the index. If that
    month is missing, the following month is tried. If both are missing -
    typically because the date lies outside the recorded range - the value of
    the nearest recorded month is used and a notice is printed.

    The previous implementation returned ``None`` on every miss because the
    fallback branch never performed a lookup or returned anything.

    Args:
        datetime: Anything ``pd.to_datetime`` accepts (e.g. 'YYYY-MM-DD').
        dataDf: Non-empty frame with a sorted, unique DatetimeIndex and a
            ``value`` column.

    Returns:
        The looked-up entry of the ``value`` column.
    """
    monthStart = pd.to_datetime(datetime).normalize().replace(day=1)

    # DateOffset rolls December over into January of the next year.
    for candidate in (monthStart, monthStart + pd.DateOffset(months=1)):
        try:
            return dataDf.loc[candidate, 'value']
        except KeyError:
            continue

    # Fall back to the closest month on record instead of silently returning
    # None, which would surface later as NaN features.
    nearest = dataDf.index.get_indexer([monthStart], method='nearest')[0]
    print(f'No value for {monthStart.date()}, using the one for {dataDf.index[nearest].date()}')
    return dataDf['value'].iloc[nearest]

def fixChange(df):
    """Add a ``change`` column with the row-to-row difference of ``value``.

    The first row has no predecessor, so its change is set to 0 (as is any
    other NaN difference).

    Args:
        df: Frame with a ``value`` column; modified in place.

    Returns:
        The same ``df`` object.
    """
    df['change'] = df['value'].diff().fillna(0)
    return df

def cleanAddress(x):
    """Normalise a street name before it is sent to the geocoder.

    Removes every '-' and the invisible characters zero width space (U+200B),
    non-breaking space (U+00A0), non-breaking hyphen (U+2011), soft hyphen
    (U+00AD) and zero width non-joiner (U+200C), then strips leading and
    trailing whitespace.

    The previous version called ``x.strip()`` without using its result, so
    surrounding whitespace was never removed.

    Args:
        x: Street name as a string.

    Returns:
        The cleaned street name.
    """
    # One deletion table instead of a chain of replace() calls.
    removed = str.maketrans('', '', '-\u200b\u00a0\u2011\xad\u200c')
    return x.translate(removed).strip()

class AddressNotFound(Exception):
    """Raised by getCoordinatesFromAddress when the geocoder returns no match."""

def getCoordinatesFromAddress(streetName, number):
    """Geocode ``"<streetName> <number>, Stockholm"`` with a Nominatim server.

    A new ``Nominatim`` client for the hard-coded HTTP host below is created
    on every call. A street number of 0 means "no number": only the street
    name is sent.

    Args:
        streetName: Street name, used verbatim in the query.
        number: Street number; converted with ``str(int(float(number)))``,
            so e.g. ``12.0`` becomes ``'12'``.

    Returns:
        ``(lat, lon)`` rounded to 6 decimals (accuracy of <1 meter).

    Raises:
        AddressNotFound: If the geocoder returns no result.
    """
    serverHost = '165.227.162.37'
    serverPort = '8080'
    geocoder = Nominatim(
        user_agent='[email protected]',
        domain=serverHost + ':' + serverPort,
        scheme='http',
        timeout=10,
    )

    houseNumber = str(int(float(number)))
    if houseNumber == '0':
        query = f'{streetName}, Stockholm'
    else:
        query = f'{streetName} {houseNumber}, Stockholm'

    match = geocoder.geocode(query)
    if match is None:
        raise AddressNotFound

    return round(match.latitude, 6), round(match.longitude, 6)

def dateToFloat(date):
    """Convert a 'YYYY-MM-DD[ ...]' date into a fractional number.

    The result is ``year + month / 12 + day / 365``; anything after the first
    space in the day part (e.g. a time of day) is ignored.

    Args:
        date: Value whose ``str()`` has the form 'YYYY-MM-DD', optionally
            followed by a space and more text.

    Returns:
        The date as a float.
    """
    yearText, monthText, dayText = str(date).split('-')
    dayOfMonth = dayText.partition(' ')[0]
    return int(yearText) + int(monthText) / 12 + int(dayOfMonth) / 365

def normalize(x, minVal, maxVal, feature):
    """Scale ``x`` linearly from ``[minVal, maxVal]`` to ``[0, 1]``, clamped.

    Args:
        x: Value to scale; converted with ``float``.
        minVal: Value that maps to 0.
        maxVal: Value that maps to 1.
        feature: Feature name; not used by the computation.

    Returns:
        The scaled value, or 0 / 1 when it falls outside the range.
    """
    scaled = (float(x) - minVal) / (maxVal - minVal)
    if scaled < 0:
        return 0
    if scaled > 1:
        return 1
    return scaled

def normalizeData(df):
    """Scale every column listed in ``featureToMinMax`` to the range 0..1.

    ``soldDate`` is first converted to a fractional year with ``dateToFloat``;
    every column is then scaled and clamped with ``normalize``.

    Args:
        df: Frame containing all columns named in ``featureToMinMax``;
            modified in place.

    Returns:
        The same ``df`` object.
    """
    print('Normalizing data...')
    for feature, (lower, upper) in tqdm(featureToMinMax.items()):
        if feature == 'soldDate':
            df[feature] = df[feature].apply(dateToFloat)
        df[feature] = df[feature].apply(normalize, args=(lower, upper, feature))
    return df

def parsePrice(price):
    """Turn a normalised price into a formatted SEK string.

    The value is mapped linearly from 0..1 onto 150 000 .. 70 000 000 SEK,
    truncated to an integer and formatted with dots as thousands separators.

    Args:
        price: Normalised price; converted with ``float``.

    Returns:
        A string such as ``'3.642.500 SEK'``.
    """
    lowest, highest = 1.5e5, 7e7
    sek = float(price) * (highest - lowest) + lowest
    return f'{addDotsToPrice(int(sek))} SEK'

def addDotsToPrice(price):
    """Format a number with '.' as thousands separator.

    Takes an int like 1000000 and returns a string like '1.000.000'. A
    leading minus sign is never followed by a dot.

    Args:
        price: Value to format; its ``str()`` form is processed character by
            character.

    Returns:
        The formatted string.
    """
    digits = str(price)
    lastIndex = len(digits) - 1
    pieces = []
    for position, char in enumerate(digits):
        pieces.append(char)
        # A dot goes after a character when a multiple of three characters
        # follows it, except after the sign and after the final character.
        remaining = len(digits) - position
        if remaining % 3 == 1 and position != lastIndex and char != '-':
            pieces.append('.')
    return ''.join(pieces)
        
        

def xgbFix(df):
    """Cast the columns to the dtypes used for the XGBoost path.

    ``streetName``, ``brf`` and ``agency`` become ``category``; all other
    model features become ``float``.

    Args:
        df: Frame containing every column listed below; modified in place.

    Returns:
        The same ``df`` object.
    """
    castPlan = (
        (["streetName", "brf", "agency"], "category"),
        (["number", "sqm", "rooms", "monthlyFee",
          "monthlyCost", "floor", "yearBuilt", "gdp", "unemployment",
          "interestRate", "lat", "lon", "soldDate"], float),
    )
    for columns, dtype in castPlan:
        df[columns] = df[columns].astype(dtype)
    return df


# Both models are fetched once, when this module is executed; each call logs in
# to Hopsworks and downloads from the model registry. The results are read as
# module-level globals by xgboostPred() (model) and autoPred() (autoModel).
model = downloadModel()
autoModel = downloadAutogluonModel()

def xgboostPred(df):
    """Predict one value per row with the module-level XGBoost ``model``.

    The categorical columns ``streetName``, ``brf`` and ``agency`` are dropped
    and each remaining row is passed to ``model.predict`` on its own, as a
    ``(1, n_features)`` array.

    Args:
        df: Feature frame; not modified.

    Returns:
        A list with the first element of each per-row prediction.
    """
    numericOnly = df.drop(['streetName', 'brf', 'agency'], axis=1)
    return [
        model.predict(np.asarray(row.to_numpy()).reshape(1, -1))[0]
        for _, row in numericOnly.iterrows()
    ]

def addExtraAgencyFun(df):
    """Append one copy of the first row per entry in ``topAgencies``.

    Each copy is identical to the first row of ``df`` except for its
    ``agency`` column, which takes the agencies in ``topAgencies`` order. The
    copies are appended after the original rows and the index is renumbered.

    The agency column is assigned in one step. The previous per-cell
    ``df2['agency'].iloc[i] = ...`` was a chained assignment, which pandas
    warns about and which does not write through under Copy-on-Write.

    Args:
        df: Non-empty feature frame with an ``agency`` column; not modified.

    Returns:
        A new frame with ``len(df) + len(topAgencies)`` rows.
    """
    agencyRows = pd.DataFrame([df.iloc[0]] * len(topAgencies))
    # List assignment is positional, so the repeated index label is harmless.
    agencyRows['agency'] = topAgencies
    return pd.concat([df, agencyRows], ignore_index=True)

def autoPred(df):
    """Predict prices with AutoGluon and rank the agencies in ``topAgencies``.

    ``addExtraAgencyFun`` appends one synthetic row per agency, so the tail of
    the prediction list holds the per-agency results and the head holds the
    predictions for the caller's own rows. The split point is derived from
    ``len(topAgencies)`` rather than a hard-coded 20, so it stays correct if
    the agency list changes.

    Args:
        df: Normalised feature frame (one row per sold date).

    Returns:
        ``(res, agencyString)``: the predictions for the original rows and a
        text listing the five best agencies, produced by ``parseAgencyResult``.
    """
    df = addExtraAgencyFun(df)
    predictions = autoModel.predict(df).tolist()

    # Separate the caller's rows from the appended per-agency rows.
    splitAt = len(predictions) - len(topAgencies)
    res = predictions[:splitAt]
    agencyResults = predictions[splitAt:]

    agencyToResult = dict(zip(topAgencies, agencyResults))
    for agency, result in agencyToResult.items():
        print(agency, str(result))

    meanPrice = sum(agencyResults) / len(agencyResults)

    # Ascending sort, take the last five, then reverse: best agency first.
    sortedAgencies = sorted(agencyToResult.items(), key=lambda x: x[1])
    top5 = sortedAgencies[-5:]
    top5.reverse()

    agencyString = parseAgencyResult(top5, meanPrice)

    return res, agencyString

def parseAgencyResult(top3, meanPrice):
    """Build the text that lists the best agencies and their advantage.

    Both the agency results and ``meanPrice`` are normalised prices. The
    difference is converted to SEK by scaling with the price range only.
    Passing it to ``parsePrice`` (as before) also added the 150 000 SEK
    minimum price, which made every "above mean" figure wrong.

    Args:
        top3: Sequence of ``(agency, normalisedPrice)`` pairs, best first.
            Despite the name, the caller passes five entries.
        meanPrice: Mean normalised price over all agencies.

    Returns:
        A multi-line string ending with a newline.
    """
    # Width of the SEK range that parsePrice maps 0..1 onto; keep in sync
    # with the bounds in parsePrice.
    PRICE_RANGE = 7e7 - 1.5e5

    lines = [
        'To get the most money for your apartment, you should sell it with the help of one of these agencies:',
        f'Top {len(top3)}:',
    ]
    for agency, result in top3:
        diffSek = int(float(result - meanPrice) * PRICE_RANGE)
        lines.append(f'{agency}: {parsePrice(result)} ({addDotsToPrice(diffSek)} SEK above mean)')

    return '\n'.join(lines) + '\n'

def isValidInput(streetName, number, sqm, rooms, monthlyFee, monthlyCost, floor, yearBuilt):
    """Validate the form input and describe the first problem found.

    The street name must be non-blank and free of digits. Every numeric field
    must be present and lie within its ``featureToMinMax`` bounds (inclusive).

    Compared with the previous version, a whitespace-only street name is now
    reported as empty, and a missing numeric value (``None``) is reported
    instead of raising ``TypeError`` in the range comparison.

    Args:
        streetName: Street name as a string.
        number, sqm, rooms, monthlyFee, monthlyCost, floor, yearBuilt:
            Numeric form values.

    Returns:
        An error message, or ``None`` when all input is valid.
    """
    if streetName is None or streetName.strip() == '':
        return 'Street name is empty'
    # If Street name contains numbers it should fail
    if any(char.isdigit() for char in streetName):
        return 'Only letters are allowed in street name'

    numericValues = {
        'number': number,
        'sqm': sqm,
        'rooms': rooms,
        'monthlyFee': monthlyFee,
        'monthlyCost': monthlyCost,
        'floor': floor,
        'yearBuilt': yearBuilt,
    }
    for name, val in numericValues.items():
        label = featureToName.get(name)
        if val is None:
            return f'{label} is missing'
        lowest, highest = featureToMinMax[name]
        if val < lowest:
            return f'{label} is too low'
        if val > highest:
            return f'{label} is too high'

    return None

def getDates():
    """Return the sale dates to predict for, mapped to a description.

    Keys are 'YYYY-MM-DD' strings. Insertion order is: today, in a year, last
    year, three years ago, and the fixed date 2022-02-24. A "year" is counted
    as 365 days.

    Returns:
        Dict from date string to human-readable explanation.
    """
    today = date.today()
    offsets = (
        (timedelta(days=0), 'today'),
        (timedelta(days=365), 'in a year'),
        (-timedelta(days=365), 'last year'),
        (-timedelta(days=365*3), 'three years ago'),
    )
    dateToExplanation = {
        (today + offset).strftime("%Y-%m-%d"): explanation
        for offset, explanation in offsets
    }
    dateToExplanation['2022-02-24'] = 'before Russia invaded Ukraine'
    return dateToExplanation


def sthlm(streetName, number, sqm, rooms, monthlyFee, monthlyCost, floor, yearBuilt, auto):
    """Gradio callback: predict the apartment price for several sale dates.

    Steps: validate the input, geocode the address, build one feature row per
    date from ``getDates()``, add the financial data, normalise, and predict
    with AutoGluon (``auto`` truthy) or XGBoost.

    Changes from the previous version:
      * The frame is built in one step. ``DataFrame.append`` was removed in
        pandas 2.0, and it also left every row with index 0.
      * Price differences are scaled by the price range only. Passing them to
        ``parsePrice`` added the 150 000 SEK minimum price to every figure.

    Args:
        streetName: Street name as typed by the user.
        number, sqm, rooms, monthlyFee, monthlyCost, floor, yearBuilt:
            Numeric form values.
        auto: Use AutoGluon instead of XGBoost when truthy.

    Returns:
        ``(headline, comparisons, agencyInfo, error)`` as strings. On invalid
        input or an unknown address this is ``('0', '', '', <message>)``.
    """
    inputErrors = isValidInput(streetName, number, sqm, rooms, monthlyFee, monthlyCost, floor, yearBuilt)
    if inputErrors is not None:
        return '0', '', '', inputErrors

    lat, lon = getAddressInfo(streetName, number)
    if lat is None or lon is None:
        return '0', '', '', 'Address not found in the OpenStreetMap dataset (Nominatim), please try another address'

    agency = 'Notar' # Make fun if categorical works
    brf = 'BRF Kartboken 1' # TODO: remove
    dates = getDates()

    # One row per sale date; everything except soldDate is identical.
    rows = [
        [streetName, number, sqm, rooms, soldDate, monthlyFee, monthlyCost, floor, yearBuilt, brf, agency, lat, lon]
        for soldDate in dates
    ]
    input_variables = pd.DataFrame(rows, columns=columnHeaders)

    df = populateApartmentData(input_variables)
    df = normalizeData(df)

    agencyInfo = 'Please use AutoGluon instead of XGBoost to get information about agencies'
    if auto:
        pricePred, agencyInfo = autoPred(df)
    else:
        pricePred = xgboostPred(xgbFix(df))

    # Width of the SEK range that parsePrice maps 0..1 onto; keep in sync
    # with the bounds in parsePrice.
    PRICE_RANGE = 7e7 - 1.5e5

    # The first date is "today"; all other dates are reported relative to it.
    explanations = list(dates.values())
    mainExplanation = explanations[0]
    mainPred = pricePred[0]

    result = []
    for explanation, pred in zip(explanations[1:], pricePred[1:]):
        diff = pred - mainPred
        diffSek = addDotsToPrice(int(float(diff) * PRICE_RANGE))
        if diff > 0:
            result.append(f'If sold {explanation} it would have been worth more: {parsePrice(pred)} (+{diffSek} SEK)')
        else:
            result.append(f'If sold {explanation} it would have been worth less: {parsePrice(pred)} ({diffSek} SEK)')

    return f'Predicted price of the apartment {mainExplanation}: {parsePrice(mainPred)}', '\n'.join(result), agencyInfo, ''



# Numeric features the user fills in. Together with the street-name textbox and
# the AutoGluon checkbox, the order below must match the positional parameters
# of sthlm(): streetName, number, sqm, rooms, monthlyFee, monthlyCost, floor,
# yearBuilt, auto.
# NOTE(review): gr.inputs.* looks like the legacy Gradio component namespace -
# confirm the pinned Gradio version still provides it.
numericalInputs = ['number', 'sqm','rooms', 'monthlyFee','monthlyCost','floor','yearBuilt']
inputs = [gr.inputs.Textbox(lines=1, label='streetName')]



# Generate the input form: one Number field per numeric feature, labelled with
# its display name and the allowed range from featureToMinMax.
for feature in numericalInputs:
    minVal = featureToMinMax[feature][0]
    maxVal = featureToMinMax[feature][1]
    theLabel = f'{featureToName.get(feature)} (min: {minVal}, max: {maxVal})'
    inputs.append(gr.inputs.Number(default=0, label=theLabel))

# Add a switch to choose between xgboost and autogluon (maps to sthlm's `auto`)
inputs.append(gr.inputs.Checkbox( label='Use AutoGluon instead of XGBoost', default=False))

# Create the interface. The four outputs correspond, in order, to the 4-tuple
# returned by sthlm(): headline price, per-date comparison text, agency
# information and error message.
# NOTE(review): gr.outputs.* and Textbox(type='error') depend on the installed
# Gradio version - confirm they are supported.
resultOutputs = [gr.outputs.Label(label='Price if sold today'), gr.outputs.Textbox(label='If sold at a different time'), gr.outputs.Textbox(label='Best agencies to use'), gr.outputs.Textbox(label='Error', type='error')]

demo = gr.Interface(
    fn=sthlm,
    title="Stockholm Housing Valuation",
    description="Predict the price of an apartment in Stockholm\nTo get information about which agency to use, please select AutoGluon",
    allow_flagging="never",
    inputs=inputs,
    outputs=resultOutputs)

# Starts the app as soon as this module is executed (there is no __main__ guard).
demo.launch()