"""Gradio app that estimates Stockholm apartment prices with an XGBoost model.

The model is downloaded from the Hopsworks model registry at import time.
User input is geocoded through a Nominatim server, normalized with the
min/max table below and fed to the model once per reference date.
"""

import os
from datetime import date
from datetime import timedelta

import gradio as gr
import hopsworks
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import xgboost as xgb
from geopy.geocoders import Nominatim
from PIL import Image
from tqdm import tqdm

# TODO: Remove brf
# Column layout of one model input row:
# area;streetName;number;sqm;rooms;soldDate;monthlyFee;monthlyCost;floor;
# yearBuilt;brf;agency;lat;lon;gdp;unemployment;interestRate
columnHeaders = ['area', 'streetName', 'number', 'sqm', 'rooms', 'soldDate',
                 'monthlyFee', 'monthlyCost', 'floor', 'yearBuilt', 'brf',
                 'agency', 'lat', 'lon', 'gdp', 'unemployment', 'interestRate']

# (min, max) bounds per numerical feature, extracted from the data.
featureToMinMax = {
    'sqm': (10, 800),
    'rooms': (1, 20),
    'monthlyFee': (0, 60000),
    'monthlyCost': (0, 20000),
    'floor': (-3, 35),
    'yearBuilt': (1850, 2023),
    'lat': (58.8, 60.2),
    'lon': (17.5, 19.1),
    'gdp': (505.1, 630.14),
    'unemployment': (6.36, 8.66),
    'interestRate': (-0.5, 2.64),
    'number': (0, 300),
    'soldDate': (2010, 2025),
}

# Characters deleted from street names: ASCII hyphen, zero-width space,
# non-breaking space, non-breaking hyphen, soft hyphen, zero-width non-joiner.
_ADDRESS_NOISE = str.maketrans('', '', '-\u200b\u00a0\u2011\xad\u200c')


def downloadModel():
    """Log in to Hopsworks and return the saved XGBoost model (version 5).

    The API key is taken from the ``HOPSWORKS_API_KEY`` environment variable
    instead of being embedded in the source. When the variable is unset,
    ``None`` is passed and ``hopsworks.login`` applies its own fallback.
    """
    # SECURITY: a key used to be hard-coded here; since it was committed to
    # source control it should be treated as leaked and rotated.
    project = hopsworks.login(
        api_key_value=os.environ.get('HOPSWORKS_API_KEY'))
    mr = project.get_model_registry()
    registered = mr.get_model("xgboost_model", version=5)
    model_path = registered.download()
    # NOTE(review): joblib.load unpickles the file, so the registry content
    # must be trusted.
    return joblib.load(model_path + "/xgboost_model.pkl")


def getAddressInfo(streetName, number):
    """Return ``(lat, lon)`` for the address, or ``(None, None)`` if unknown.

    Only ``AddressNotFound`` is translated into ``(None, None)``; any other
    error raised by the geocoder propagates to the caller.
    """
    streetName = cleanAddress(streetName)
    try:
        return getCoordinatesFromAddress(streetName, number)
    except AddressNotFound:
        return None, None


def cleanAddress(x):
    """Strip hyphens and invisible characters from a street name.

    Surrounding whitespace is trimmed as well. The original code called
    ``x.strip()`` but discarded the result, so trimming never happened.
    """
    return x.translate(_ADDRESS_NOISE).strip()


class AddressNotFound(Exception):
    """Raised when the geocoder returns no match for an address."""


def getCoordinatesFromAddress(streetName, number):
    """Geocode ``'<streetName> <number>, Stockholm'`` via Nominatim.

    A street number of 0 means "no number" and is left out of the query.

    Returns:
        ``(lat, lon)`` rounded to 6 decimals.

    Raises:
        AddressNotFound: the geocoder returned no location.
        ValueError: ``number`` cannot be converted to a number.
    """
    HOST_ADDRESS = '165.227.162.37'
    HOST_PORT = '8080'
    EMAIL = 'nathan.allard@gmail.com'
    DOMAIN = HOST_ADDRESS + ':' + HOST_PORT
    LOCATOR = Nominatim(user_agent=EMAIL, domain=DOMAIN, scheme='http',
                        timeout=10)

    # The UI delivers the number as a float; drop the fractional part.
    number = str(int(float(number)))
    if number == '0':
        address = f'{streetName}, Stockholm'
    else:
        address = f'{streetName} {number}, Stockholm'

    location = LOCATOR.geocode(address)
    if location is None:
        raise AddressNotFound

    # Return with a precision of 6 decimals (accuracy of <1 meter)
    return round(location.latitude, 6), round(location.longitude, 6)


def getFinancialInfo(date):
    """Return ``(gdp, unemployment, interestRate)`` for ``date``.

    Currently a stub: the argument is ignored and fixed values are returned.
    """
    return 600.0, 7.0, 0


def dateToFloat(date):
    """Convert a ``YYYY-MM-DD[ hh:mm:ss]`` value into a fractional year."""
    year, month, day = str(date).split('-')
    # Discard a trailing time component such as ' 00:00:00'.
    day = day.split(' ')[0]
    return int(year) + int(month) / 12 + int(day) / 365


def normalize(x, minVal, maxVal, feature):
    """Min-max scale ``x`` into [0, 1], clamping values outside the bounds.

    ``feature`` is accepted for call compatibility but is not used.
    """
    # Not fantastic
    res = (float(x) - minVal) / (maxVal - minVal)
    return min(max(res, 0), 1)


def normalizeData(df):
    """Normalize every column listed in ``featureToMinMax`` to [0, 1].

    We do this manually because we want the UI to be able to transform the
    input data the same way. ``df`` is modified in place and also returned.
    """
    print('Normalizing data...')
    for feature, (lower, upper) in tqdm(featureToMinMax.items()):
        if feature == 'soldDate':
            # Dates arrive as strings and must become numbers first.
            df[feature] = df[feature].apply(dateToFloat)
        df[feature] = df[feature].apply(normalize,
                                        args=(lower, upper, feature))
    return df


def parsePrice(price):
    """Turn a normalized model output into a ``'<int> SEK'`` string."""
    featureToMinMaxPrice = {
        'price': (1.5e5, 7e7)
    }
    MIN, MAX = featureToMinMaxPrice['price']
    # NOTE(review): the inverse of the min-max scaling used by normalize()
    # would be price * (MAX - MIN) + MIN. Kept as-is because the training
    # pipeline is not visible here; confirm how the target was scaled.
    price = price * MAX + MIN
    return f'{str(int(price))} SEK'


def xgbFix(df):
    """Cast text columns to ``category`` and numeric columns to ``float``.

    ``df`` is modified in place and also returned.
    """
    features_to_categorical = ["area", "streetName", "brf", "agency"]
    features_to_float = ["number", "sqm", "rooms", "monthlyFee",
                         "monthlyCost", "floor", "yearBuilt", "gdp",
                         "unemployment", "interestRate", "lat", "lon",
                         "soldDate"]
    df[features_to_categorical] = (
        df[features_to_categorical].astype("category"))
    df[features_to_float] = df[features_to_float].astype(float)
    return df


# Loaded once at import time so every request reuses the same model.
model = downloadModel()


def xgboostPred(df, explanation=None):
    """Predict one price per row of ``df``.

    The categorical columns are dropped before the remaining columns are
    passed to the model as a 2-D array. ``explanation`` is unused; it
    is optional because the only caller never supplied it (the original
    required parameter made that call raise ``TypeError``).

    The original predicted the first row only, although the caller expects
    one prediction per reference date.
    """
    # Drop categorical features
    numeric = df.drop(['area', 'streetName', 'brf', 'agency'], axis=1)
    return model.predict(numeric.to_numpy())


def autoPred():
    """Placeholder; not implemented."""
    pass


def getDates():
    """Map reference dates (``YYYY-MM-DD``) to a human-readable label."""
    today = date.today()
    inAMonth = today + timedelta(days=30)
    inAYear = today + timedelta(days=365)
    lastYear = today - timedelta(days=365)
    beforeUkraineWar = '2022-02-24'
    threeYearsAgo = today - timedelta(days=365*3)

    dateToExplanation = {
        today.strftime("%Y-%m-%d"): 'today',
        inAMonth.strftime("%Y-%m-%d"): 'in a month',
        inAYear.strftime("%Y-%m-%d"): 'in a year',
        lastYear.strftime("%Y-%m-%d"): 'last year',
        threeYearsAgo.strftime("%Y-%m-%d"): 'three years ago',
        beforeUkraineWar: 'before Russia invaded Ukraine',
    }
    return dateToExplanation


def sthlm(streetName, area, number, sqm, rooms, monthlyFee, monthlyCost,
          floor, yearBuilt):
    """Gradio callback: predict the apartment price at each reference date.

    Returns:
        ``(predictions, error)``, two strings matching the two text outputs
        of the interface. Exactly one of them is non-empty.
    """
    lat, lon = getAddressInfo(streetName, number)
    if lat is None or lon is None:
        return '', 'Address not found in the OpenStreetMap dataset (Nominatim), please try another address'

    # Placeholder values for the two columns the UI does not collect; both
    # columns are dropped again before prediction.
    agency = 'Notar'
    brf = 'BRF Kartboken 1'  # TODO: remove

    dates = getDates()

    # One input row per reference date. The rows are collected first and the
    # frame is built once: DataFrame.append no longer exists in pandas 2.x.
    rows = []
    for soldDate in dates:
        gdp, unemployment, interestRate = getFinancialInfo(soldDate)
        rows.append([area, streetName, number, sqm, rooms, soldDate,
                     monthlyFee, monthlyCost, floor, yearBuilt, brf, agency,
                     lat, lon, gdp, unemployment, interestRate])
    input_variables = pd.DataFrame(rows, columns=columnHeaders)

    df = normalizeData(input_variables)
    df = xgbFix(df)
    pricePred = xgboostPred(df)

    # dict views are not subscriptable, so pair labels and predictions
    # positionally instead of indexing dates.values().
    result = [
        f'Predicted price of the apartment {explanation}: {parsePrice(pred)}'
        for explanation, pred in zip(dates.values(), pricePred)
    ]
    return '\n'.join(result), ''


# All features present in the sthlm dataset
numericalInputs = ['number', 'sqm', 'rooms', 'monthlyFee', 'monthlyCost',
                   'floor', 'yearBuilt']
categoricalInputs = ['area']

inputs = [gr.inputs.Textbox(lines=1, label='streetName')]

# Dropdown choices, keyed by the categorical feature they belong to.
catToInput = {
    'area': ['Bromma', 'Abrahamsberg', 'Akalla']
}

# Generate the input form. The order must match the parameters of sthlm():
# streetName, then the categorical inputs, then the numerical inputs.
for feature in categoricalInputs:
    choices = catToInput[feature]
    # Default to a real choice; the previous default "a" was not selectable.
    inputs.append(gr.inputs.Dropdown(
        choices=choices, default=choices[0], label=feature))

for feature in numericalInputs:
    minVal, maxVal = featureToMinMax[feature]
    theLabel = f'{feature} (min: {minVal}, max: {maxVal})'
    inputs.append(gr.inputs.Number(default=0, label=theLabel))

# Create the interface
demo = gr.Interface(
    fn=sthlm,
    title="Stockholm Housing Valuation",
    description="Predict the price of an apartment in Stockholm",
    allow_flagging="never",
    inputs=inputs,
    outputs=['text', 'text'])

demo.launch()