Spaces:
Sleeping
Sleeping
# Build all weather data from file
def build_weather_data(filename):
    """Load raw station weather readings, localize them to the NYC service
    area, aggregate to one row per day, and append imputed future values.

    Parameters
    ----------
    filename : str
        Path to a weather CSV containing at least Year/Month/Day,
        Latitude/Longitude, station id columns, and the measurement
        columns aggregated below.

    Returns
    -------
    pandas.DataFrame
        Daily weather frame indexed by ``Datetime``: observed days plus
        56 imputed days (49 missing days + a 7-day "forecast" past
        12/31/2018) with temporal/seasonal features attached.
    """
    # Use pandas to read file
    weather_data = pd.read_csv(filename)

    # Aggregate Year, Month, Day into a single datetime column because
    # the 311 service data is keyed on datetime.
    weather_data["Datetime"] = (
        weather_data["Year"].astype("str")
        + "-"
        + weather_data["Month"].astype("str")
        + "-"
        + weather_data["Day"].astype("str")
    )
    weather_data = create_datetime(weather_data, "Datetime", format="%Y-%m-%d")

    # LOCALIZE
    # Pre-recorded min/max bounds taken from the service data (saved here
    # so we don't need to recompute them from that dataset again).
    lat_min = 40.49804421521046
    lat_max = 40.91294056699566
    long_min = -74.25521082506387
    long_max = -73.70038354802529

    # Keep only stations that fall inside the service data's bounding box.
    mincon_lat = weather_data["Latitude"] >= lat_min
    maxcon_lat = weather_data["Latitude"] <= lat_max
    mincon_long = weather_data["Longitude"] >= long_min
    maxcon_long = weather_data["Longitude"] <= long_max
    wd_localized = weather_data.loc[mincon_lat & maxcon_lat & mincon_long & maxcon_long]

    # Station-identity and location columns are no longer needed once
    # the data is localized and will be aggregated away anyway.
    drop_cols = [
        "USAF",
        "WBAN",
        "StationName",
        "State",
        "Latitude",
        "Longitude",
    ]
    wd_localized = wd_localized.drop(columns=drop_cols)

    # AGGREGATE
    # Map measurement columns to their per-day aggregation method.
    # NOTE: 'Percipitation' is the column's actual (misspelled) name in
    # the source file — do not "fix" it here.
    mean_cols = [
        'MeanTemp',
        'DewPoint',
        'Percipitation',
        'WindSpeed',
        'Gust',
        'SnowDepth',
    ]
    min_cols = [
        'MinTemp'
    ]
    max_cols = [
        'MaxTemp',
        'MaxSustainedWind'
    ]
    # Binary indicator columns: average across stations, then round back
    # to a 0/1 integer flag.
    round_cols = [
        'Rain',
        'SnowIce'
    ]

    # Perform aggregation — group once and reuse for every method.
    by_day = wd_localized.groupby("Datetime")
    mean_df = by_day[mean_cols].mean()
    min_df = by_day[min_cols].min()
    max_df = by_day[max_cols].max()
    round_df = by_day[round_cols].mean().round().astype(np.int8)
    wd_full = pd.concat([mean_df, min_df, max_df, round_df], axis=1)

    # Add seasonal/temporal features and restore the datetime index.
    wd_full = build_temporal_features(wd_full, "Datetime")
    wd_full["Season"] = wd_full["Season"].astype("category")
    wd_full = wd_full.set_index("Datetime")

    # Impute the 49 missing days plus the next 7 days after 12/31/2018;
    # the extra week acts as our "weather forecast".
    time_steps = 49 + 7

    # Columns that get imputed values.
    impute_cols = [
        'MeanTemp', 'MinTemp', 'MaxTemp', 'DewPoint',
        'Percipitation', 'WindSpeed', 'MaxSustainedWind',
        'Gust', 'Rain', 'SnowDepth', 'SnowIce',
    ]
    # Per-column imputation strategy groupings.
    mean_vars = ["WindSpeed", "MaxSustainedWind", "Gust", "SnowDepth"]
    min_vars = ["SnowIce", "MeanTemp", "MinTemp", "MaxTemp", "DewPoint", "Percipitation"]
    max_vars = ["Rain"]

    # Use the imported helper to create the imputed rows, one strategy
    # per column group, then stitch the groups back together.
    preds_mean = impute_missing_weather(wd_full, strategy="mean", time_steps=time_steps, impute_cols=mean_vars)
    preds_min = impute_missing_weather(wd_full, strategy="min", time_steps=time_steps, impute_cols=min_vars)
    preds_max = impute_missing_weather(wd_full, strategy="max", time_steps=time_steps, impute_cols=max_vars)
    all_preds = pd.concat([preds_mean, preds_min, preds_max], axis=1)
    all_preds = build_temporal_features(all_preds.loc[:, impute_cols], "Datetime")
    all_preds = all_preds.set_index("Datetime")

    # Append imputed rows after the observed rows.
    wd_df = pd.concat([wd_full, all_preds], axis=0, join="outer")
    return wd_df