Spaces:
Sleeping
Sleeping
import json | |
import pandas as pd | |
import numpy as np | |
class HeuristicRegressor:
    """Median-lookup regressor over bucketed numeric and categorical columns.

    ``fit`` cuts each numeric column listed in ``buckets`` into intervals,
    groups the training rows by those intervals plus the categorical
    columns, and stores the median of the target for every group.
    ``predict`` returns the stored median of the group a record falls into,
    or the unconditional median of the target when no group can be found.
    """

    def __init__(self, buckets: dict, cat_cols: list, target: str):
        """Store the model configuration.

        Args:
            buckets: Maps a numeric column name to the ``bins`` argument
                passed to ``pandas.cut`` for that column.
            cat_cols: Names of the columns that are grouped on as-is.
            target: Name of the column whose median is predicted.
        """
        self.buckets = buckets
        self.target = target
        self.cat_cols = cat_cols

    def fit(self, data: pd.DataFrame) -> None:
        """Compute the per-group medians of the target from ``data``.

        Sets ``self.target_medians`` (one row per group, with
        ``<col>_bucket_left`` / ``<col>_bucket_right`` interval edges for
        every bucketed column) and ``self.unconditional_median``.

        Args:
            data: Training frame containing the bucketed columns, the
                categorical columns and the target column.
        """
        # Work on a copy so the helper "<col>_bucket" columns never leak
        # into the caller's DataFrame.
        frame = data.copy()

        bucketed_cols = []
        for bucket_col, bin_edges in self.buckets.items():
            bucket_name = f"{bucket_col}_bucket"
            frame[bucket_name] = pd.cut(frame[bucket_col], bin_edges)
            bucketed_cols.append(bucket_name)

        # observed=False is spelled out because the implicit default is
        # changing in pandas; it keeps combinations without any rows in the
        # result (with a NaN median), which the fillna below relies on.
        medians = (
            frame.groupby([*bucketed_cols, *self.cat_cols], observed=False)
            .agg({self.target: "median"})
            .reset_index()
        )

        self.unconditional_median = data[self.target].median()
        # Groups without any training rows fall back to the overall median.
        medians[self.target] = medians[self.target].fillna(
            value=self.unconditional_median
        )

        # Replace the Interval objects by plain float edges so the table can
        # be compared against scalars and serialized to JSON.
        for bucketed_col in bucketed_cols:
            intervals = medians[bucketed_col]
            medians[f"{bucketed_col}_left"] = intervals.apply(
                lambda interval: interval.left
            ).astype(float)
            medians[f"{bucketed_col}_right"] = intervals.apply(
                lambda interval: interval.right
            ).astype(float)

        self.target_medians = medians.drop(columns=bucketed_cols)

    def predict(self, value_dict: "dict | pd.Series"):
        """Return the stored median for the group ``value_dict`` falls into.

        Args:
            value_dict: A single record, indexable by column name, holding
                one scalar value for every bucketed and categorical column
                (for example a ``dict`` or one row of a DataFrame).

        Returns:
            The median of the first matching group. When the lookup fails
            for any ordinary reason (missing key, value outside every
            bucket, unseen category, ...) the unconditional median of the
            target is returned instead.
        """
        try:
            # Start from an all-True boolean Series rather than a plain list:
            # pandas has deprecated ``list & Series``, and once that raises
            # the fallback below would silently swallow every prediction.
            mask = pd.Series(True, index=self.target_medians.index)

            for bucket_col in self.buckets:
                value = value_dict[bucket_col]
                # Buckets are left-open / right-closed, like pandas.cut's
                # default intervals.
                above_left = self.target_medians[f"{bucket_col}_bucket_left"] < value
                within_right = self.target_medians[f"{bucket_col}_bucket_right"] >= value
                mask = mask & above_left & within_right

            for cat_col in self.cat_cols:
                mask = mask & (self.target_medians[cat_col] == value_dict[cat_col])

            return self.target_medians.loc[mask, self.target].values[0]
        except Exception:
            # Deliberate best-effort fallback; unlike a bare ``except`` this
            # lets KeyboardInterrupt and SystemExit propagate.
            return self.unconditional_median

    def to_json(self) -> str:
        """Serialize the configuration and the fitted table to a JSON string.

        The result can be turned back into a model with ``from_json``.
        """
        return json.dumps(
            {
                "buckets": self.buckets,
                "target": self.target,
                "cat_cols": self.cat_cols,
                "unconditional_median": self.unconditional_median,
                "target_medians_dict": self.target_medians.to_dict(),
            }
        )

    @classmethod
    def from_json(cls, json_: str) -> "HeuristicRegressor":
        """Rebuild a fitted model from a string produced by ``to_json``.

        Args:
            json_: JSON document with the keys written by ``to_json``.

        Returns:
            A new instance whose ``target_medians`` and
            ``unconditional_median`` are restored from the document.
        """
        package = json.loads(json_)
        model = cls(
            buckets=package["buckets"],
            cat_cols=package["cat_cols"],
            target=package["target"],
        )
        model.target_medians = pd.DataFrame.from_dict(package["target_medians_dict"])
        model.unconditional_median = package["unconditional_median"]
        return model