Spaces:
Sleeping
Sleeping
Create heuristic.py
Browse files- heuristic.py +60 -0
heuristic.py
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
|
5 |
+
|
6 |
+
class HeuristicRegressor:
    """Median-lookup regressor over bucketed numeric and categorical features.

    Numeric features are binned with ``pd.cut`` using the edges supplied in
    ``buckets``; the target median is then computed for every combination of
    bins and categorical values.  Prediction looks up the median of the
    matching combination and falls back to the overall target median when no
    combination matches.
    """

    def __init__(self, buckets: dict, cat_cols: list, target: str):
        """Store the model configuration.

        Args:
            buckets: Maps a numeric column name to the bin edges passed to
                ``pd.cut`` for that column.
            cat_cols: Column names that are grouped on by exact value.
            target: Name of the column whose median is predicted.
        """
        self.buckets = buckets
        self.target = target
        self.cat_cols = cat_cols

    def fit(self, data):
        """Compute the per-group target medians from ``data``.

        Sets ``self.target_medians`` (one row per group, with
        ``<col>_bucket_left`` / ``<col>_bucket_right`` bounds for each
        bucketed column) and ``self.unconditional_median``.

        Args:
            data: DataFrame containing the bucketed columns, the categorical
                columns and the target column.  It is not modified.
        """
        cat_cols = list(self.cat_cols)
        bucketed_cols = [f"{bucket_col}_bucket" for bucket_col in self.buckets]

        # Work on a private frame so the caller's DataFrame does not gain
        # the temporary "<col>_bucket" columns.
        frame = data[cat_cols + [self.target]].copy()
        for bucket_col, edges in self.buckets.items():
            frame[f"{bucket_col}_bucket"] = pd.cut(data[bucket_col], edges)

        # observed=False keeps rows for bucket/category combinations that
        # never occur in the data; their NaN medians are filled below.
        medians = (
            frame.groupby(bucketed_cols + cat_cols, observed=False)
            .agg({self.target: "median"})
            .reset_index()
        )

        self.unconditional_median = data[self.target].median()
        medians[self.target] = medians[self.target].fillna(
            value=self.unconditional_median
        )

        # Replace each Interval column by plain float bounds so the table
        # can be compared against scalars and serialized to JSON.
        for bucketed_col in bucketed_cols:
            intervals = medians[bucketed_col]
            medians[f"{bucketed_col}_left"] = intervals.apply(
                lambda interval: interval.left
            ).astype(float)
            medians[f"{bucketed_col}_right"] = intervals.apply(
                lambda interval: interval.right
            ).astype(float)

        self.target_medians = medians.drop(columns=bucketed_cols)

    def predict(self, value_dict: "dict | pd.Series"):
        """Return the stored median for the group that ``value_dict`` falls in.

        Args:
            value_dict: A single observation, indexable by every bucketed
                and categorical column name (for example a dict).

        Returns:
            The median of the matching group.  ``self.unconditional_median``
            is returned instead when a required key is missing, a value
            cannot be compared, or no group matches.
        """
        try:
            mask = np.ones(len(self.target_medians), dtype=bool)

            for bucket_col in self.buckets:
                value_for_col = value_dict[bucket_col]
                left = self.target_medians[f"{bucket_col}_bucket_left"]
                right = self.target_medians[f"{bucket_col}_bucket_right"]
                # Buckets are right-closed: left < value <= right.
                mask &= ((left < value_for_col) & (right >= value_for_col)).to_numpy()

            for cat_col in self.cat_cols:
                value_for_col = value_dict[cat_col]
                mask &= (self.target_medians[cat_col] == value_for_col).to_numpy()

            matches = self.target_medians.loc[mask, self.target]
        except (KeyError, TypeError, ValueError):
            # Missing feature or a value that cannot be compared: best-effort
            # fallback rather than an error.
            return self.unconditional_median

        if matches.empty:
            return self.unconditional_median
        return matches.iloc[0]

    def to_json(self):
        """Serialize the configuration and fitted medians to a JSON string."""
        return json.dumps(
            {
                "buckets": self.buckets,
                "target": self.target,
                "cat_cols": self.cat_cols,
                "unconditional_median": self.unconditional_median,
                "target_medians_dict": self.target_medians.to_dict(),
            }
        )

    @classmethod
    def from_json(cls, json_):
        """Rebuild a fitted model from a string produced by ``to_json``."""
        json_package = json.loads(json_)
        model = cls(
            buckets=json_package["buckets"],
            cat_cols=json_package["cat_cols"],
            target=json_package["target"],
        )
        model.target_medians = pd.DataFrame.from_dict(
            json_package["target_medians_dict"]
        )
        model.unconditional_median = json_package["unconditional_median"]
        return model
|