File size: 2,923 Bytes
7567aee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import json
import pandas as pd
import numpy as np


class HeuristicRegressor:
    """Predict a target as the median of its (bucket, category) group.

    Numeric columns listed in ``buckets`` are discretised with ``pd.cut``;
    together with the columns in ``cat_cols`` they form the grouping key.
    ``fit`` stores one median per key, and ``predict`` looks up the row whose
    intervals contain the given values, falling back to the overall median
    of the target when no row applies.
    """

    def __init__(self, buckets: dict, cat_cols: list, target: str):
        """Store the model configuration.

        Args:
            buckets: Maps a numeric column name to the ``bins`` argument
                passed to ``pd.cut`` for that column.
            cat_cols: Column names that are grouped on as-is.
            target: Name of the column whose median is predicted.
        """
        self.buckets = buckets
        self.target = target
        self.cat_cols = cat_cols

    def fit(self, data):
        """Compute the per-group medians from ``data``.

        Sets ``self.target_medians`` (one row per group, holding the
        categorical values, the median target and ``<col>_bucket_left`` /
        ``<col>_bucket_right`` interval bounds) and
        ``self.unconditional_median``. ``data`` itself is not modified.

        Args:
            data: DataFrame containing the bucket, categorical and target
                columns.
        """
        bucket_columns = {
            f"{bucket_col}_bucket": pd.cut(data[bucket_col], edges)
            for bucket_col, edges in self.buckets.items()
        }
        bucketed_cols = list(bucket_columns)

        # ``assign`` returns a new frame, so the caller's DataFrame does not
        # gain the temporary ``*_bucket`` columns.
        frame = data.assign(**bucket_columns)

        # observed=False is spelled out so that every bucket/category
        # combination gets a row regardless of the pandas default; the
        # combinations without data are filled with the overall median below.
        medians = (
            frame.groupby(bucketed_cols + list(self.cat_cols), observed=False)
            .agg({self.target: "median"})
            .reset_index()
        )

        self.unconditional_median = data[self.target].median()
        medians[self.target] = medians[self.target].fillna(
            value=self.unconditional_median
        )

        # Replace each Interval column by two plain float columns so the
        # table can be compared against scalars and serialised to JSON.
        for bucketed_col in bucketed_cols:
            intervals = medians[bucketed_col]
            medians[f"{bucketed_col}_left"] = intervals.apply(
                lambda interval: interval.left
            ).astype(float)
            medians[f"{bucketed_col}_right"] = intervals.apply(
                lambda interval: interval.right
            ).astype(float)

        self.target_medians = medians.drop(columns=bucketed_cols)

    def predict(self, value_dict: dict):
        """Return the stored median for the group matching ``value_dict``.

        A bucket row matches when ``left < value <= right``; a categorical
        column matches on equality.

        Args:
            value_dict: Mapping-like object indexed by column name (for
                example a dict or a single DataFrame row) that provides one
                value for every bucket and categorical column.

        Returns:
            The median of the first matching row, or
            ``self.unconditional_median`` when a required key is missing, a
            value cannot be compared, or no row matches.
        """
        table = self.target_medians
        try:
            # Start from an all-True NumPy mask; combining a plain Python
            # list with a Series through ``&`` is deprecated in pandas.
            mask = np.ones(len(table), dtype=bool)

            for bucket_col in self.buckets:
                value = value_dict[bucket_col]
                lower = table[f"{bucket_col}_bucket_left"]
                upper = table[f"{bucket_col}_bucket_right"]
                mask &= ((lower < value) & (upper >= value)).to_numpy(dtype=bool)

            for cat_col in self.cat_cols:
                value = value_dict[cat_col]
                mask &= (table[cat_col] == value).to_numpy(dtype=bool)
        except (KeyError, IndexError, TypeError, ValueError):
            # Best effort: unusable input yields the overall median, but
            # KeyboardInterrupt/SystemExit and genuine bugs still propagate.
            return self.unconditional_median

        matches = table.loc[mask, self.target]
        if matches.empty:
            return self.unconditional_median
        return matches.to_numpy()[0]

    @staticmethod
    def _json_default(obj):
        """Convert NumPy arrays and scalars to JSON-serialisable values."""
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.generic):
            return obj.item()
        raise TypeError(
            f"Object of type {type(obj).__name__} is not JSON serializable"
        )

    def to_json(self):
        """Serialise the configuration and the fitted table to a JSON string.

        NumPy arrays and scalars (for instance bucket edges built with
        NumPy) are converted to plain lists and numbers.

        Returns:
            A JSON document with the keys ``buckets``, ``target``,
            ``cat_cols``, ``unconditional_median`` and
            ``target_medians_dict``.
        """
        return json.dumps(
            {
                "buckets": self.buckets,
                "target": self.target,
                "cat_cols": self.cat_cols,
                "unconditional_median": self.unconditional_median,
                "target_medians_dict": self.target_medians.to_dict(),
            },
            default=self._json_default,
        )

    @classmethod
    def from_json(cls, json_):
        """Rebuild a fitted model from the output of ``to_json``.

        Args:
            json_: JSON string produced by ``to_json``.

        Returns:
            A new instance with ``target_medians`` and
            ``unconditional_median`` restored.
        """
        package = json.loads(json_)
        model = cls(
            buckets=package["buckets"],
            cat_cols=package["cat_cols"],
            target=package["target"],
        )
        # JSON object keys are strings, so the row labels come back as
        # "0", "1", ...; reset them to a positional integer index.
        model.target_medians = pd.DataFrame.from_dict(
            package["target_medians_dict"]
        ).reset_index(drop=True)
        model.unconditional_median = package["unconditional_median"]
        return model