File size: 4,459 Bytes
42cd6af
d26d668
0bf77e2
d26d668
cd54791
 
d26d668
cd54791
 
 
 
 
 
 
 
 
 
 
 
 
 
d26d668
0020398
d26d668
 
 
 
 
 
 
 
 
91a0f71
 
d26d668
 
91a0f71
d26d668
9905c12
d26d668
 
 
 
 
 
 
cd54791
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0bf77e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cd54791
 
 
4582e28
0bf77e2
 
cd54791
 
 
 
42cd6af
cd54791
0bf77e2
42cd6af
 
0bf77e2
42cd6af
 
 
 
 
 
 
0bf77e2
42cd6af
 
0bf77e2
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
from pysr import pysr, best_row, get_hof
from sklearn.base import BaseEstimator, RegressorMixin
import inspect
import pandas as pd


class PySRRegressor(BaseEstimator, RegressorMixin):
    """scikit-learn style estimator wrapping the ``pysr.pysr`` search.

    Every keyword argument other than ``model_selection`` is stored in
    ``self.params`` and forwarded to ``pysr`` when :meth:`fit` is called.
    The value returned by ``pysr`` is kept in ``self.equations``.
    """

    def __init__(self, model_selection="accuracy", **params):
        """Initialize settings for pysr.pysr call.

        :param model_selection: How to select a model. Can be 'accuracy' or 'best'. 'best' will optimize a combination of complexity and accuracy.
        :type model_selection: str
        """
        super().__init__()
        self.model_selection = model_selection
        self.params = params

        # Stored equations:
        self.equations = None

    def __repr__(self):
        """Render the stored equations as a table.

        The row picked by the current ``model_selection`` strategy is marked
        with ``>>>>`` in the ``pick`` column.

        :raises NotImplementedError: if ``model_selection`` is neither
            'accuracy' nor 'best'.
        """
        if self.equations is None:
            return "PySRRegressor.equations = None"

        equations = self.equations
        if self.model_selection == "accuracy":
            # Same row that get_best() takes with ``iloc[-1]``.
            chosen_row = -1
        elif self.model_selection == "best":
            # idxmax() returns an index *label*. ``selected`` below is a plain
            # list and needs a row *position*, so drop the index first; the
            # two only coincide for a default RangeIndex.
            chosen_row = equations["score"].reset_index(drop=True).idxmax()
        else:
            raise NotImplementedError

        selected = [""] * len(equations)
        selected[chosen_row] = ">>>>"

        repr_equations = pd.DataFrame(
            dict(
                pick=selected,
                score=equations["score"],
                Equation=equations["Equation"],
                MSE=equations["MSE"],
                Complexity=equations["Complexity"],
            )
        )
        return "PySRRegressor.equations = [\n" + repr(repr_equations) + "\n]"

    def set_params(self, **params):
        """Set parameters for pysr.pysr call or model_selection strategy.

        ``model_selection`` is an attribute of the estimator itself and is
        deliberately kept out of ``self.params``: everything in
        ``self.params`` is forwarded to ``pysr`` by :meth:`fit`.

        :returns: self, so that calls can be chained.
        """
        for key, value in params.items():
            if key == "model_selection":
                self.model_selection = value
            else:
                self.params[key] = value

        return self

    def get_params(self, deep=True):
        """Return the pysr parameters together with ``model_selection``.

        :param deep: Accepted for scikit-learn compatibility; not used.
        :type deep: bool
        :returns: a new dict; modifying it does not affect the estimator.
        """
        del deep
        return {**self.params, "model_selection": self.model_selection}

    def get_best(self):
        """Return the equation selected by ``model_selection``.

        For 'accuracy' this is the last row of ``self.equations``; for 'best'
        the choice is delegated to ``best_row``.

        NOTE: when no equations are stored yet this returns ``0.0`` rather
        than raising, so callers that index the result (e.g. :meth:`predict`)
        fail with a ``TypeError`` on an unfitted model.

        :raises NotImplementedError: if ``model_selection`` is neither
            'accuracy' nor 'best'.
        """
        if self.equations is None:
            return 0.0
        if self.model_selection == "accuracy":
            return self.equations.iloc[-1]
        if self.model_selection == "best":
            return best_row(self.equations)
        raise NotImplementedError

    def fit(self, X, y, weights=None, variable_names=None):
        """Search for equations to fit the dataset.

        :param X: 2D array. Rows are examples, columns are features. If pandas DataFrame, the columns are used for variable names (so make sure they don't contain spaces).
        :type X: np.ndarray/pandas.DataFrame
        :param y: 1D array (rows are examples) or 2D array (rows are examples, columns are outputs). Putting in a 2D array will trigger a search for equations for each feature of y.
        :type y: np.ndarray
        :param weights: Optional. Same shape as y. Each element is how to weight the mean-square-error loss for that particular element of y.
        :type weights: np.ndarray
        :param variable_names: a list of names for the variables, other than "x0", "x1", etc.
        :type variable_names: list
        :returns: self
        """
        if variable_names is None:
            # Fall back to a value given at construction / via set_params.
            variable_names = self.params.get("variable_names")

        # variable_names is passed explicitly, so it must not be repeated in
        # the forwarded keyword arguments.
        forwarded = {k: v for k, v in self.params.items() if k != "variable_names"}
        self.equations = pysr(
            X=X,
            y=y,
            weights=weights,
            variable_names=variable_names,
            **forwarded,
        )
        return self

    def predict(self, X):
        """Evaluate the selected equation's ``lambda_format`` callable on X."""
        np_format = self.get_best()["lambda_format"]
        return np_format(X)

    def sympy(self):
        """Return the ``sympy_format`` entry of the selected equation."""
        return self.get_best()["sympy_format"]

    def jax(self):
        """Return the ``jax_format`` entry of the selected equation.

        Replaces ``self.equations`` with ``get_hof(output_jax_format=True)``
        before selecting.
        """
        self.equations = get_hof(output_jax_format=True)
        return self.get_best()["jax_format"]

    def pytorch(self):
        """Return the ``torch_format`` entry of the selected equation.

        Replaces ``self.equations`` with ``get_hof(output_torch_format=True)``
        before selecting.
        """
        self.equations = get_hof(output_torch_format=True)
        return self.get_best()["torch_format"]


# Add the docs from pysr() to PySRRegressor():
# inspect.getdoc() returns None when pysr() carries no docstring (for example
# when Python runs with -OO, which strips docstrings), so fall back to an empty
# string instead of failing at import time.
_pysr_docstring_split = []
_start_recording = False
for line in (inspect.getdoc(pysr) or "").split("\n"):
    # Skip docs on "X" and "y"
    if ":param binary_operators:" in line:
        _start_recording = True
    # Stop before the return-value section; it describes pysr(), not __init__.
    if ":returns:" in line:
        _start_recording = False
    if _start_recording:
        _pysr_docstring_split.append(line)
_pysr_docstring = "\n\t".join(_pysr_docstring_split)

# __init__.__doc__ is likewise None when docstrings have been stripped.
PySRRegressor.__init__.__doc__ = (
    PySRRegressor.__init__.__doc__ or ""
) + _pysr_docstring