File size: 7,777 Bytes
2f38c9c
eae8f9c
bed9614
58834e8
eae8f9c
05cf610
6a4fa2c
1adfa85
bed9614
7d4300a
2f38c9c
 
 
 
 
 
 
 
 
 
7d4300a
2f38c9c
 
 
 
7d4300a
10ff16a
8e088d6
6146f6b
 
8e088d6
6146f6b
 
 
 
2f38c9c
7d4300a
 
 
 
 
 
 
 
 
 
2f38c9c
7d4300a
 
ddb4d52
d85c1a5
7d4300a
6a4fa2c
 
 
 
 
5af6354
6a4fa2c
 
5af6354
7d4300a
 
 
 
 
 
 
 
4c67c21
d85c1a5
7d4300a
6a4fa2c
 
7d4300a
 
6a4fa2c
7d4300a
 
6a4fa2c
58834e8
a232b56
 
58834e8
 
a232b56
 
 
 
58834e8
7d4300a
58834e8
faa83d3
8cfda07
5750d1a
 
 
00122b5
5750d1a
 
 
ffd9cd1
 
 
5750d1a
 
 
 
 
00122b5
 
5750d1a
ffd9cd1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
561e614
ffd9cd1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1adfa85
 
 
7d4300a
 
 
 
 
 
 
1adfa85
7d4300a
 
 
1adfa85
 
7d4300a
 
 
 
 
 
 
 
1adfa85
 
7d4300a
 
1adfa85
 
7d4300a
 
1adfa85
 
 
7d4300a
a626763
51a6b05
97e6589
 
 
51a6b05
97e6589
51a6b05
 
 
7d4300a
97e6589
 
 
 
 
7d4300a
 
5fac847
7d4300a
 
5af6354
7d4300a
 
c96b30c
ef7a292
7d4300a
97e6589
7d4300a
 
eae8f9c
 
717bfae
 
eae8f9c
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
import unittest
from unittest.mock import patch
import numpy as np
from pysr import pysr, get_hof, best, best_tex, best_callable, best_row, PySRRegressor
from pysr.sr import run_feature_selection, _handle_feature_selection, _yesno
import sympy
from sympy import lambdify
import pandas as pd


class TestPipeline(unittest.TestCase):
    """End-to-end tests that run the `pysr` search on small synthetic datasets."""

    def setUp(self):
        """Create the shared search kwargs and a seeded 100x5 Gaussian input."""
        self.default_test_kwargs = dict(
            niterations=10,
            populations=4,
            annealing=True,
            useFrequency=False,
        )
        # Seed before drawing X so every test starts from the same inputs.
        np.random.seed(0)
        self.X = np.random.randn(100, 5)

    def test_linear_relation(self):
        """The last reported equation for y = X[:, 0] has MSE <= 1e-4."""
        y = self.X[:, 0]
        equations = pysr(self.X, y, **self.default_test_kwargs)
        print(equations)
        self.assertLessEqual(equations.iloc[-1]["MSE"], 1e-4)

    def test_multiprocessing(self):
        """Same target as the linear test, with procs=2 and multithreading off."""
        y = self.X[:, 0]
        equations = pysr(
            self.X, y, **self.default_test_kwargs, procs=2, multithreading=False
        )
        print(equations)
        self.assertLessEqual(equations.iloc[-1]["MSE"], 1e-4)

    def test_multioutput_custom_operator(self):
        """Two-column target with a user-defined `sq` operator and procs=0."""
        y = self.X[:, [0, 1]] ** 2
        equations = pysr(
            self.X,
            y,
            unary_operators=["sq(x) = x^2"],
            binary_operators=["plus"],
            extra_sympy_mappings={"sq": lambda x: x ** 2},
            **self.default_test_kwargs,
            procs=0,
        )
        print(equations)
        # One result table per output column is indexed here.
        self.assertLessEqual(equations[0].iloc[-1]["MSE"], 1e-4)
        self.assertLessEqual(equations[1].iloc[-1]["MSE"], 1e-4)

    def test_multioutput_weighted_with_callable_temp_equation(self):
        """Weighted two-column fit; checks the callables from `best_callable()`."""
        y = self.X[:, [0, 1]] ** 2
        # Binarize the weights: every entry becomes either 0.0 or 1.0.
        w = np.random.rand(*y.shape)
        w[w < 0.5] = 0.0
        w[w >= 0.5] = 1.0

        # Double equation when weights are 0:
        y = (2 - w) * y
        # Thus, pysr needs to use the weights to find the right equation!

        pysr(
            self.X,
            y,
            weights=w,
            unary_operators=["sq(x) = x^2"],
            binary_operators=["plus"],
            extra_sympy_mappings={"sq": lambda x: x ** 2},
            **self.default_test_kwargs,
            procs=0,
            temp_equation_file=True,
            delete_tempfiles=False,
        )

        # Fetch the callables once and reuse them for both output columns.
        callables = best_callable()
        np.testing.assert_almost_equal(
            callables[0](self.X), self.X[:, 0] ** 2, decimal=4
        )
        np.testing.assert_almost_equal(
            callables[1](self.X), self.X[:, 1] ** 2, decimal=4
        )

    def test_empty_operators_single_input_sklearn(self):
        """`PySRRegressor` with no unary operators fits y = x + 3 on one feature."""
        X = np.random.randn(100, 1)
        y = X[:, 0] + 3.0
        regressor = PySRRegressor(
            model_selection="accuracy",
            unary_operators=[],
            binary_operators=["plus"],
            **self.default_test_kwargs,
        )
        regressor.fit(X, y)

        self.assertLessEqual(regressor.equations.iloc[-1]["MSE"], 1e-4)
        np.testing.assert_almost_equal(regressor.predict(X), y, decimal=1)

    def test_noisy(self):
        """Noisy two-column target with denoise=True and string-valued operators."""
        np.random.seed(1)
        # The (n, 1) noise column broadcasts across both output columns.
        y = self.X[:, [0, 1]] ** 2 + np.random.randn(self.X.shape[0], 1) * 0.05
        equations = pysr(
            self.X,
            y,
            # Test that passing a single operator works:
            unary_operators="sq(x) = x^2",
            binary_operators="plus",
            extra_sympy_mappings={"sq": lambda x: x ** 2},
            **self.default_test_kwargs,
            procs=0,
            denoise=True,
        )
        # Look up the best rows once and reuse them for both output columns.
        best_rows = best_row(equations=equations)
        self.assertLessEqual(best_rows[0]["MSE"], 1e-2)
        self.assertLessEqual(best_rows[1]["MSE"], 1e-2)

    def test_pandas_resample(self):
        """DataFrame input with `Xresampled`, denoising and k=2 feature selection.

        The resampled frame and the final evaluation frame both list their
        columns in a different order from the training frame.
        """
        np.random.seed(1)
        X = pd.DataFrame(
            {
                "T": np.random.randn(500),
                "x": np.random.randn(500),
                "unused_feature": np.random.randn(500),
            }
        )

        def true_fn(x):
            """Ground-truth relation used to build and to check the target."""
            return np.array(x["T"] + x["x"] ** 2 + 1.323837)

        y = true_fn(X)
        noise = np.random.randn(500) * 0.01
        y = y + noise
        # Resampled array is a different order of features:
        Xresampled = pd.DataFrame(
            {
                "unused_feature": np.random.randn(100),
                "x": np.random.randn(100),
                "T": np.random.randn(100),
            }
        )
        equations = pysr(
            X,
            y,
            unary_operators=[],
            binary_operators=["+", "*", "/", "-"],
            **self.default_test_kwargs,
            Xresampled=Xresampled,
            denoise=True,
            select_k_features=2,
        )
        # Render the best equation once and check it against every column name.
        tex = best_tex()
        self.assertNotIn("unused_feature", tex)
        self.assertIn("T", tex)
        self.assertIn("x", tex)
        self.assertLessEqual(equations.iloc[-1]["MSE"], 1e-2)
        fn = best_callable()
        # `sorted` already returns a list, so no extra `list(...)` is needed.
        self.assertListEqual(sorted(fn._selection), [0, 1])
        X2 = pd.DataFrame(
            {
                "T": np.random.randn(100),
                "unused_feature": np.random.randn(100),
                "x": np.random.randn(100),
            }
        )
        self.assertLess(np.average((fn(X2) - true_fn(X2)) ** 2), 1e-2)


class TestBest(unittest.TestCase):
    """Tests for `best`, `best_tex` and `best_callable` on a hand-written table."""

    def setUp(self):
        """Write a three-row equation table to disk and load it with `get_hof`.

        The table is written to ``equation_file.csv.bkup`` and removed again
        when the test finishes, so no file is left in the working directory.
        """
        import os  # local import: only needed for the cleanup registered below

        equations = pd.DataFrame(
            {
                "Equation": ["1.0", "cos(x0)", "square(cos(x0))"],
                "MSE": [1.0, 0.1, 1e-5],
                "Complexity": [1, 2, 3],
            }
        )

        backup_path = "equation_file.csv.bkup"
        equations["Complexity MSE Equation".split(" ")].to_csv(backup_path, sep="|")

        def _remove_backup():
            # Best-effort removal: a file that is already gone is not an error.
            try:
                os.remove(backup_path)
            except FileNotFoundError:
                pass

        # Registered before `get_hof` runs so the file is removed even if
        # loading fails; cleanups run after the test method has finished.
        self.addCleanup(_remove_backup)

        # NOTE(review): the file above carries a ".bkup" suffix that the name
        # passed here does not - presumably `get_hof` appends it; confirm.
        # NOTE(review): `variables_names` differs from the `variable_names`
        # spelling used elsewhere in this file - confirm which one `get_hof`
        # actually accepts.
        self.equations = get_hof(
            "equation_file.csv",
            n_features=2,
            variables_names="x0 x1".split(" "),
            extra_sympy_mappings={},
            output_jax_format=False,
            multioutput=False,
            nout=1,
        )

    def test_best(self):
        """`best` returns cos(x0)**2 with and without an explicit table."""
        self.assertEqual(best(self.equations), sympy.cos(sympy.Symbol("x0")) ** 2)
        self.assertEqual(best(), sympy.cos(sympy.Symbol("x0")) ** 2)

    def test_best_tex(self):
        """`best_tex` returns the LaTeX for cos^2(x_0) with and without a table."""
        self.assertEqual(best_tex(self.equations), "\\cos^{2}{\\left(x_{0} \\right)}")
        self.assertEqual(best_tex(), "\\cos^{2}{\\left(x_{0} \\right)}")

    def test_best_lambda(self):
        """Callables from `best_callable` reproduce cos(x0)**2 on random input."""
        X = np.random.randn(10, 2)
        y = np.cos(X[:, 0]) ** 2
        for f in [best_callable(), best_callable(self.equations)]:
            np.testing.assert_almost_equal(f(X), y, decimal=4)


class TestFeatureSelection(unittest.TestCase):
    """Tests for the feature-selection helpers imported from `pysr.sr`."""

    def setUp(self):
        """Seed NumPy's global RNG so the generated data is reproducible."""
        np.random.seed(0)

    @staticmethod
    def _make_dataset():
        """Return (X, y) where y depends only on columns 2 and 3 of X."""
        features = np.random.randn(20000, 5)
        target = features[:, 2] ** 2 + features[:, 3] ** 2
        return features, target

    def test_feature_selection(self):
        """`run_feature_selection` with k=2 picks exactly columns 2 and 3."""
        features, target = self._make_dataset()
        chosen = run_feature_selection(features, target, select_k_features=2)
        self.assertEqual(sorted(chosen), [2, 3])

    def test_feature_selection_handler(self):
        """`_handle_feature_selection` returns the matching columns and indices."""
        features, target = self._make_dataset()
        names = [f"x{i}" for i in range(5)]
        reduced, picked = _handle_feature_selection(
            features,
            select_k_features=2,
            variable_names=names,
            y=target,
        )
        # Both informative column indices must be part of the selection.
        self.assertIn(2, picked)
        self.assertIn(3, picked)
        picked_names = {names[idx] for idx in picked}
        self.assertEqual(picked_names, set("x2 x3".split(" ")))
        # Sort within each row so the comparison ignores column order.
        expected = np.sort(features[:, [2, 3]], axis=1)
        np.testing.assert_array_equal(np.sort(reduced, axis=1), expected)


class TestHelperFunctions(unittest.TestCase):
    """Tests for small helper functions imported from `pysr.sr`."""

    @patch("builtins.input", side_effect=["y", "n"])
    def test_yesno(self, mock_input):
        """`_yesno` returns True for a "y" answer and False for an "n" answer."""
        # The patched `input` yields "y" first and "n" second, so the expected
        # results are listed in that same order.
        for expected in (True, False):
            self.assertEqual(_yesno("Test"), expected)