Spaces:
Sleeping
Sleeping
MilesCranmer
committed on
Commit
•
af14165
1
Parent(s):
a47d265
Update parts of test to use ScikitLearn interface
Browse files
- pysr/__init__.py +1 -2
- pysr/sr.py +48 -19
- test/test.py +33 -35
pysr/__init__.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
from .sr import (
|
2 |
pysr,
|
3 |
-
|
4 |
best,
|
5 |
best_tex,
|
6 |
best_callable,
|
@@ -11,4 +11,3 @@ from .sr import (
|
|
11 |
from .feynman_problems import Problem, FeynmanProblem
|
12 |
from .export_jax import sympy2jax
|
13 |
from .export_torch import sympy2torch
|
14 |
-
from .sklearn import PySRRegressor
|
|
|
1 |
from .sr import (
|
2 |
pysr,
|
3 |
+
PySRRegressor,
|
4 |
best,
|
5 |
best_tex,
|
6 |
best_callable,
|
|
|
11 |
from .feynman_problems import Problem, FeynmanProblem
|
12 |
from .export_jax import sympy2jax
|
13 |
from .export_torch import sympy2torch
|
|
pysr/sr.py
CHANGED
@@ -179,24 +179,35 @@ def run_feature_selection(X, y, select_k_features):
|
|
179 |
return selector.get_support(indices=True)
|
180 |
|
181 |
|
182 |
-
|
183 |
def _escape_filename(filename):
|
184 |
"""Turns a file into a string representation with correctly escaped backslashes"""
|
185 |
str_repr = str(filename)
|
186 |
str_repr = str_repr.replace("\\", "\\\\")
|
187 |
return str_repr
|
188 |
|
|
|
189 |
def best(*args, **kwargs):
|
190 |
-
raise NotImplementedError(
|
|
|
|
|
|
|
191 |
|
192 |
def best_row(*args, **kwargs):
|
193 |
-
raise NotImplementedError(
|
|
|
|
|
|
|
194 |
|
195 |
def best_tex(*args, **kwargs):
|
196 |
-
raise NotImplementedError(
|
|
|
|
|
|
|
197 |
|
198 |
def best_callable(*args, **kwargs):
|
199 |
-
raise NotImplementedError(
|
|
|
|
|
200 |
|
201 |
|
202 |
def _denoise(X, y, Xresampled=None):
|
@@ -647,7 +658,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
|
|
647 |
"nout",
|
648 |
"selection",
|
649 |
"variable_names",
|
650 |
-
"julia_project"
|
651 |
]
|
652 |
|
653 |
def __repr__(self):
|
@@ -668,9 +679,9 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
|
|
668 |
dict(
|
669 |
pick=selected,
|
670 |
score=equations["score"],
|
671 |
-
|
672 |
-
|
673 |
-
|
674 |
)
|
675 |
)
|
676 |
output += repr_equations.__repr__()
|
@@ -1036,15 +1047,33 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
|
|
1036 |
|
1037 |
try:
|
1038 |
if self.multioutput:
|
1039 |
-
all_outputs = [
|
1040 |
-
|
|
|
1041 |
str(self.equation_file) + f".out{i}" + ".bkup",
|
1042 |
sep="|",
|
1043 |
)
|
1044 |
-
|
1045 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1046 |
else:
|
1047 |
all_outputs = [pd.read_csv(str(self.equation_file) + ".bkup", sep="|")]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1048 |
except FileNotFoundError:
|
1049 |
raise RuntimeError(
|
1050 |
"Couldn't find equation file! The equation search likely exited before a single iteration completed."
|
@@ -1079,7 +1108,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
|
|
1079 |
]
|
1080 |
|
1081 |
for _, eqn_row in output.iterrows():
|
1082 |
-
eqn = sympify(eqn_row["Equation"], locals=local_sympy_mappings)
|
1083 |
sympy_format.append(eqn)
|
1084 |
|
1085 |
# Numpy:
|
@@ -1113,8 +1142,8 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
|
|
1113 |
)
|
1114 |
torch_format.append(module)
|
1115 |
|
1116 |
-
curMSE = eqn_row["MSE"]
|
1117 |
-
curComplexity = eqn_row["Complexity"]
|
1118 |
|
1119 |
if lastMSE is None:
|
1120 |
cur_score = 0.0
|
@@ -1134,10 +1163,10 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
|
|
1134 |
output["sympy_format"] = sympy_format
|
1135 |
output["lambda_format"] = lambda_format
|
1136 |
output_cols = [
|
1137 |
-
"Complexity",
|
1138 |
-
"MSE",
|
1139 |
"score",
|
1140 |
-
"Equation",
|
1141 |
"sympy_format",
|
1142 |
"lambda_format",
|
1143 |
]
|
|
|
179 |
return selector.get_support(indices=True)
|
180 |
|
181 |
|
|
|
182 |
def _escape_filename(filename):
|
183 |
"""Turns a file into a string representation with correctly escaped backslashes"""
|
184 |
str_repr = str(filename)
|
185 |
str_repr = str_repr.replace("\\", "\\\\")
|
186 |
return str_repr
|
187 |
|
188 |
+
|
189 |
def best(*args, **kwargs):
|
190 |
+
raise NotImplementedError(
|
191 |
+
"`best` has been deprecated. Please use the `PySRRegressor` interface. After fitting, you can return `.sympy()` to get the sympy representation of the best equation."
|
192 |
+
)
|
193 |
+
|
194 |
|
195 |
def best_row(*args, **kwargs):
|
196 |
+
raise NotImplementedError(
|
197 |
+
"`best_row` has been deprecated. Please use the `PySRRegressor` interface. After fitting, you can run `print(model)` to view the best equation."
|
198 |
+
)
|
199 |
+
|
200 |
|
201 |
def best_tex(*args, **kwargs):
|
202 |
+
raise NotImplementedError(
|
203 |
+
"`best_tex` has been deprecated. Please use the `PySRRegressor` interface. After fitting, you can return `.latex()` to get the sympy representation of the best equation."
|
204 |
+
)
|
205 |
+
|
206 |
|
207 |
def best_callable(*args, **kwargs):
|
208 |
+
raise NotImplementedError(
|
209 |
+
"`best_callable` has been deprecated. Please use the `PySRRegressor` interface. After fitting, you can use `.predict(X)` to use the best callable."
|
210 |
+
)
|
211 |
|
212 |
|
213 |
def _denoise(X, y, Xresampled=None):
|
|
|
658 |
"nout",
|
659 |
"selection",
|
660 |
"variable_names",
|
661 |
+
"julia_project",
|
662 |
]
|
663 |
|
664 |
def __repr__(self):
|
|
|
679 |
dict(
|
680 |
pick=selected,
|
681 |
score=equations["score"],
|
682 |
+
equation=equations["equation"],
|
683 |
+
loss=equations["loss"],
|
684 |
+
complexity=equations["complexity"],
|
685 |
)
|
686 |
)
|
687 |
output += repr_equations.__repr__()
|
|
|
1047 |
|
1048 |
try:
|
1049 |
if self.multioutput:
|
1050 |
+
all_outputs = []
|
1051 |
+
for i in range(1, self.nout + 1):
|
1052 |
+
df = pd.read_csv(
|
1053 |
str(self.equation_file) + f".out{i}" + ".bkup",
|
1054 |
sep="|",
|
1055 |
)
|
1056 |
+
# Rename Complexity column to complexity:
|
1057 |
+
df.rename(
|
1058 |
+
columns={
|
1059 |
+
"Complexity": "complexity",
|
1060 |
+
"MSE": "loss",
|
1061 |
+
"Equation": "equation",
|
1062 |
+
},
|
1063 |
+
inplace=True,
|
1064 |
+
)
|
1065 |
+
|
1066 |
+
all_outputs.append(df)
|
1067 |
else:
|
1068 |
all_outputs = [pd.read_csv(str(self.equation_file) + ".bkup", sep="|")]
|
1069 |
+
all_outputs[-1].rename(
|
1070 |
+
columns={
|
1071 |
+
"Complexity": "complexity",
|
1072 |
+
"MSE": "loss",
|
1073 |
+
"Equation": "equation",
|
1074 |
+
},
|
1075 |
+
inplace=True,
|
1076 |
+
)
|
1077 |
except FileNotFoundError:
|
1078 |
raise RuntimeError(
|
1079 |
"Couldn't find equation file! The equation search likely exited before a single iteration completed."
|
|
|
1108 |
]
|
1109 |
|
1110 |
for _, eqn_row in output.iterrows():
|
1111 |
+
eqn = sympify(eqn_row["equation"], locals=local_sympy_mappings)
|
1112 |
sympy_format.append(eqn)
|
1113 |
|
1114 |
# Numpy:
|
|
|
1142 |
)
|
1143 |
torch_format.append(module)
|
1144 |
|
1145 |
+
curMSE = eqn_row["loss"]
|
1146 |
+
curComplexity = eqn_row["complexity"]
|
1147 |
|
1148 |
if lastMSE is None:
|
1149 |
cur_score = 0.0
|
|
|
1163 |
output["sympy_format"] = sympy_format
|
1164 |
output["lambda_format"] = lambda_format
|
1165 |
output_cols = [
|
1166 |
+
"complexity",
|
1167 |
+
"loss",
|
1168 |
"score",
|
1169 |
+
"equation",
|
1170 |
"sympy_format",
|
1171 |
"lambda_format",
|
1172 |
]
|
test/test.py
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
import unittest
|
2 |
from unittest.mock import patch
|
3 |
import numpy as np
|
4 |
-
from pysr import
|
5 |
-
from pysr.sr import run_feature_selection, _handle_feature_selection
|
6 |
import sympy
|
7 |
from sympy import lambdify
|
8 |
import pandas as pd
|
@@ -21,32 +21,33 @@ class TestPipeline(unittest.TestCase):
|
|
21 |
|
22 |
def test_linear_relation(self):
|
23 |
y = self.X[:, 0]
|
24 |
-
|
25 |
-
|
26 |
-
|
|
|
|
|
27 |
|
28 |
def test_multiprocessing(self):
|
29 |
y = self.X[:, 0]
|
30 |
-
|
31 |
-
|
32 |
-
)
|
33 |
-
|
34 |
-
self.assertLessEqual(equations.iloc[-1]["MSE"], 1e-4)
|
35 |
|
36 |
def test_multioutput_custom_operator(self):
|
37 |
y = self.X[:, [0, 1]] ** 2
|
38 |
-
|
39 |
-
self.X,
|
40 |
-
y,
|
41 |
unary_operators=["sq(x) = x^2"],
|
42 |
-
binary_operators=["plus"],
|
43 |
extra_sympy_mappings={"sq": lambda x: x ** 2},
|
|
|
44 |
**self.default_test_kwargs,
|
45 |
procs=0,
|
46 |
)
|
|
|
|
|
47 |
print(equations)
|
48 |
-
self.assertLessEqual(equations[0].iloc[-1]["MSE"], 1e-4)
|
49 |
-
self.assertLessEqual(equations[1].iloc[-1]["MSE"], 1e-4)
|
50 |
|
51 |
def test_multioutput_weighted_with_callable_temp_equation(self):
|
52 |
y = self.X[:, [0, 1]] ** 2
|
@@ -58,10 +59,7 @@ class TestPipeline(unittest.TestCase):
|
|
58 |
y = (2 - w) * y
|
59 |
# Thus, pysr needs to use the weights to find the right equation!
|
60 |
|
61 |
-
|
62 |
-
self.X,
|
63 |
-
y,
|
64 |
-
weights=w,
|
65 |
unary_operators=["sq(x) = x^2"],
|
66 |
binary_operators=["plus"],
|
67 |
extra_sympy_mappings={"sq": lambda x: x ** 2},
|
@@ -70,12 +68,13 @@ class TestPipeline(unittest.TestCase):
|
|
70 |
temp_equation_file=True,
|
71 |
delete_tempfiles=False,
|
72 |
)
|
|
|
73 |
|
74 |
np.testing.assert_almost_equal(
|
75 |
-
|
76 |
)
|
77 |
np.testing.assert_almost_equal(
|
78 |
-
|
79 |
)
|
80 |
|
81 |
def test_empty_operators_single_input_sklearn(self):
|
@@ -108,9 +107,7 @@ class TestPipeline(unittest.TestCase):
|
|
108 |
|
109 |
np.random.seed(1)
|
110 |
y = self.X[:, [0, 1]] ** 2 + np.random.randn(self.X.shape[0], 1) * 0.05
|
111 |
-
|
112 |
-
self.X,
|
113 |
-
y,
|
114 |
# Test that passing a single operator works:
|
115 |
unary_operators="sq(x) = x^2",
|
116 |
binary_operators="plus",
|
@@ -119,8 +116,9 @@ class TestPipeline(unittest.TestCase):
|
|
119 |
procs=0,
|
120 |
denoise=True,
|
121 |
)
|
122 |
-
self.
|
123 |
-
self.assertLessEqual(
|
|
|
124 |
|
125 |
def test_pandas_resample(self):
|
126 |
np.random.seed(1)
|
@@ -143,9 +141,7 @@ class TestPipeline(unittest.TestCase):
|
|
143 |
"T": np.random.randn(100),
|
144 |
}
|
145 |
)
|
146 |
-
|
147 |
-
X,
|
148 |
-
y,
|
149 |
unary_operators=[],
|
150 |
binary_operators=["+", "*", "/", "-"],
|
151 |
**self.default_test_kwargs,
|
@@ -153,11 +149,12 @@ class TestPipeline(unittest.TestCase):
|
|
153 |
denoise=True,
|
154 |
select_k_features=2,
|
155 |
)
|
156 |
-
|
157 |
-
self.
|
158 |
-
self.assertIn("
|
159 |
-
self.
|
160 |
-
|
|
|
161 |
self.assertListEqual(list(sorted(fn._selection)), [0, 1])
|
162 |
X2 = pd.DataFrame(
|
163 |
{
|
@@ -167,6 +164,7 @@ class TestPipeline(unittest.TestCase):
|
|
167 |
}
|
168 |
)
|
169 |
self.assertLess(np.average((fn(X2) - true_fn(X2)) ** 2), 1e-2)
|
|
|
170 |
|
171 |
|
172 |
class TestBest(unittest.TestCase):
|
|
|
1 |
import unittest
|
2 |
from unittest.mock import patch
|
3 |
import numpy as np
|
4 |
+
from pysr import PySRRegressor
|
5 |
+
from pysr.sr import run_feature_selection, _handle_feature_selection
|
6 |
import sympy
|
7 |
from sympy import lambdify
|
8 |
import pandas as pd
|
|
|
21 |
|
22 |
def test_linear_relation(self):
|
23 |
y = self.X[:, 0]
|
24 |
+
model = PySRRegressor(**self.default_test_kwargs)
|
25 |
+
model.fit(self.X, y)
|
26 |
+
model.set_params(model_selection="accuracy")
|
27 |
+
print(model.equations)
|
28 |
+
self.assertLessEqual(model.get_best()["loss"], 1e-4)
|
29 |
|
30 |
def test_multiprocessing(self):
|
31 |
y = self.X[:, 0]
|
32 |
+
model = PySRRegressor(**self.default_test_kwargs, procs=2, multithreading=False)
|
33 |
+
model.fit(self.X, y)
|
34 |
+
print(model.equations)
|
35 |
+
self.assertLessEqual(model.equations.iloc[-1]["loss"], 1e-4)
|
|
|
36 |
|
37 |
def test_multioutput_custom_operator(self):
|
38 |
y = self.X[:, [0, 1]] ** 2
|
39 |
+
model = PySRRegressor(
|
|
|
|
|
40 |
unary_operators=["sq(x) = x^2"],
|
|
|
41 |
extra_sympy_mappings={"sq": lambda x: x ** 2},
|
42 |
+
binary_operators=["plus"],
|
43 |
**self.default_test_kwargs,
|
44 |
procs=0,
|
45 |
)
|
46 |
+
model.fit(self.X, y)
|
47 |
+
equations = model.equations
|
48 |
print(equations)
|
49 |
+
self.assertLessEqual(equations[0].iloc[-1]["loss"], 1e-4)
|
50 |
+
self.assertLessEqual(equations[1].iloc[-1]["loss"], 1e-4)
|
51 |
|
52 |
def test_multioutput_weighted_with_callable_temp_equation(self):
|
53 |
y = self.X[:, [0, 1]] ** 2
|
|
|
59 |
y = (2 - w) * y
|
60 |
# Thus, pysr needs to use the weights to find the right equation!
|
61 |
|
62 |
+
model = PySRRegressor(
|
|
|
|
|
|
|
63 |
unary_operators=["sq(x) = x^2"],
|
64 |
binary_operators=["plus"],
|
65 |
extra_sympy_mappings={"sq": lambda x: x ** 2},
|
|
|
68 |
temp_equation_file=True,
|
69 |
delete_tempfiles=False,
|
70 |
)
|
71 |
+
model.fit(self.X, y, weights=w)
|
72 |
|
73 |
np.testing.assert_almost_equal(
|
74 |
+
model.predict(self.X)[:, 0], self.X[:, 0] ** 2, decimal=4
|
75 |
)
|
76 |
np.testing.assert_almost_equal(
|
77 |
+
model.predict(self.X)[:, 1], self.X[:, 1] ** 2, decimal=4
|
78 |
)
|
79 |
|
80 |
def test_empty_operators_single_input_sklearn(self):
|
|
|
107 |
|
108 |
np.random.seed(1)
|
109 |
y = self.X[:, [0, 1]] ** 2 + np.random.randn(self.X.shape[0], 1) * 0.05
|
110 |
+
model = PySRRegressor(
|
|
|
|
|
111 |
# Test that passing a single operator works:
|
112 |
unary_operators="sq(x) = x^2",
|
113 |
binary_operators="plus",
|
|
|
116 |
procs=0,
|
117 |
denoise=True,
|
118 |
)
|
119 |
+
model.fit(self.X, y)
|
120 |
+
self.assertLessEqual(model.get_best()[1]["loss"], 1e-2)
|
121 |
+
self.assertLessEqual(model.get_best()[1]["loss"], 1e-2)
|
122 |
|
123 |
def test_pandas_resample(self):
|
124 |
np.random.seed(1)
|
|
|
141 |
"T": np.random.randn(100),
|
142 |
}
|
143 |
)
|
144 |
+
model = PySRRegressor(
|
|
|
|
|
145 |
unary_operators=[],
|
146 |
binary_operators=["+", "*", "/", "-"],
|
147 |
**self.default_test_kwargs,
|
|
|
149 |
denoise=True,
|
150 |
select_k_features=2,
|
151 |
)
|
152 |
+
model.fit(X, y)
|
153 |
+
self.assertNotIn("unused_feature", model.latex())
|
154 |
+
self.assertIn("T", model.latex())
|
155 |
+
self.assertIn("x", model.latex())
|
156 |
+
self.assertLessEqual(model.get_best()["loss"], 1e-2)
|
157 |
+
fn = model.get_best()['lambda_format']
|
158 |
self.assertListEqual(list(sorted(fn._selection)), [0, 1])
|
159 |
X2 = pd.DataFrame(
|
160 |
{
|
|
|
164 |
}
|
165 |
)
|
166 |
self.assertLess(np.average((fn(X2) - true_fn(X2)) ** 2), 1e-2)
|
167 |
+
self.assertLess(np.average((model.predict(X2) - true_fn(X2)) ** 2), 1e-2)
|
168 |
|
169 |
|
170 |
class TestBest(unittest.TestCase):
|