MilesCranmer committed
Commit: af14165
Parent: a47d265

Update parts of test to use ScikitLearn interface

Files changed (3):
  1. pysr/__init__.py +1 -2
  2. pysr/sr.py +48 -19
  3. test/test.py +33 -35
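
In short, the tests move from the functional `pysr(X, y, ...)` call to the scikit-learn-style `PySRRegressor` estimator. A minimal sketch of the new usage exercised by the updated tests (the toy data and operator choices below are illustrative, mirroring the test file rather than library defaults):

    import numpy as np
    from pysr import PySRRegressor

    X = np.random.randn(100, 5)
    y = X[:, 0] ** 2  # single-output target

    # Configuration moves into the constructor; fitting follows the sklearn pattern.
    model = PySRRegressor(
        unary_operators=["sq(x) = x^2"],                # custom Julia operator, as in the tests
        binary_operators=["plus"],
        extra_sympy_mappings={"sq": lambda x: x ** 2},  # sympy counterpart for export
    )
    model.fit(X, y)

    print(model.equations)     # pandas DataFrame of discovered equations
    print(model.get_best())    # best row under the configured model_selection
    y_pred = model.predict(X)  # evaluate the selected equation on new data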
pysr/__init__.py CHANGED
@@ -1,6 +1,6 @@
 from .sr import (
     pysr,
-    get_hof,
+    PySRRegressor,
     best,
     best_tex,
     best_callable,
@@ -11,4 +11,3 @@ from .sr import (
 from .feynman_problems import Problem, FeynmanProblem
 from .export_jax import sympy2jax
 from .export_torch import sympy2torch
-from .sklearn import PySRRegressor
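
The practical effect is only on where the class is defined: `PySRRegressor` is now imported from `pysr.sr` rather than a separate `pysr.sklearn` module, and `get_hof` is no longer exported. A quick check, assuming the post-commit package layout:

    from pysr import PySRRegressor   # still importable from the package root
    # from pysr import get_hof       # removed by this commit; no longer exported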
 
pysr/sr.py CHANGED
@@ -179,24 +179,35 @@ def run_feature_selection(X, y, select_k_features):
     return selector.get_support(indices=True)
 
 
-
 def _escape_filename(filename):
     """Turns a file into a string representation with correctly escaped backslashes"""
     str_repr = str(filename)
     str_repr = str_repr.replace("\\", "\\\\")
     return str_repr
 
+
 def best(*args, **kwargs):
-    raise NotImplementedError("`best` has been deprecated. Please use the `PySRRegressor` interface. After fitting, you can return `.sympy()` to get the sympy representation of the best equation.")
+    raise NotImplementedError(
+        "`best` has been deprecated. Please use the `PySRRegressor` interface. After fitting, you can return `.sympy()` to get the sympy representation of the best equation."
+    )
+
 
 def best_row(*args, **kwargs):
-    raise NotImplementedError("`best_row` has been deprecated. Please use the `PySRRegressor` interface. After fitting, you can run `print(model)` to view the best equation.")
+    raise NotImplementedError(
+        "`best_row` has been deprecated. Please use the `PySRRegressor` interface. After fitting, you can run `print(model)` to view the best equation."
+    )
+
 
 def best_tex(*args, **kwargs):
-    raise NotImplementedError("`best_tex` has been deprecated. Please use the `PySRRegressor` interface. After fitting, you can return `.latex()` to get the sympy representation of the best equation.")
+    raise NotImplementedError(
+        "`best_tex` has been deprecated. Please use the `PySRRegressor` interface. After fitting, you can return `.latex()` to get the sympy representation of the best equation."
+    )
+
 
 def best_callable(*args, **kwargs):
-    raise NotImplementedError("`best_callable` has been deprecated. Please use the `PySRRegressor` interface. After fitting, you can use `.predict(X)` to use the best callable.")
+    raise NotImplementedError(
+        "`best_callable` has been deprecated. Please use the `PySRRegressor` interface. After fitting, you can use `.predict(X)` to use the best callable."
+    )
 
 
 def _denoise(X, y, Xresampled=None):
@@ -647,7 +658,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         "nout",
         "selection",
         "variable_names",
-        "julia_project"
+        "julia_project",
     ]
 
     def __repr__(self):
@@ -668,9 +679,9 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
             dict(
                 pick=selected,
                 score=equations["score"],
-                Equation=equations["Equation"],
-                MSE=equations["MSE"],
-                Complexity=equations["Complexity"],
+                equation=equations["equation"],
+                loss=equations["loss"],
+                complexity=equations["complexity"],
             )
         )
         output += repr_equations.__repr__()
@@ -1036,15 +1047,33 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
 
         try:
             if self.multioutput:
-                all_outputs = [
-                    pd.read_csv(
+                all_outputs = []
+                for i in range(1, self.nout + 1):
+                    df = pd.read_csv(
                         str(self.equation_file) + f".out{i}" + ".bkup",
                         sep="|",
                     )
-                    for i in range(1, self.nout + 1)
-                ]
+                    # Rename Complexity column to complexity:
+                    df.rename(
+                        columns={
+                            "Complexity": "complexity",
+                            "MSE": "loss",
+                            "Equation": "equation",
+                        },
+                        inplace=True,
+                    )
+
+                    all_outputs.append(df)
             else:
                 all_outputs = [pd.read_csv(str(self.equation_file) + ".bkup", sep="|")]
+                all_outputs[-1].rename(
+                    columns={
+                        "Complexity": "complexity",
+                        "MSE": "loss",
+                        "Equation": "equation",
+                    },
+                    inplace=True,
+                )
         except FileNotFoundError:
             raise RuntimeError(
                 "Couldn't find equation file! The equation search likely exited before a single iteration completed."
@@ -1079,7 +1108,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         ]
 
         for _, eqn_row in output.iterrows():
-            eqn = sympify(eqn_row["Equation"], locals=local_sympy_mappings)
+            eqn = sympify(eqn_row["equation"], locals=local_sympy_mappings)
             sympy_format.append(eqn)
 
             # Numpy:
@@ -1113,8 +1142,8 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
                 )
                 torch_format.append(module)
 
-            curMSE = eqn_row["MSE"]
-            curComplexity = eqn_row["Complexity"]
+            curMSE = eqn_row["loss"]
+            curComplexity = eqn_row["complexity"]
 
             if lastMSE is None:
                 cur_score = 0.0
@@ -1134,10 +1163,10 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         output["sympy_format"] = sympy_format
         output["lambda_format"] = lambda_format
         output_cols = [
-            "Complexity",
-            "MSE",
+            "complexity",
+            "loss",
             "score",
-            "Equation",
+            "equation",
             "sympy_format",
             "lambda_format",
         ]
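
Two user-visible changes fall out of the sr.py diff: the equation DataFrame columns are renamed (Complexity -> complexity, MSE -> loss, Equation -> equation), and the old functional helpers become stubs that raise immediately. A small sketch of both (the usage is illustrative, not taken from the repository):

    from pysr import best_callable

    # The deprecated helpers now fail fast with a pointer to the new interface.
    try:
        best_callable()
    except NotImplementedError as err:
        print(err)  # "`best_callable` has been deprecated. Please use the `PySRRegressor` interface. ..."

    # After fitting a PySRRegressor (see the sketch near the top of this page),
    # rows of `model.equations` use the new lowercase column names:
    #     row["complexity"], row["loss"], row["equation"]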
test/test.py CHANGED
@@ -1,8 +1,8 @@
 import unittest
 from unittest.mock import patch
 import numpy as np
-from pysr import pysr, get_hof, best, best_tex, best_callable, best_row, PySRRegressor
-from pysr.sr import run_feature_selection, _handle_feature_selection, _yesno
+from pysr import PySRRegressor
+from pysr.sr import run_feature_selection, _handle_feature_selection
 import sympy
 from sympy import lambdify
 import pandas as pd
@@ -21,32 +21,33 @@ class TestPipeline(unittest.TestCase):
 
     def test_linear_relation(self):
         y = self.X[:, 0]
-        equations = pysr(self.X, y, **self.default_test_kwargs)
-        print(equations)
-        self.assertLessEqual(equations.iloc[-1]["MSE"], 1e-4)
+        model = PySRRegressor(**self.default_test_kwargs)
+        model.fit(self.X, y)
+        model.set_params(model_selection="accuracy")
+        print(model.equations)
+        self.assertLessEqual(model.get_best()["loss"], 1e-4)
 
     def test_multiprocessing(self):
         y = self.X[:, 0]
-        equations = pysr(
-            self.X, y, **self.default_test_kwargs, procs=2, multithreading=False
-        )
-        print(equations)
-        self.assertLessEqual(equations.iloc[-1]["MSE"], 1e-4)
+        model = PySRRegressor(**self.default_test_kwargs, procs=2, multithreading=False)
+        model.fit(self.X, y)
+        print(model.equations)
+        self.assertLessEqual(model.equations.iloc[-1]["loss"], 1e-4)
 
     def test_multioutput_custom_operator(self):
         y = self.X[:, [0, 1]] ** 2
-        equations = pysr(
-            self.X,
-            y,
+        model = PySRRegressor(
             unary_operators=["sq(x) = x^2"],
-            binary_operators=["plus"],
             extra_sympy_mappings={"sq": lambda x: x ** 2},
+            binary_operators=["plus"],
             **self.default_test_kwargs,
             procs=0,
         )
+        model.fit(self.X, y)
+        equations = model.equations
         print(equations)
-        self.assertLessEqual(equations[0].iloc[-1]["MSE"], 1e-4)
-        self.assertLessEqual(equations[1].iloc[-1]["MSE"], 1e-4)
+        self.assertLessEqual(equations[0].iloc[-1]["loss"], 1e-4)
+        self.assertLessEqual(equations[1].iloc[-1]["loss"], 1e-4)
 
     def test_multioutput_weighted_with_callable_temp_equation(self):
         y = self.X[:, [0, 1]] ** 2
@@ -58,10 +59,7 @@ class TestPipeline(unittest.TestCase):
         y = (2 - w) * y
         # Thus, pysr needs to use the weights to find the right equation!
 
-        pysr(
-            self.X,
-            y,
-            weights=w,
+        model = PySRRegressor(
             unary_operators=["sq(x) = x^2"],
             binary_operators=["plus"],
             extra_sympy_mappings={"sq": lambda x: x ** 2},
@@ -70,12 +68,13 @@ class TestPipeline(unittest.TestCase):
             temp_equation_file=True,
             delete_tempfiles=False,
         )
+        model.fit(self.X, y, weights=w)
 
         np.testing.assert_almost_equal(
-            best_callable()[0](self.X), self.X[:, 0] ** 2, decimal=4
+            model.predict(self.X)[:, 0], self.X[:, 0] ** 2, decimal=4
         )
         np.testing.assert_almost_equal(
-            best_callable()[1](self.X), self.X[:, 1] ** 2, decimal=4
+            model.predict(self.X)[:, 1], self.X[:, 1] ** 2, decimal=4
         )
 
     def test_empty_operators_single_input_sklearn(self):
@@ -108,9 +107,7 @@ class TestPipeline(unittest.TestCase):
 
         np.random.seed(1)
         y = self.X[:, [0, 1]] ** 2 + np.random.randn(self.X.shape[0], 1) * 0.05
-        equations = pysr(
-            self.X,
-            y,
+        model = PySRRegressor(
             # Test that passing a single operator works:
             unary_operators="sq(x) = x^2",
             binary_operators="plus",
@@ -119,8 +116,9 @@ class TestPipeline(unittest.TestCase):
             procs=0,
             denoise=True,
         )
-        self.assertLessEqual(best_row(equations=equations)[0]["MSE"], 1e-2)
-        self.assertLessEqual(best_row(equations=equations)[1]["MSE"], 1e-2)
+        model.fit(self.X, y)
+        self.assertLessEqual(model.get_best()[1]["loss"], 1e-2)
+        self.assertLessEqual(model.get_best()[1]["loss"], 1e-2)
 
     def test_pandas_resample(self):
         np.random.seed(1)
@@ -143,9 +141,7 @@ class TestPipeline(unittest.TestCase):
                 "T": np.random.randn(100),
             }
         )
-        equations = pysr(
-            X,
-            y,
+        model = PySRRegressor(
             unary_operators=[],
             binary_operators=["+", "*", "/", "-"],
             **self.default_test_kwargs,
@@ -153,11 +149,12 @@ class TestPipeline(unittest.TestCase):
             denoise=True,
             select_k_features=2,
         )
-        self.assertNotIn("unused_feature", best_tex())
-        self.assertIn("T", best_tex())
-        self.assertIn("x", best_tex())
-        self.assertLessEqual(equations.iloc[-1]["MSE"], 1e-2)
-        fn = best_callable()
+        model.fit(X, y)
+        self.assertNotIn("unused_feature", model.latex())
+        self.assertIn("T", model.latex())
+        self.assertIn("x", model.latex())
+        self.assertLessEqual(model.get_best()["loss"], 1e-2)
+        fn = model.get_best()['lambda_format']
         self.assertListEqual(list(sorted(fn._selection)), [0, 1])
         X2 = pd.DataFrame(
             {
@@ -167,6 +164,7 @@ class TestPipeline(unittest.TestCase):
             }
         )
         self.assertLess(np.average((fn(X2) - true_fn(X2)) ** 2), 1e-2)
+        self.assertLess(np.average((model.predict(X2) - true_fn(X2)) ** 2), 1e-2)
 
 
 class TestBest(unittest.TestCase):
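
For the multioutput tests, the contract the assertions rely on is that `model.equations` becomes a list of DataFrames (one per target) and `model.get_best()` returns one row per output. A hedged sketch of that pattern, with illustrative data and thresholds taken from the test file:

    import numpy as np
    from pysr import PySRRegressor

    X = np.random.randn(100, 5)
    y = X[:, [0, 1]] ** 2  # two targets -> multioutput mode

    model = PySRRegressor(
        unary_operators=["sq(x) = x^2"],
        binary_operators=["plus"],
        extra_sympy_mappings={"sq": lambda x: x ** 2},
    )
    model.fit(X, y)

    # One DataFrame per output; one best row per output; predictions stack per column.
    assert model.equations[0].iloc[-1]["loss"] < 1e-4
    assert model.get_best()[1]["loss"] < 1e-4
    pred = model.predict(X)  # shape (100, 2), one column per target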