MilesCranmer committed on
Commit
913bf09
2 Parent(s): f456047 ae0b11e

Merge pull request #117 from MilesCranmer/defaults

Browse files
Files changed (6) hide show
  1. .github/workflows/CI_Windows.yml +1 -1
  2. README.md +1 -1
  3. example.py +1 -1
  4. pysr/sr.py +30 -23
  5. pysr/version.py +2 -2
  6. test/test.py +38 -36
.github/workflows/CI_Windows.yml CHANGED
@@ -28,7 +28,7 @@ jobs:
28
  matrix:
29
  julia-version: ['1.7.1']
30
  python-version: ['3.9']
31
- os: [windows-latest]
32
 
33
  steps:
34
  - uses: actions/[email protected]
 
28
  matrix:
29
  julia-version: ['1.7.1']
30
  python-version: ['3.9']
31
+ os: [windows-2019]
32
 
33
  steps:
34
  - uses: actions/[email protected]
README.md CHANGED
@@ -87,7 +87,7 @@ PySR's main interface is in the style of scikit-learn:
87
  ```python
88
  from pysr import PySRRegressor
89
  model = PySRRegressor(
90
- niterations=5,
91
  binary_operators=["+", "*"],
92
  unary_operators=[
93
  "cos",
 
87
  ```python
88
  from pysr import PySRRegressor
89
  model = PySRRegressor(
90
+ niterations=40,
91
  binary_operators=["+", "*"],
92
  unary_operators=[
93
  "cos",
example.py CHANGED
@@ -6,7 +6,7 @@ y = 2.5382 * np.cos(X[:, 3]) + X[:, 0] ** 2 - 0.5
6
  from pysr import PySRRegressor
7
 
8
  model = PySRRegressor(
9
- niterations=5,
10
  binary_operators=["+", "*"],
11
  unary_operators=[
12
  "cos",
 
6
  from pysr import PySRRegressor
7
 
8
  model = PySRRegressor(
9
+ niterations=40,
10
  binary_operators=["+", "*"],
11
  unary_operators=[
12
  "cos",
pysr/sr.py CHANGED
@@ -350,30 +350,30 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
350
  unary_operators=None,
351
  procs=cpu_count(),
352
  loss="L2DistLoss()",
353
- populations=100,
354
- niterations=4,
355
- ncyclesperiteration=100,
356
  timeout_in_seconds=None,
357
  alpha=0.1,
358
  annealing=False,
359
- fractionReplaced=0.01,
360
- fractionReplacedHof=0.005,
361
- npop=100,
362
- parsimony=1e-4,
363
  migration=True,
364
  hofMigration=True,
365
  shouldOptimizeConstants=True,
366
- topn=10,
367
- weightAddNode=1,
368
- weightInsertNode=3,
369
- weightDeleteNode=3,
370
- weightDoNothing=1,
371
- weightMutateConstant=10,
372
- weightMutateOperator=1,
373
- weightRandomize=1,
374
- weightSimplify=0.002,
375
- crossoverProbability=0.01,
376
- perturbationFactor=1.0,
377
  extra_sympy_mappings=None,
378
  extra_torch_mappings=None,
379
  extra_jax_mappings=None,
@@ -391,6 +391,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
391
  warmupMaxsizeBy=0.0,
392
  constraints=None,
393
  useFrequency=True,
 
394
  tempdir=None,
395
  delete_tempfiles=True,
396
  julia_project=None,
@@ -399,11 +400,11 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
399
  output_jax_format=False,
400
  output_torch_format=False,
401
  optimizer_algorithm="BFGS",
402
- optimizer_nrestarts=3,
403
- optimize_probability=1.0,
404
- optimizer_iterations=10,
405
  tournament_selection_n=10,
406
- tournament_selection_p=1.0,
407
  denoise=False,
408
  Xresampled=None,
409
  precision=32,
@@ -509,6 +510,8 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
509
  :type constraints: dict
510
  :param useFrequency: whether to measure the frequency of complexities, and use that instead of parsimony to explore equation space. Will naturally find equations of all complexities.
511
  :type useFrequency: bool
 
 
512
  :param tempdir: directory for the temporary files
513
  :type tempdir: str/None
514
  :param delete_tempfiles: whether to delete the temporary files after finishing
@@ -647,6 +650,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
647
  warmupMaxsizeBy=warmupMaxsizeBy,
648
  constraints=constraints,
649
  useFrequency=useFrequency,
 
650
  tempdir=tempdir,
651
  delete_tempfiles=delete_tempfiles,
652
  update=update,
@@ -756,8 +760,10 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
756
  for key, value in params.items():
757
  if key in self.surface_parameters:
758
  self.__setattr__(key, value)
759
- else:
760
  self.params[key] = value
 
 
761
 
762
  return self
763
 
@@ -1192,6 +1198,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
1192
  shouldOptimizeConstants=self.params["shouldOptimizeConstants"],
1193
  warmupMaxsizeBy=self.params["warmupMaxsizeBy"],
1194
  useFrequency=self.params["useFrequency"],
 
1195
  npop=self.params["npop"],
1196
  ncyclesperiteration=self.params["ncyclesperiteration"],
1197
  fractionReplaced=self.params["fractionReplaced"],
 
350
  unary_operators=None,
351
  procs=cpu_count(),
352
  loss="L2DistLoss()",
353
+ populations=15,
354
+ niterations=40,
355
+ ncyclesperiteration=550,
356
  timeout_in_seconds=None,
357
  alpha=0.1,
358
  annealing=False,
359
+ fractionReplaced=0.000364,
360
+ fractionReplacedHof=0.035,
361
+ npop=33,
362
+ parsimony=0.0032,
363
  migration=True,
364
  hofMigration=True,
365
  shouldOptimizeConstants=True,
366
+ topn=12,
367
+ weightAddNode=0.79,
368
+ weightDeleteNode=1.7,
369
+ weightDoNothing=0.21,
370
+ weightInsertNode=5.1,
371
+ weightMutateConstant=0.048,
372
+ weightMutateOperator=0.47,
373
+ weightRandomize=0.00023,
374
+ weightSimplify=0.0020,
375
+ crossoverProbability=0.066,
376
+ perturbationFactor=0.076,
377
  extra_sympy_mappings=None,
378
  extra_torch_mappings=None,
379
  extra_jax_mappings=None,
 
391
  warmupMaxsizeBy=0.0,
392
  constraints=None,
393
  useFrequency=True,
394
+ useFrequencyInTournament=True,
395
  tempdir=None,
396
  delete_tempfiles=True,
397
  julia_project=None,
 
400
  output_jax_format=False,
401
  output_torch_format=False,
402
  optimizer_algorithm="BFGS",
403
+ optimizer_nrestarts=2,
404
+ optimize_probability=0.14,
405
+ optimizer_iterations=8,
406
  tournament_selection_n=10,
407
+ tournament_selection_p=0.86,
408
  denoise=False,
409
  Xresampled=None,
410
  precision=32,
 
510
  :type constraints: dict
511
  :param useFrequency: whether to measure the frequency of complexities, and use that instead of parsimony to explore equation space. Will naturally find equations of all complexities.
512
  :type useFrequency: bool
513
+ :param useFrequencyInTournament: whether to use the frequency mentioned above in the tournament, rather than just the simulated annealing.
514
+ :type useFrequencyInTournament: bool
515
  :param tempdir: directory for the temporary files
516
  :type tempdir: str/None
517
  :param delete_tempfiles: whether to delete the temporary files after finishing
 
650
  warmupMaxsizeBy=warmupMaxsizeBy,
651
  constraints=constraints,
652
  useFrequency=useFrequency,
653
+ useFrequencyInTournament=useFrequencyInTournament,
654
  tempdir=tempdir,
655
  delete_tempfiles=delete_tempfiles,
656
  update=update,
 
760
  for key, value in params.items():
761
  if key in self.surface_parameters:
762
  self.__setattr__(key, value)
763
+ elif key in self.params:
764
  self.params[key] = value
765
+ else:
766
+ raise ValueError(f"Parameter {key} is not in the list of parameters.")
767
 
768
  return self
769
 
 
1198
  shouldOptimizeConstants=self.params["shouldOptimizeConstants"],
1199
  warmupMaxsizeBy=self.params["warmupMaxsizeBy"],
1200
  useFrequency=self.params["useFrequency"],
1201
+ useFrequencyInTournament=self.params["useFrequencyInTournament"],
1202
  npop=self.params["npop"],
1203
  ncyclesperiteration=self.params["ncyclesperiteration"],
1204
  fractionReplaced=self.params["fractionReplaced"],
pysr/version.py CHANGED
@@ -1,2 +1,2 @@
1
- __version__ = "0.7.13"
2
- __symbolic_regression_jl_version__ = "0.7.14"
 
1
+ __version__ = "0.8.0"
2
+ __symbolic_regression_jl_version__ = "0.8.7"
test/test.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import unittest
2
  from unittest.mock import patch
3
  import numpy as np
@@ -10,22 +11,26 @@ import pandas as pd
10
 
11
  class TestPipeline(unittest.TestCase):
12
  def setUp(self):
 
 
 
 
 
 
 
 
13
  self.default_test_kwargs = dict(
14
- niterations=10,
15
- populations=100,
16
- ncyclesperiteration=100,
17
- npop=100,
18
- annealing=True,
19
- useFrequency=False,
20
  )
21
- np.random.seed(0)
22
- self.X = np.random.randn(100, 5)
23
 
24
  def test_linear_relation(self):
25
  y = self.X[:, 0]
26
  model = PySRRegressor(**self.default_test_kwargs)
27
  model.fit(self.X, y)
28
- model.set_params(model_selection="accuracy")
29
  print(model.equations)
30
  self.assertLessEqual(model.get_best()["loss"], 1e-4)
31
 
@@ -67,8 +72,9 @@ class TestPipeline(unittest.TestCase):
67
  self.assertGreater(bad_mse, 1e-4)
68
 
69
  def test_multioutput_weighted_with_callable_temp_equation(self):
70
- y = self.X[:, [0, 1]] ** 2
71
- w = np.random.rand(*y.shape)
 
72
  w[w < 0.5] = 0.0
73
  w[w >= 0.5] = 1.0
74
 
@@ -85,20 +91,19 @@ class TestPipeline(unittest.TestCase):
85
  temp_equation_file=True,
86
  delete_tempfiles=False,
87
  )
88
- model.fit(self.X, y, weights=w)
89
 
90
  np.testing.assert_almost_equal(
91
- model.predict(self.X)[:, 0], self.X[:, 0] ** 2, decimal=4
92
  )
93
  np.testing.assert_almost_equal(
94
- model.predict(self.X)[:, 1], self.X[:, 1] ** 2, decimal=4
95
  )
96
 
97
  def test_empty_operators_single_input_multirun(self):
98
- X = np.random.randn(100, 1)
99
  y = X[:, 0] + 3.0
100
  regressor = PySRRegressor(
101
- model_selection="accuracy",
102
  unary_operators=[],
103
  binary_operators=["plus"],
104
  **self.default_test_kwargs,
@@ -124,13 +129,9 @@ class TestPipeline(unittest.TestCase):
124
  self.assertTrue("None" not in regressor.__repr__())
125
  self.assertTrue(">>>>" in regressor.__repr__())
126
 
127
- # "best" model_selection should also give a decent loss:
128
- np.testing.assert_almost_equal(regressor.predict(X), y, decimal=1)
129
-
130
  def test_noisy(self):
131
 
132
- np.random.seed(1)
133
- y = self.X[:, [0, 1]] ** 2 + np.random.randn(self.X.shape[0], 1) * 0.05
134
  model = PySRRegressor(
135
  # Test that passing a single operator works:
136
  unary_operators="sq(x) = x^2",
@@ -145,26 +146,25 @@ class TestPipeline(unittest.TestCase):
145
  self.assertLessEqual(model.get_best()[1]["loss"], 1e-2)
146
 
147
  def test_pandas_resample(self):
148
- np.random.seed(1)
149
  X = pd.DataFrame(
150
  {
151
- "T": np.random.randn(500),
152
- "x": np.random.randn(500),
153
- "unused_feature": np.random.randn(500),
154
  }
155
  )
156
  true_fn = lambda x: np.array(x["T"] + x["x"] ** 2 + 1.323837)
157
  y = true_fn(X)
158
- noise = np.random.randn(500) * 0.01
159
  y = y + noise
160
  # We also test y as a pandas array:
161
  y = pd.Series(y)
162
  # Resampled array is a different order of features:
163
  Xresampled = pd.DataFrame(
164
  {
165
- "unused_feature": np.random.randn(100),
166
- "x": np.random.randn(100),
167
- "T": np.random.randn(100),
168
  }
169
  )
170
  model = PySRRegressor(
@@ -184,9 +184,9 @@ class TestPipeline(unittest.TestCase):
184
  self.assertListEqual(list(sorted(fn._selection)), [0, 1])
185
  X2 = pd.DataFrame(
186
  {
187
- "T": np.random.randn(100),
188
- "unused_feature": np.random.randn(100),
189
- "x": np.random.randn(100),
190
  }
191
  )
192
  self.assertLess(np.average((fn(X2) - true_fn(X2)) ** 2), 1e-1)
@@ -212,10 +212,12 @@ class TestBest(unittest.TestCase):
212
  variable_names="x0 x1".split(" "),
213
  extra_sympy_mappings={},
214
  output_jax_format=False,
 
215
  )
216
  self.model.n_features = 2
217
  self.model.refresh()
218
  self.equations = self.model.equations
 
219
 
220
  def test_best(self):
221
  self.assertEqual(self.model.sympy(), sympy.cos(sympy.Symbol("x0")) ** 2)
@@ -230,7 +232,7 @@ class TestBest(unittest.TestCase):
230
  self.assertEqual(self.model.latex(), "\\cos^{2}{\\left(x_{0} \\right)}")
231
 
232
  def test_best_lambda(self):
233
- X = np.random.randn(10, 2)
234
  y = np.cos(X[:, 0]) ** 2
235
  for f in [self.model.predict, self.equations.iloc[-1]["lambda_format"]]:
236
  np.testing.assert_almost_equal(f(X), y, decimal=4)
@@ -238,16 +240,16 @@ class TestBest(unittest.TestCase):
238
 
239
  class TestFeatureSelection(unittest.TestCase):
240
  def setUp(self):
241
- np.random.seed(0)
242
 
243
  def test_feature_selection(self):
244
- X = np.random.randn(20000, 5)
245
  y = X[:, 2] ** 2 + X[:, 3] ** 2
246
  selected = run_feature_selection(X, y, select_k_features=2)
247
  self.assertEqual(sorted(selected), [2, 3])
248
 
249
  def test_feature_selection_handler(self):
250
- X = np.random.randn(20000, 5)
251
  y = X[:, 2] ** 2 + X[:, 3] ** 2
252
  var_names = [f"x{i}" for i in range(5)]
253
  selected_X, selection = _handle_feature_selection(
 
1
+ import inspect
2
  import unittest
3
  from unittest.mock import patch
4
  import numpy as np
 
11
 
12
  class TestPipeline(unittest.TestCase):
13
  def setUp(self):
14
+ # Using inspect,
15
+ # get default niterations from PySRRegressor, and double them:
16
+ default_niterations = (
17
+ inspect.signature(PySRRegressor.__init__).parameters["niterations"].default
18
+ )
19
+ default_populations = (
20
+ inspect.signature(PySRRegressor.__init__).parameters["populations"].default
21
+ )
22
  self.default_test_kwargs = dict(
23
+ model_selection="accuracy",
24
+ niterations=default_niterations * 2,
25
+ populations=default_populations * 2,
 
 
 
26
  )
27
+ self.rstate = np.random.RandomState(0)
28
+ self.X = self.rstate.randn(100, 5)
29
 
30
  def test_linear_relation(self):
31
  y = self.X[:, 0]
32
  model = PySRRegressor(**self.default_test_kwargs)
33
  model.fit(self.X, y)
 
34
  print(model.equations)
35
  self.assertLessEqual(model.get_best()["loss"], 1e-4)
36
 
 
72
  self.assertGreater(bad_mse, 1e-4)
73
 
74
  def test_multioutput_weighted_with_callable_temp_equation(self):
75
+ X = self.X.copy()
76
+ y = X[:, [0, 1]] ** 2
77
+ w = self.rstate.rand(*y.shape)
78
  w[w < 0.5] = 0.0
79
  w[w >= 0.5] = 1.0
80
 
 
91
  temp_equation_file=True,
92
  delete_tempfiles=False,
93
  )
94
+ model.fit(X.copy(), y, weights=w)
95
 
96
  np.testing.assert_almost_equal(
97
+ model.predict(X.copy())[:, 0], X[:, 0] ** 2, decimal=4
98
  )
99
  np.testing.assert_almost_equal(
100
+ model.predict(X.copy())[:, 1], X[:, 1] ** 2, decimal=4
101
  )
102
 
103
  def test_empty_operators_single_input_multirun(self):
104
+ X = self.rstate.randn(100, 1)
105
  y = X[:, 0] + 3.0
106
  regressor = PySRRegressor(
 
107
  unary_operators=[],
108
  binary_operators=["plus"],
109
  **self.default_test_kwargs,
 
129
  self.assertTrue("None" not in regressor.__repr__())
130
  self.assertTrue(">>>>" in regressor.__repr__())
131
 
 
 
 
132
  def test_noisy(self):
133
 
134
+ y = self.X[:, [0, 1]] ** 2 + self.rstate.randn(self.X.shape[0], 1) * 0.05
 
135
  model = PySRRegressor(
136
  # Test that passing a single operator works:
137
  unary_operators="sq(x) = x^2",
 
146
  self.assertLessEqual(model.get_best()[1]["loss"], 1e-2)
147
 
148
  def test_pandas_resample(self):
 
149
  X = pd.DataFrame(
150
  {
151
+ "T": self.rstate.randn(500),
152
+ "x": self.rstate.randn(500),
153
+ "unused_feature": self.rstate.randn(500),
154
  }
155
  )
156
  true_fn = lambda x: np.array(x["T"] + x["x"] ** 2 + 1.323837)
157
  y = true_fn(X)
158
+ noise = self.rstate.randn(500) * 0.01
159
  y = y + noise
160
  # We also test y as a pandas array:
161
  y = pd.Series(y)
162
  # Resampled array is a different order of features:
163
  Xresampled = pd.DataFrame(
164
  {
165
+ "unused_feature": self.rstate.randn(100),
166
+ "x": self.rstate.randn(100),
167
+ "T": self.rstate.randn(100),
168
  }
169
  )
170
  model = PySRRegressor(
 
184
  self.assertListEqual(list(sorted(fn._selection)), [0, 1])
185
  X2 = pd.DataFrame(
186
  {
187
+ "T": self.rstate.randn(100),
188
+ "unused_feature": self.rstate.randn(100),
189
+ "x": self.rstate.randn(100),
190
  }
191
  )
192
  self.assertLess(np.average((fn(X2) - true_fn(X2)) ** 2), 1e-1)
 
212
  variable_names="x0 x1".split(" "),
213
  extra_sympy_mappings={},
214
  output_jax_format=False,
215
+ model_selection="accuracy",
216
  )
217
  self.model.n_features = 2
218
  self.model.refresh()
219
  self.equations = self.model.equations
220
+ self.rstate = np.random.RandomState(0)
221
 
222
  def test_best(self):
223
  self.assertEqual(self.model.sympy(), sympy.cos(sympy.Symbol("x0")) ** 2)
 
232
  self.assertEqual(self.model.latex(), "\\cos^{2}{\\left(x_{0} \\right)}")
233
 
234
  def test_best_lambda(self):
235
+ X = self.rstate.randn(10, 2)
236
  y = np.cos(X[:, 0]) ** 2
237
  for f in [self.model.predict, self.equations.iloc[-1]["lambda_format"]]:
238
  np.testing.assert_almost_equal(f(X), y, decimal=4)
 
240
 
241
  class TestFeatureSelection(unittest.TestCase):
242
  def setUp(self):
243
+ self.rstate = np.random.RandomState(0)
244
 
245
  def test_feature_selection(self):
246
+ X = self.rstate.randn(20000, 5)
247
  y = X[:, 2] ** 2 + X[:, 3] ** 2
248
  selected = run_feature_selection(X, y, select_k_features=2)
249
  self.assertEqual(sorted(selected), [2, 3])
250
 
251
  def test_feature_selection_handler(self):
252
+ X = self.rstate.randn(20000, 5)
253
  y = X[:, 2] ** 2 + X[:, 3] ** 2
254
  var_names = [f"x{i}" for i in range(5)]
255
  selected_X, selection = _handle_feature_selection(