MilesCranmer committed on
Commit 4424b0a
2 Parent(s): bae75db 7ab053d

Merge pull request #95 from MilesCranmer/state-saving

Files changed (5):
  1. README.md +3 -1
  2. docs/start.md +3 -1
  3. pysr/sr.py +46 -5
  4. setup.py +2 -2
  5. test/test.py +8 -1
README.md CHANGED
@@ -74,7 +74,7 @@ Most common issues at this stage are solved
  by [tweaking the Julia package server](https://github.com/MilesCranmer/PySR/issues/27).
  to use up-to-date packages.

- # Quickstart
+ # Introduction

  Let's create a PySR example. First, let's import
  numpy to generate some test data:
@@ -144,6 +144,8 @@ This arrow in the `pick` column indicates which equation is currently selected b
  SymPy format (`sympy_format`), and even JAX and PyTorch format
  (both of which are differentiable).

+ Note that `PySRRegressor` stores the state of the last search, and will restart from where you left off the next time you call `.fit()`. This will cause problems if significant changes are made to the search parameters (like changing the operators). You can run `model.reset()` to reset the state.
+
  There are several other useful features such as denoising (e.g., `denoising=True`),
  feature selection (e.g., `select_k_features=3`).
  For a summary of features and options, see [this docs page](https://pysr.readthedocs.io/en/latest/docs/options/).
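The warm-start behavior documented in the new note is easiest to see in a short usage sketch. The constructor arguments below (`niterations`, `binary_operators`, `unary_operators`) and the toy data follow the quickstart style and are illustrative assumptions, not lines from this diff:

```python
import numpy as np
from pysr import PySRRegressor

# Toy data in the spirit of the quickstart (illustrative only).
X = np.random.randn(100, 5)
y = 2.5 * np.cos(X[:, 3]) + X[:, 0] ** 2 - 0.5

model = PySRRegressor(
    niterations=5,
    binary_operators=["+", "*"],
    unary_operators=["cos"],
)

model.fit(X, y)  # first search
model.fit(X, y)  # warm start: continues from the state saved by the first search

# If you change search-defining options (e.g. the operator set), clear the
# saved state first, otherwise the stored equations/state may be inconsistent:
model.reset()
model.set_params(binary_operators=["+", "-", "*", "/"])
model.fit(X, y)  # fresh search with the new operators
```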
docs/start.md CHANGED
@@ -19,7 +19,7 @@ Most common issues at this stage are solved
  by [tweaking the Julia package server](https://github.com/MilesCranmer/PySR/issues/27).
  to use up-to-date packages.

- # Quickstart
+ # Introduction

  Let's create a PySR example. First, let's import
  numpy to generate some test data:
@@ -89,6 +89,8 @@ This arrow in the `pick` column indicates which equation is currently selected b
  SymPy format (`sympy_format`), and even JAX and PyTorch format
  (both of which are differentiable).

+ Note that `PySRRegressor` stores the state of the last search, and will restart from where you left off the next time you call `.fit()`. This will cause problems if significant changes are made to the search parameters (like changing the operators). You can run `model.reset()` to reset the state.
+
  There are several other useful features such as denoising (e.g., `denoising=True`),
  feature selection (e.g., `select_k_features=3`).
  For a summary of features and options, see [this docs page](https://pysr.readthedocs.io/en/latest/docs/options/).
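The denoising and feature-selection options mentioned alongside the new note can be combined with the same regressor. A minimal sketch, assuming the keyword names quoted in the docs text (`denoising=True`, `select_k_features=3`) match the installed version; check the linked options page for your version:

```python
import numpy as np
from pysr import PySRRegressor

X = np.random.randn(200, 5)
y = 2.1 * np.cos(X[:, 3]) - 0.9 + 0.05 * np.random.randn(200)  # noisy target

model = PySRRegressor(
    niterations=5,
    binary_operators=["+", "-", "*"],
    unary_operators=["cos"],
    denoising=True,        # smooth the target before the search
    select_k_features=3,   # pre-select the 3 most relevant input features
)
model.fit(X, y)
print(model.equations)  # pandas DataFrame of the discovered equations
```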
pysr/sr.py CHANGED
@@ -12,6 +12,8 @@ from datetime import datetime
  import warnings
  from multiprocessing import cpu_count
  from sklearn.base import BaseEstimator, RegressorMixin
+ from collections import OrderedDict
+ from hashlib import sha256

  is_julia_warning_silenced = False

@@ -320,7 +322,7 @@ def _write_project_file(tmp_dir):
  SymbolicRegression = "8254be44-1295-4e6a-a16d-46603ac705cb"

  [compat]
- SymbolicRegression = "0.7.0"
+ SymbolicRegression = "0.7.3"
  julia = "1.5"
  """

@@ -636,9 +638,10 @@ class PySRRegressor(BaseEstimator, RegressorMixin):

  # Stored equations:
  self.equations = None
+ self.params_hash = None
+ self.raw_julia_state = None

  self.multioutput = None
- self.raw_julia_output = None
  self.equation_file = equation_file
  self.n_features = None
  self.extra_sympy_mappings = extra_sympy_mappings
@@ -654,7 +657,6 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
  self.surface_parameters = [
  "model_selection",
  "multioutput",
- "raw_julia_output",
  "equation_file",
  "n_features",
  "extra_sympy_mappings",
@@ -727,7 +729,6 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
  else:
  self.params[key] = value

- self.refresh()
  return self

  def get_params(self, deep=True):
@@ -858,6 +859,12 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
  return [eq["torch_format"] for eq in best]
  return best["torch_format"]

+ def reset(self):
+     """Reset the search state."""
+     self.equations = None
+     self.params_hash = None
+     self.raw_julia_state = None
+
  def _run(self, X, y, weights, variable_names):
  global already_ran
  global Main
@@ -1046,6 +1053,38 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
  float(weightDoNothing),
  ]

+ params_to_hash = {
+     **{k: self.__getattribute__(k) for k in self.surface_parameters},
+     **self.params,
+ }
+ params_excluded_from_hash = [
+     "niterations",
+ ]
+ # Delete these^ from params_to_hash:
+ params_to_hash = {
+     k: v
+     for k, v in params_to_hash.items()
+     if k not in params_excluded_from_hash
+ }
+
+ # Sort params_to_hash by key:
+ params_to_hash = OrderedDict(sorted(params_to_hash.items()))
+ # Hash all parameters:
+ cur_hash = sha256(str(params_to_hash).encode()).hexdigest()
+
+ if self.params_hash is not None:
+     if cur_hash != self.params_hash:
+         warnings.warn(
+             "Warning: PySR options have changed since the last run. "
+             "This is experimental and may not work. "
+             "For example, if the operators change, or even their order,"
+             " the saved equations will be in the wrong format."
+             "\n\n"
+             "To reset the search state, run `.reset()`. "
+         )
+
+ self.params_hash = cur_hash
+
  options = Main.Options(
  binary_operators=Main.eval(str(tuple(binary_operators)).replace("'", "")),
  unary_operators=Main.eval(str(tuple(unary_operators)).replace("'", "")),
@@ -1085,6 +1124,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
  optimizer_iterations=self.params["optimizer_iterations"],
  perturbationFactor=self.params["perturbationFactor"],
  annealing=self.params["annealing"],
+ stateReturn=True,  # Required for state saving.
  )

  np_dtype = {16: np.float16, 32: np.float32, 64: np.float64}[
@@ -1106,7 +1146,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin):

  cprocs = 0 if multithreading else procs

- self.raw_julia_output = Main.EquationSearch(
+ self.raw_julia_state = Main.EquationSearch(
  Main.X,
  Main.y,
  weights=Main.weights,
@@ -1119,6 +1159,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
  options=options,
  numprocs=int(cprocs),
  multithreading=bool(multithreading),
+ saved_state=self.raw_julia_state,
  )

  self.variable_names = variable_names
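The core of the new logic in `_run` is a hash-based check for changed options. Below is a minimal, self-contained sketch of that pattern outside of PySR; the names `hash_params` and `SearchState` are illustrative and do not exist in the codebase:

```python
# Minimal standalone sketch of the parameter-hash check added in _run above.
import warnings
from collections import OrderedDict
from hashlib import sha256


def hash_params(params: dict, exclude=("niterations",)) -> str:
    """Hash a parameter dict, ignoring keys (like niterations) that do not
    affect the meaning of previously saved search state."""
    filtered = OrderedDict(
        sorted((k, v) for k, v in params.items() if k not in exclude)
    )
    return sha256(str(filtered).encode()).hexdigest()


class SearchState:
    def __init__(self):
        self.params_hash = None

    def check(self, params: dict):
        cur_hash = hash_params(params)
        if self.params_hash is not None and cur_hash != self.params_hash:
            warnings.warn(
                "Options changed since the last run; the saved state may be "
                "in the wrong format. Call .reset() to start fresh."
            )
        self.params_hash = cur_hash


# Usage:
state = SearchState()
state.check({"niterations": 5, "binary_operators": ["+", "*"]})
state.check({"niterations": 100, "binary_operators": ["+", "*"]})  # no warning
state.check({"niterations": 100, "binary_operators": ["+", "-"]})  # warns
```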
setup.py CHANGED
@@ -8,14 +8,14 @@ except FileNotFoundError:

  setuptools.setup(
  name="pysr",
- version="0.7.0",
+ version="0.7.0-1",
  author="Miles Cranmer",
  author_email="[email protected]",
  description="Simple and efficient symbolic regression",
  long_description=long_description,
  long_description_content_type="text/markdown",
  url="https://github.com/MilesCranmer/pysr",
- install_requires=["julia", "numpy", "pandas", "sympy", "scikit-learn"],
+ install_requires=["julia>=0.5.7", "numpy", "pandas", "sympy", "scikit-learn"],
  packages=setuptools.find_packages(),
  package_data={"pysr": ["../Project.toml", "../datasets/*"]},
  include_package_data=False,
test/test.py CHANGED
@@ -77,7 +77,7 @@ class TestPipeline(unittest.TestCase):
  model.predict(self.X)[:, 1], self.X[:, 1] ** 2, decimal=4
  )

- def test_empty_operators_single_input_sklearn(self):
+ def test_empty_operators_single_input_multirun(self):
  X = np.random.randn(100, 1)
  y = X[:, 0] + 3.0
  regressor = PySRRegressor(
@@ -94,6 +94,13 @@ class TestPipeline(unittest.TestCase):
  self.assertLessEqual(regressor.equations.iloc[-1]["loss"], 1e-4)
  np.testing.assert_almost_equal(regressor.predict(X), y, decimal=1)

+ # Test if repeated fit works:
+ regressor.set_params(niterations=0)
+ regressor.fit(X, y)
+
+ self.assertLessEqual(regressor.equations.iloc[-1]["loss"], 1e-4)
+ np.testing.assert_almost_equal(regressor.predict(X), y, decimal=1)
+
  # Tweak model selection:
  regressor.set_params(model_selection="best")
  self.assertEqual(regressor.get_params()["model_selection"], "best")
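A natural companion to the repeated-fit test would be a check that `reset()` clears the saved state. The sketch below is not part of this commit; it relies only on the attributes that `reset()` sets to `None` in `pysr/sr.py`:

```python
# Hedged sketch of a possible follow-up test for the new reset() method.
import unittest
from pysr import PySRRegressor


class TestReset(unittest.TestCase):
    def test_reset_clears_state(self):
        regressor = PySRRegressor(niterations=5)
        # Simulate a previous run having populated the saved state:
        regressor.params_hash = "abc"
        regressor.raw_julia_state = object()

        regressor.reset()
        # reset() sets these attributes back to None (see pysr/sr.py above):
        self.assertIsNone(regressor.equations)
        self.assertIsNone(regressor.params_hash)
        self.assertIsNone(regressor.raw_julia_state)


if __name__ == "__main__":
    unittest.main()
```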