Spaces:

MilesCranmer
/

PySR

Running

App Files Files Community

MilesCranmer commited on Aug 10, 2022

Commit

1099283

unverified ·

2 Parent(s): 8da5000 9433a83

Merge pull request #167 from MilesCranmer/loading

Browse files

Files changed (5) hide show

.gitignore +1 -0
README.md +9 -1
pysr/sr.py +184 -12
test/test.py +89 -4
test/test_jax.py +1 -1

.gitignore CHANGED Viewed

@@ -3,6 +3,7 @@
 *.csv
 *.csv.out*
 *.bkup
 performance*txt
 *.out
 trials*

 *.csv
 *.csv.out*
 *.bkup
+*.pkl
 performance*txt
 *.out
 trials*

README.md CHANGED Viewed

@@ -162,7 +162,15 @@ This arrow in the `pick` column indicates which equation is currently selected b
 SymPy format (`sympy_format` - which you can also get with `model.sympy()`), and even JAX and PyTorch format
 (both of which are differentiable - which you can get with `model.jax()` and `model.pytorch()`).
-Note that `PySRRegressor` stores the state of the last search, and will restart from where you left off the next time you call `.fit()`. This will cause problems if significant changes are made to the search parameters (like changing the operators). You can run `model.reset()` to reset the state.
 There are several other useful features such as denoising (e.g., `denoising=True`),
 feature selection (e.g., `select_k_features=3`).

 SymPy format (`sympy_format` - which you can also get with `model.sympy()`), and even JAX and PyTorch format
 (both of which are differentiable - which you can get with `model.jax()` and `model.pytorch()`).
+Note that `PySRRegressor` stores the state of the last search, and will restart from where you left off the next time you call `.fit()`, assuming you have set `warm_start=True`.
+This will cause problems if significant changes are made to the search parameters (like changing the operators). You can run `model.reset()` to reset the state.
+You will notice that PySR will save two files: `hall_of_fame...csv` and `hall_of_fame...pkl`.
+The csv file is a list of equations and their losses, and the pkl file is a saved state of the model.
+You may load the model from the `pkl` file with:
+```python
+model = PySRRegressor.from_file("hall_of_fame.2022-08-10_100832.281.pkl")
+```
 There are several other useful features such as denoising (e.g., `denoising=True`),
 feature selection (e.g., `select_k_features=3`).

pysr/sr.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import os
 import sys
 import numpy as np
@@ -8,6 +9,7 @@ import re
 import tempfile
 import shutil
 from pathlib import Path
 from datetime import datetime
 import warnings
 from multiprocessing import cpu_count
@@ -562,6 +564,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
     equation_file_contents_ : list[pandas.DataFrame]
         Contents of the equation file output by the Julia backend.
     Notes
     -----
     Most default parameters have been tuned over several example equations,
@@ -805,6 +810,119 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
                         f"{k} is not a valid keyword argument for PySRRegressor."
                     )
     def __repr__(self):
         """
         Prints all current equations fitted by the model.
@@ -873,17 +991,31 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         from the pickled instance.
         """
         state = self.__dict__
-        if "raw_julia_state_" in state:
             warnings.warn(
                 "raw_julia_state_ cannot be pickled and will be removed from the "
                 "serialized instance. This will prevent a `warm_start` fit of any "
                 "model that is deserialized via `pickle.load()`."
             )
         pickled_state = {
-            key: None if key == "raw_julia_state_" else value
             for key, value in state.items()
         }
-        if "equations_" in pickled_state:
             pickled_state["output_torch_format"] = False
             pickled_state["output_jax_format"] = False
             if self.nout_ == 1:
@@ -906,6 +1038,16 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
                 ]
         return pickled_state
     @property
     def equations(self):  # pragma: no cover
         warnings.warn(
@@ -1606,8 +1748,20 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             y,
         )
-        # Fitting procedure
-        return self._run(X, y, mutated_params, weights=weights, seed=seed)
     def refresh(self, checkpoint_file=None):
         """
@@ -1619,10 +1773,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         checkpoint_file : str, default=None
             Path to checkpoint hall of fame file to be loaded.
         """
-        check_is_fitted(self, attributes=["equation_file_"])
         if checkpoint_file:
             self.equation_file_ = checkpoint_file
             self.equation_file_contents_ = None
         self.equations_ = self.get_hof()
     def predict(self, X, index=None):
@@ -1812,10 +1966,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             if self.nout_ > 1:
                 all_outputs = []
                 for i in range(1, self.nout_ + 1):
-                    df = pd.read_csv(
-                        str(self.equation_file_) + f".out{i}" + ".bkup",
-                        sep="|",
-                    )
                     # Rename Complexity column to complexity:
                     df.rename(
                         columns={
@@ -1828,7 +1982,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
                     all_outputs.append(df)
             else:
-                all_outputs = [pd.read_csv(str(self.equation_file_) + ".bkup", sep="|")]
                 all_outputs[-1].rename(
                     columns={
                         "Complexity": "complexity",
@@ -1886,7 +2043,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         ret_outputs = []
-        for output in self.equation_file_contents_:
             scores = []
             lastMSE = None
@@ -2035,3 +2194,16 @@ def run_feature_selection(X, y, select_k_features, random_state=None):
         clf, threshold=-np.inf, max_features=select_k_features, prefit=True
     )
     return selector.get_support(indices=True)

+import copy
 import os
 import sys
 import numpy as np
 import tempfile
 import shutil
 from pathlib import Path
+import pickle as pkl
 from datetime import datetime
 import warnings
 from multiprocessing import cpu_count
     equation_file_contents_ : list[pandas.DataFrame]
         Contents of the equation file output by the Julia backend.
+    show_pickle_warnings_ : bool
+        Whether to show warnings about what attributes can be pickled.
     Notes
     -----
     Most default parameters have been tuned over several example equations,
                         f"{k} is not a valid keyword argument for PySRRegressor."
                     )
+    @classmethod
+    def from_file(
+        cls,
+        equation_file,
+        *,
+        binary_operators=None,
+        unary_operators=None,
+        n_features_in=None,
+        feature_names_in=None,
+        selection_mask=None,
+        nout=1,
+        **pysr_kwargs,
+    ):
+        """
+        Create a model from a saved model checkpoint or equation file.
+        Parameters
+        ----------
+        equation_file : str
+            Path to a pickle file containing a saved model, or a csv file
+            containing equations.
+        binary_operators : list[str]
+            The same binary operators used when creating the model.
+            Not needed if loading from a pickle file.
+        unary_operators : list[str]
+            The same unary operators used when creating the model.
+            Not needed if loading from a pickle file.
+        n_features_in : int
+            Number of features passed to the model.
+            Not needed if loading from a pickle file.
+        feature_names_in : list[str]
+            Names of the features passed to the model.
+            Not needed if loading from a pickle file.
+        selection_mask : list[bool]
+            If using select_k_features, you must pass `model.selection_mask_` here.
+            Not needed if loading from a pickle file.
+        nout : int, default=1
+            Number of outputs of the model.
+            Not needed if loading from a pickle file.
+        pysr_kwargs : dict
+            Any other keyword arguments to initialize the PySRRegressor object.
+            These will overwrite those stored in the pickle file.
+            Not needed if loading from a pickle file.
+        Returns
+        -------
+        model : PySRRegressor
+            The model with fitted equations.
+        """
+        if os.path.splitext(equation_file)[1] != ".pkl":
+            pkl_filename = _csv_filename_to_pkl_filename(equation_file)
+        else:
+            pkl_filename = equation_file
+        # Try to load model from <equation_file>.pkl
+        print(f"Checking if {pkl_filename} exists...")
+        if os.path.exists(pkl_filename):
+            print(f"Loading model from {pkl_filename}")
+            assert binary_operators is None
+            assert unary_operators is None
+            assert n_features_in is None
+            with open(pkl_filename, "rb") as f:
+                model = pkl.load(f)
+            # Update any parameters if necessary, such as
+            # extra_sympy_mappings:
+            model.set_params(**pysr_kwargs)
+            if "equations_" not in model.__dict__ or model.equations_ is None:
+                model.refresh()
+            return model
+        # Else, we re-create it.
+        print(
+            f"{equation_file} does not exist, "
+            "so we must create the model from scratch."
+        )
+        assert binary_operators is not None
+        assert unary_operators is not None
+        assert n_features_in is not None
+        # TODO: copy .bkup file if exists.
+        model = cls(
+            equation_file=equation_file,
+            binary_operators=binary_operators,
+            unary_operators=unary_operators,
+            **pysr_kwargs,
+        )
+        model.nout_ = nout
+        model.n_features_in_ = n_features_in
+        if feature_names_in is None:
+            model.feature_names_in_ = [f"x{i}" for i in range(n_features_in)]
+        else:
+            assert len(feature_names_in) == n_features_in
+            model.feature_names_in_ = feature_names_in
+        if selection_mask is None:
+            model.selection_mask_ = np.ones(n_features_in, dtype=bool)
+        else:
+            model.selection_mask_ = selection_mask
+        model.refresh(checkpoint_file=equation_file)
+        return model
     def __repr__(self):
         """
         Prints all current equations fitted by the model.
         from the pickled instance.
         """
         state = self.__dict__
+        show_pickle_warning = not (
+            "show_pickle_warnings_" in state and not state["show_pickle_warnings_"]
+        )
+        if "raw_julia_state_" in state and show_pickle_warning:
             warnings.warn(
                 "raw_julia_state_ cannot be pickled and will be removed from the "
                 "serialized instance. This will prevent a `warm_start` fit of any "
                 "model that is deserialized via `pickle.load()`."
             )
+        state_keys_containing_lambdas = ["extra_sympy_mappings", "extra_torch_mappings"]
+        for state_key in state_keys_containing_lambdas:
+            if state[state_key] is not None and show_pickle_warning:
+                warnings.warn(
+                    f"`{state_key}` cannot be pickled and will be removed from the "
+                    "serialized instance. When loading the model, please redefine "
+                    f"`{state_key}` at runtime."
+                )
+        state_keys_to_clear = ["raw_julia_state_"] + state_keys_containing_lambdas
         pickled_state = {
+            key: (None if key in state_keys_to_clear else value)
             for key, value in state.items()
         }
+        if ("equations_" in pickled_state) and (
+            pickled_state["equations_"] is not None
+        ):
             pickled_state["output_torch_format"] = False
             pickled_state["output_jax_format"] = False
             if self.nout_ == 1:
                 ]
         return pickled_state
+    def _checkpoint(self):
+        """Saves the model's current state to a checkpoint file.
+        This should only be used internally by PySRRegressor."""
+        # Save model state:
+        self.show_pickle_warnings_ = False
+        with open(_csv_filename_to_pkl_filename(self.equation_file_), "wb") as f:
+            pkl.dump(self, f)
+        self.show_pickle_warnings_ = True
     @property
     def equations(self):  # pragma: no cover
         warnings.warn(
             y,
         )
+        # Initially, just save model parameters, so that
+        # it can be loaded from an early exit:
+        if not self.temp_equation_file:
+            self._checkpoint()
+        # Perform the search:
+        self._run(X, y, mutated_params, weights=weights, seed=seed)
+        # Then, after fit, we save again, so the pickle file contains
+        # the equations:
+        if not self.temp_equation_file:
+            self._checkpoint()
+        return self
     def refresh(self, checkpoint_file=None):
         """
         checkpoint_file : str, default=None
             Path to checkpoint hall of fame file to be loaded.
         """
         if checkpoint_file:
             self.equation_file_ = checkpoint_file
             self.equation_file_contents_ = None
+        check_is_fitted(self, attributes=["equation_file_"])
         self.equations_ = self.get_hof()
     def predict(self, X, index=None):
             if self.nout_ > 1:
                 all_outputs = []
                 for i in range(1, self.nout_ + 1):
+                    cur_filename = str(self.equation_file_) + f".out{i}" + ".bkup"
+                    if not os.path.exists(cur_filename):
+                        cur_filename = str(self.equation_file_) + f".out{i}"
+                    df = pd.read_csv(cur_filename, sep="|")
                     # Rename Complexity column to complexity:
                     df.rename(
                         columns={
                     all_outputs.append(df)
             else:
+                filename = str(self.equation_file_) + ".bkup"
+                if not os.path.exists(filename):
+                    filename = str(self.equation_file_)
+                all_outputs = [pd.read_csv(filename, sep="|")]
                 all_outputs[-1].rename(
                     columns={
                         "Complexity": "complexity",
         ret_outputs = []
+        equation_file_contents = copy.deepcopy(self.equation_file_contents_)
+        for output in equation_file_contents:
             scores = []
             lastMSE = None
         clf, threshold=-np.inf, max_features=select_k_features, prefit=True
     )
     return selector.get_support(indices=True)
+def _csv_filename_to_pkl_filename(csv_filename) -> str:
+    # Assume that the csv filename is of the form "foo.csv"
+    assert str(csv_filename).endswith(".csv")
+    dirname = str(os.path.dirname(csv_filename))
+    basename = str(os.path.basename(csv_filename))
+    base = str(os.path.splitext(basename)[0])
+    pkl_basename = base + ".pkl"
+    return os.path.join(dirname, pkl_basename)

test/test.py CHANGED Viewed

@@ -5,13 +5,18 @@ import unittest
 import numpy as np
 from sklearn import model_selection
 from pysr import PySRRegressor
-from pysr.sr import run_feature_selection, _handle_feature_selection
 from sklearn.utils.estimator_checks import check_estimator
 import sympy
 import pandas as pd
 import warnings
 import pickle as pkl
 import tempfile
 DEFAULT_PARAMS = inspect.signature(PySRRegressor.__init__).parameters
 DEFAULT_NITERATIONS = DEFAULT_PARAMS["niterations"].default
@@ -135,7 +140,7 @@ class TestPipeline(unittest.TestCase):
         # These tests are flaky, so don't fail test:
         try:
             np.testing.assert_almost_equal(
-                model.predict(X.copy())[:, 0], X[:, 0] ** 2, decimal=4
             )
         except AssertionError:
             print("Error in test_multioutput_weighted_with_callable_temp_equation")
@@ -144,7 +149,7 @@ class TestPipeline(unittest.TestCase):
         try:
             np.testing.assert_almost_equal(
-                model.predict(X.copy())[:, 1], X[:, 1] ** 2, decimal=4
             )
         except AssertionError:
             print("Error in test_multioutput_weighted_with_callable_temp_equation")
@@ -280,6 +285,72 @@ class TestPipeline(unittest.TestCase):
         model.fit(X.values, y.values, Xresampled=Xresampled.values)
         self.assertLess(np.average((model.predict(X.values) - y.values) ** 2), 1e-4)
 class TestBest(unittest.TestCase):
     def setUp(self):
@@ -330,7 +401,7 @@ class TestBest(unittest.TestCase):
         X = self.X
         y = self.y
         for f in [self.model.predict, self.equations_.iloc[-1]["lambda_format"]]:
-            np.testing.assert_almost_equal(f(X), y, decimal=4)
 class TestFeatureSelection(unittest.TestCase):
@@ -364,6 +435,20 @@ class TestFeatureSelection(unittest.TestCase):
 class TestMiscellaneous(unittest.TestCase):
     """Test miscellaneous functions."""
     def test_deprecation(self):
         """Ensure that deprecation works as expected.

 import numpy as np
 from sklearn import model_selection
 from pysr import PySRRegressor
+from pysr.sr import (
+    run_feature_selection,
+    _handle_feature_selection,
+    _csv_filename_to_pkl_filename,
+)
 from sklearn.utils.estimator_checks import check_estimator
 import sympy
 import pandas as pd
 import warnings
 import pickle as pkl
 import tempfile
+from pathlib import Path
 DEFAULT_PARAMS = inspect.signature(PySRRegressor.__init__).parameters
 DEFAULT_NITERATIONS = DEFAULT_PARAMS["niterations"].default
         # These tests are flaky, so don't fail test:
         try:
             np.testing.assert_almost_equal(
+                model.predict(X.copy())[:, 0], X[:, 0] ** 2, decimal=3
             )
         except AssertionError:
             print("Error in test_multioutput_weighted_with_callable_temp_equation")
         try:
             np.testing.assert_almost_equal(
+                model.predict(X.copy())[:, 1], X[:, 1] ** 2, decimal=3
             )
         except AssertionError:
             print("Error in test_multioutput_weighted_with_callable_temp_equation")
         model.fit(X.values, y.values, Xresampled=Xresampled.values)
         self.assertLess(np.average((model.predict(X.values) - y.values) ** 2), 1e-4)
+    def test_load_model(self):
+        """See if we can load a ran model from the equation file."""
+        csv_file_data = """
+        Complexity|MSE|Equation
+        1|0.19951081|1.9762075
+        3|0.12717344|(f0 + 1.4724599)
+        4|0.104823045|pow_abs(2.2683423, cos(f3))"""
+        # Strip the indents:
+        csv_file_data = "\n".join([l.strip() for l in csv_file_data.split("\n")])
+        for from_backup in [False, True]:
+            rand_dir = Path(tempfile.mkdtemp())
+            equation_filename = str(rand_dir / "equation.csv")
+            with open(equation_filename + (".bkup" if from_backup else ""), "w") as f:
+                f.write(csv_file_data)
+            model = PySRRegressor.from_file(
+                equation_filename,
+                n_features_in=5,
+                feature_names_in=["f0", "f1", "f2", "f3", "f4"],
+                binary_operators=["+", "*", "/", "-", "^"],
+                unary_operators=["cos"],
+            )
+            X = self.rstate.rand(100, 5)
+            y_truth = 2.2683423 ** np.cos(X[:, 3])
+            y_test = model.predict(X, 2)
+            np.testing.assert_allclose(y_truth, y_test)
+    def test_load_model_simple(self):
+        # Test that we can simply load a model from its equation file.
+        y = self.X[:, [0, 1]] ** 2
+        model = PySRRegressor(
+            # Test that passing a single operator works:
+            unary_operators="sq(x) = x^2",
+            binary_operators="plus",
+            extra_sympy_mappings={"sq": lambda x: x**2},
+            **self.default_test_kwargs,
+            procs=0,
+            denoise=True,
+            early_stop_condition="stop_if(loss, complexity) = loss < 0.05 && complexity == 2",
+        )
+        rand_dir = Path(tempfile.mkdtemp())
+        equation_file = rand_dir / "equations.csv"
+        model.set_params(temp_equation_file=False)
+        model.set_params(equation_file=equation_file)
+        model.fit(self.X, y)
+        # lambda functions are removed from the pickling, so we need
+        # to pass it during the loading:
+        model2 = PySRRegressor.from_file(
+            model.equation_file_, extra_sympy_mappings={"sq": lambda x: x**2}
+        )
+        np.testing.assert_allclose(model.predict(self.X), model2.predict(self.X))
+        # Try again, but using only the pickle file:
+        for file_to_delete in [str(equation_file), str(equation_file) + ".bkup"]:
+            if os.path.exists(file_to_delete):
+                os.remove(file_to_delete)
+        pickle_file = rand_dir / "equations.pkl"
+        model3 = PySRRegressor.from_file(
+            model.equation_file_, extra_sympy_mappings={"sq": lambda x: x**2}
+        )
+        np.testing.assert_allclose(model.predict(self.X), model3.predict(self.X))
 class TestBest(unittest.TestCase):
     def setUp(self):
         X = self.X
         y = self.y
         for f in [self.model.predict, self.equations_.iloc[-1]["lambda_format"]]:
+            np.testing.assert_almost_equal(f(X), y, decimal=3)
 class TestFeatureSelection(unittest.TestCase):
 class TestMiscellaneous(unittest.TestCase):
     """Test miscellaneous functions."""
+    def test_csv_to_pkl_conversion(self):
+        """Test that csv filename to pkl filename works as expected."""
+        tmpdir = Path(tempfile.mkdtemp())
+        equation_file = tmpdir / "equations.389479384.28378374.csv"
+        expected_pkl_file = tmpdir / "equations.389479384.28378374.pkl"
+        # First, test inputting the paths:
+        test_pkl_file = _csv_filename_to_pkl_filename(equation_file)
+        self.assertEqual(test_pkl_file, str(expected_pkl_file))
+        # Next, test inputting the strings.
+        test_pkl_file = _csv_filename_to_pkl_filename(str(equation_file))
+        self.assertEqual(test_pkl_file, str(expected_pkl_file))
     def test_deprecation(self):
         """Ensure that deprecation works as expected.

test/test_jax.py CHANGED Viewed

@@ -76,7 +76,7 @@ class TestJAX(unittest.TestCase):
         np.testing.assert_almost_equal(
             np.array(jformat["callable"](jnp.array(X), jformat["parameters"])),
             np.square(np.cos(X[:, 1])),  # Select feature 1
-            decimal=4,
         )
     def test_feature_selection_custom_operators(self):

         np.testing.assert_almost_equal(
             np.array(jformat["callable"](jnp.array(X), jformat["parameters"])),
             np.square(np.cos(X[:, 1])),  # Select feature 1
+            decimal=3,
         )
     def test_feature_selection_custom_operators(self):