Spaces:

MilesCranmer
/

PySR

Sleeping

App Files Community

MilesCranmer committed on Jun 4, 2022

Commit

c3dc203

unverified ·

2 Parent(s): 5bbefa6 ad1c492

Merge pull request #146 from tttc3/refactor-PySRRegressor

Browse files

Files changed (8) hide show

pysr/__init__.py +1 -1
pysr/export_numpy.py +40 -0
pysr/julia_helpers.py +126 -0
pysr/sr.py +0 -0
pysr/version.py +2 -2
test/test.py +110 -33
test/test_jax.py +58 -11
test/test_torch.py +89 -20

pysr/__init__.py CHANGED Viewed

@@ -6,8 +6,8 @@ from .sr import (
     best_tex,
     best_callable,
     best_row,
-    install,
 )
 from .feynman_problems import Problem, FeynmanProblem
 from .export_jax import sympy2jax
 from .export_torch import sympy2torch

     best_tex,
     best_callable,
     best_row,
 )
+from .julia_helpers import install
 from .feynman_problems import Problem, FeynmanProblem
 from .export_jax import sympy2jax
 from .export_torch import sympy2torch

pysr/export_numpy.py ADDED Viewed

	@@ -0,0 +1,40 @@

+"""Code for exporting discovered expressions to numpy"""
+import numpy as np
+import pandas as pd
+from sympy import lambdify
+import warnings
+class CallableEquation:
+    """Simple wrapper for numpy lambda functions built with sympy"""
+    def __init__(self, sympy_symbols, eqn, selection=None, variable_names=None):
+        self._sympy = eqn
+        self._sympy_symbols = sympy_symbols
+        self._selection = selection
+        self._variable_names = variable_names
+    def __repr__(self):
+        return f"PySRFunction(X=>{self._sympy})"
+    def __call__(self, X):
+        expected_shape = (X.shape[0],)
+        if isinstance(X, pd.DataFrame):
+            # Lambda function takes as argument:
+            return self._lambda(
+                **{k: X[k].values for k in self._variable_names}
+            ) * np.ones(expected_shape)
+        if self._selection is not None:
+            if X.shape[1] != len(self._selection):
+                warnings.warn(
+                    "`X` should be of shape (n_samples, len(self._selection)). "
+                    "Automatically filtering `X` to selection. "
+                    "Note: Filtered `X` column order may not match column order in fit "
+                    "this may lead to incorrect predictions and other errors."
+                )
+                X = X[:, self._selection]
+        return self._lambda(*X.T) * np.ones(expected_shape)
+    @property
+    def _lambda(self):
+        return lambdify(self._sympy_symbols, self._sympy)

pysr/julia_helpers.py ADDED Viewed

	@@ -0,0 +1,126 @@

+"""Functions for initializing the Julia environment and installing deps."""
+import warnings
+from pathlib import Path
+import os
+from .version import __version__, __symbolic_regression_jl_version__
+def install(julia_project=None, quiet=False):  # pragma: no cover
+    """
+    Install PyCall.jl and all required dependencies for SymbolicRegression.jl.
+    Also updates the local Julia registry.
+    """
+    import julia
+    julia.install(quiet=quiet)
+    julia_project, is_shared = _get_julia_project(julia_project)
+    Main = init_julia()
+    Main.eval("using Pkg")
+    io = "devnull" if quiet else "stderr"
+    io_arg = f"io={io}" if is_julia_version_greater_eq(Main, "1.6") else ""
+    # Can't pass IO to Julia call as it evaluates to PyObject, so just directly
+    # use Main.eval:
+    Main.eval(
+        f'Pkg.activate("{_escape_filename(julia_project)}", shared = Bool({int(is_shared)}), {io_arg})'
+    )
+    if is_shared:
+        # Install SymbolicRegression.jl:
+        _add_sr_to_julia_project(Main, io_arg)
+    Main.eval(f"Pkg.instantiate({io_arg})")
+    Main.eval(f"Pkg.precompile({io_arg})")
+    if not quiet:
+        warnings.warn(
+            "It is recommended to restart Python after installing PySR's dependencies,"
+            " so that the Julia environment is properly initialized."
+        )
+def import_error_string(julia_project=None):
+    s = """
+    Required dependencies are not installed or built.  Run the following code in the Python REPL:
+        >>> import pysr
+        >>> pysr.install()
+    """
+    if julia_project is not None:
+        s += f"""
+        Tried to activate project {julia_project} but failed."""
+    return s
+def _get_julia_project(julia_project):
+    if julia_project is None:
+        is_shared = True
+        julia_project = f"pysr-{__version__}"
+    else:
+        is_shared = False
+        julia_project = Path(julia_project)
+    return julia_project, is_shared
+def is_julia_version_greater_eq(Main, version="1.6"):
+    """Check if Julia version is greater than specified version."""
+    return Main.eval(f'VERSION >= v"{version}"')
+def init_julia():
+    """Initialize julia binary, turning off compiled modules if needed."""
+    from julia.core import JuliaInfo, UnsupportedPythonError
+    try:
+        info = JuliaInfo.load(julia="julia")
+    except FileNotFoundError:
+        env_path = os.environ["PATH"]
+        raise FileNotFoundError(
+            f"Julia is not installed in your PATH. Please install Julia and add it to your PATH.\n\nCurrent PATH: {env_path}",
+        )
+    if not info.is_pycall_built():
+        raise ImportError(import_error_string())
+    Main = None
+    try:
+        from julia import Main as _Main
+        Main = _Main
+    except UnsupportedPythonError:
+        # Static python binary, so we turn off pre-compiled modules.
+        from julia.core import Julia
+        jl = Julia(compiled_modules=False)
+        from julia import Main as _Main
+        Main = _Main
+    return Main
+def _add_sr_to_julia_project(Main, io_arg):
+    Main.sr_spec = Main.PackageSpec(
+        name="SymbolicRegression",
+        url="https://github.com/MilesCranmer/SymbolicRegression.jl",
+        rev="v" + __symbolic_regression_jl_version__,
+    )
+    Main.eval(f"Pkg.add(sr_spec, {io_arg})")
+    Main.clustermanagers_spec = Main.PackageSpec(
+        name="ClusterManagers",
+        url="https://github.com/JuliaParallel/ClusterManagers.jl",
+        rev="14e7302f068794099344d5d93f71979aaf4fbeb3",
+    )
+    Main.eval(f"Pkg.add(clustermanagers_spec, {io_arg})")
+def _escape_filename(filename):
+    """Turns a file into a string representation with correctly escaped backslashes"""
+    str_repr = str(filename)
+    str_repr = str_repr.replace("\\", "\\\\")
+    return str_repr

pysr/sr.py CHANGED Viewed

The diff for this file is too large to render. See raw diff

pysr/version.py CHANGED Viewed

	@@ -1,2 +1,2 @@
1	- __version__ = "0.8.7"
2	- __symbolic_regression_jl_version__ = "0.9.4"


1	+ __version__ = "0.9.0"
2	+ __symbolic_regression_jl_version__ = "0.9.6"

test/test.py CHANGED Viewed

@@ -1,11 +1,11 @@
 import inspect
 import unittest
-from unittest.mock import patch
 import numpy as np
 from pysr import PySRRegressor
 from pysr.sr import run_feature_selection, _handle_feature_selection
 import sympy
-from sympy import lambdify
 import pandas as pd
 import warnings
@@ -21,6 +21,7 @@ class TestPipeline(unittest.TestCase):
             inspect.signature(PySRRegressor.__init__).parameters["populations"].default
         )
         self.default_test_kwargs = dict(
             model_selection="accuracy",
             niterations=default_niterations * 2,
             populations=default_populations * 2,
@@ -30,17 +31,25 @@ class TestPipeline(unittest.TestCase):
     def test_linear_relation(self):
         y = self.X[:, 0]
-        model = PySRRegressor(**self.default_test_kwargs)
         model.fit(self.X, y)
-        print(model.equations)
         self.assertLessEqual(model.get_best()["loss"], 1e-4)
     def test_multiprocessing(self):
         y = self.X[:, 0]
-        model = PySRRegressor(**self.default_test_kwargs, procs=2, multithreading=False)
         model.fit(self.X, y)
-        print(model.equations)
-        self.assertLessEqual(model.equations.iloc[-1]["loss"], 1e-4)
     def test_multioutput_custom_operator_quiet_custom_complexity(self):
         y = self.X[:, [0, 1]] ** 2
@@ -55,11 +64,12 @@ class TestPipeline(unittest.TestCase):
             # Test custom operators with constraints:
             nested_constraints={"square_op": {"square_op": 3}},
             constraints={"square_op": 10},
         )
         model.fit(self.X, y)
-        equations = model.equations
         print(equations)
-        self.assertIn("square_op", model.equations[0].iloc[-1]["equation"])
         self.assertLessEqual(equations[0].iloc[-1]["loss"], 1e-4)
         self.assertLessEqual(equations[1].iloc[-1]["loss"], 1e-4)
@@ -95,6 +105,7 @@ class TestPipeline(unittest.TestCase):
             procs=0,
             temp_equation_file=True,
             delete_tempfiles=False,
         )
         model.fit(X.copy(), y, weights=w)
@@ -117,27 +128,29 @@ class TestPipeline(unittest.TestCase):
             print("Model equations: ", model.sympy()[1])
             print("True equation: x1^2")
-    def test_empty_operators_single_input_multirun(self):
         X = self.rstate.randn(100, 1)
         y = X[:, 0] + 3.0
         regressor = PySRRegressor(
             unary_operators=[],
             binary_operators=["plus"],
             **self.default_test_kwargs,
         )
         self.assertTrue("None" in regressor.__repr__())
         regressor.fit(X, y)
         self.assertTrue("None" not in regressor.__repr__())
         self.assertTrue(">>>>" in regressor.__repr__())
-        self.assertLessEqual(regressor.equations.iloc[-1]["loss"], 1e-4)
         np.testing.assert_almost_equal(regressor.predict(X), y, decimal=1)
         # Test if repeated fit works:
-        regressor.set_params(niterations=0)
         regressor.fit(X, y)
-        self.assertLessEqual(regressor.equations.iloc[-1]["loss"], 1e-4)
         np.testing.assert_almost_equal(regressor.predict(X), y, decimal=1)
         # Tweak model selection:
@@ -157,7 +170,11 @@ class TestPipeline(unittest.TestCase):
             **self.default_test_kwargs,
             procs=0,
             denoise=True,
         )
         model.fit(self.X, y)
         self.assertLessEqual(model.get_best()[1]["loss"], 1e-2)
         self.assertLessEqual(model.get_best()[1]["loss"], 1e-2)
@@ -188,11 +205,11 @@ class TestPipeline(unittest.TestCase):
             unary_operators=[],
             binary_operators=["+", "*", "/", "-"],
             **self.default_test_kwargs,
-            Xresampled=Xresampled,
             denoise=True,
             nested_constraints={"/": {"+": 1, "-": 1}, "+": {"*": 4}},
         )
-        model.fit(X, y)
         self.assertNotIn("unused_feature", model.latex())
         self.assertIn("T", model.latex())
         self.assertIn("x", model.latex())
@@ -217,18 +234,31 @@ class TestPipeline(unittest.TestCase):
             unary_operators=["cos"],
             select_k_features=3,
             early_stop_condition=1e-4,  # Stop once most accurate equation is <1e-4 MSE
-            Xresampled=Xresampled,
             maxsize=12,
             **self.default_test_kwargs,
         )
-        model.fit(X, y)
         model.set_params(model_selection="accuracy")
-        model.predict(X)
         self.assertLess(np.average((model.predict(X) - y) ** 2), 1e-4)
 class TestBest(unittest.TestCase):
     def setUp(self):
         equations = pd.DataFrame(
             {
                 "equation": ["1.0", "cos(x0)", "square(cos(x0))"],
@@ -241,17 +271,8 @@ class TestBest(unittest.TestCase):
             "equation_file.csv.bkup", sep="|"
         )
-        self.model = PySRRegressor(
-            equation_file="equation_file.csv",
-            variable_names="x0 x1".split(" "),
-            extra_sympy_mappings={},
-            output_jax_format=False,
-            model_selection="accuracy",
-        )
-        self.model.n_features = 2
         self.model.refresh()
-        self.equations = self.model.equations
-        self.rstate = np.random.RandomState(0)
     def test_best(self):
         self.assertEqual(self.model.sympy(), sympy.cos(sympy.Symbol("x0")) ** 2)
@@ -266,9 +287,9 @@ class TestBest(unittest.TestCase):
         self.assertEqual(self.model.latex(), "\\cos^{2}{\\left(x_{0} \\right)}")
     def test_best_lambda(self):
-        X = self.rstate.randn(10, 2)
-        y = np.cos(X[:, 0]) ** 2
-        for f in [self.model.predict, self.equations.iloc[-1]["lambda_format"]]:
             np.testing.assert_almost_equal(f(X), y, decimal=4)
@@ -308,12 +329,12 @@ class TestMiscellaneous(unittest.TestCase):
         This should give a warning, and sets the correct value.
         """
-        with self.assertWarns(UserWarning):
             model = PySRRegressor(fractionReplaced=0.2)
         # This is a deprecated parameter, so we should get a warning.
         # The correct value should be set:
-        self.assertEqual(model.params["fraction_replaced"], 0.2)
     def test_size_warning(self):
         """Ensure that a warning is given for a large input size."""
@@ -336,3 +357,59 @@ class TestMiscellaneous(unittest.TestCase):
             with self.assertRaises(Exception) as context:
                 model.fit(X, y)
             self.assertIn("with 10 features or more", str(context.exception))

 import inspect
 import unittest
 import numpy as np
+from sklearn import model_selection
 from pysr import PySRRegressor
 from pysr.sr import run_feature_selection, _handle_feature_selection
+from sklearn.utils.estimator_checks import check_estimator
 import sympy
 import pandas as pd
 import warnings
             inspect.signature(PySRRegressor.__init__).parameters["populations"].default
         )
         self.default_test_kwargs = dict(
+            progress=False,
             model_selection="accuracy",
             niterations=default_niterations * 2,
             populations=default_populations * 2,
     def test_linear_relation(self):
         y = self.X[:, 0]
+        model = PySRRegressor(
+            **self.default_test_kwargs,
+            early_stop_condition="stop_if(loss, complexity) = loss < 1e-4 && complexity == 1",
+        )
         model.fit(self.X, y)
+        print(model.equations_)
         self.assertLessEqual(model.get_best()["loss"], 1e-4)
     def test_multiprocessing(self):
         y = self.X[:, 0]
+        model = PySRRegressor(
+            **self.default_test_kwargs,
+            procs=2,
+            multithreading=False,
+            early_stop_condition="stop_if(loss, complexity) = loss < 1e-4 && complexity == 1",
+        )
         model.fit(self.X, y)
+        print(model.equations_)
+        self.assertLessEqual(model.equations_.iloc[-1]["loss"], 1e-4)
     def test_multioutput_custom_operator_quiet_custom_complexity(self):
         y = self.X[:, [0, 1]] ** 2
             # Test custom operators with constraints:
             nested_constraints={"square_op": {"square_op": 3}},
             constraints={"square_op": 10},
+            early_stop_condition="stop_if(loss, complexity) = loss < 1e-4 && complexity == 3",
         )
         model.fit(self.X, y)
+        equations = model.equations_
         print(equations)
+        self.assertIn("square_op", model.equations_[0].iloc[-1]["equation"])
         self.assertLessEqual(equations[0].iloc[-1]["loss"], 1e-4)
         self.assertLessEqual(equations[1].iloc[-1]["loss"], 1e-4)
             procs=0,
             temp_equation_file=True,
             delete_tempfiles=False,
+            early_stop_condition="stop_if(loss, complexity) = loss < 1e-4 && complexity == 2",
         )
         model.fit(X.copy(), y, weights=w)
             print("Model equations: ", model.sympy()[1])
             print("True equation: x1^2")
+    def test_empty_operators_single_input_warm_start(self):
         X = self.rstate.randn(100, 1)
         y = X[:, 0] + 3.0
         regressor = PySRRegressor(
             unary_operators=[],
             binary_operators=["plus"],
             **self.default_test_kwargs,
+            early_stop_condition="stop_if(loss, complexity) = loss < 1e-4 && complexity == 3",
         )
         self.assertTrue("None" in regressor.__repr__())
         regressor.fit(X, y)
         self.assertTrue("None" not in regressor.__repr__())
         self.assertTrue(">>>>" in regressor.__repr__())
+        self.assertLessEqual(regressor.equations_.iloc[-1]["loss"], 1e-4)
         np.testing.assert_almost_equal(regressor.predict(X), y, decimal=1)
         # Test if repeated fit works:
+        regressor.set_params(niterations=0, warm_start=True, early_stop_condition=None)
+        # This should exit immediately, and use the old equations
         regressor.fit(X, y)
+        self.assertLessEqual(regressor.equations_.iloc[-1]["loss"], 1e-4)
         np.testing.assert_almost_equal(regressor.predict(X), y, decimal=1)
         # Tweak model selection:
             **self.default_test_kwargs,
             procs=0,
             denoise=True,
+            early_stop_condition="stop_if(loss, complexity) = loss < 0.05 && complexity == 2",
         )
+        # We expect in this case that the "best"
+        # equation should be the right one:
+        model.set_params(model_selection="best")
         model.fit(self.X, y)
         self.assertLessEqual(model.get_best()[1]["loss"], 1e-2)
         self.assertLessEqual(model.get_best()[1]["loss"], 1e-2)
             unary_operators=[],
             binary_operators=["+", "*", "/", "-"],
             **self.default_test_kwargs,
             denoise=True,
             nested_constraints={"/": {"+": 1, "-": 1}, "+": {"*": 4}},
+            early_stop_condition="stop_if(loss, complexity) = loss < 1e-3 && complexity == 7",
         )
+        model.fit(X, y, Xresampled=Xresampled)
         self.assertNotIn("unused_feature", model.latex())
         self.assertIn("T", model.latex())
         self.assertIn("x", model.latex())
             unary_operators=["cos"],
             select_k_features=3,
             early_stop_condition=1e-4,  # Stop once most accurate equation is <1e-4 MSE
             maxsize=12,
             **self.default_test_kwargs,
         )
         model.set_params(model_selection="accuracy")
+        model.fit(X, y, Xresampled=Xresampled)
         self.assertLess(np.average((model.predict(X) - y) ** 2), 1e-4)
+        # Again, but with numpy arrays:
+        model.fit(X.values, y.values, Xresampled=Xresampled.values)
+        self.assertLess(np.average((model.predict(X.values) - y.values) ** 2), 1e-4)
 class TestBest(unittest.TestCase):
     def setUp(self):
+        self.rstate = np.random.RandomState(0)
+        self.X = self.rstate.randn(10, 2)
+        self.y = np.cos(self.X[:, 0]) ** 2
+        self.model = PySRRegressor(
+            progress=False,
+            niterations=1,
+            extra_sympy_mappings={},
+            output_jax_format=False,
+            model_selection="accuracy",
+            equation_file="equation_file.csv",
+        )
+        self.model.fit(self.X, self.y)
         equations = pd.DataFrame(
             {
                 "equation": ["1.0", "cos(x0)", "square(cos(x0))"],
             "equation_file.csv.bkup", sep="|"
         )
         self.model.refresh()
+        self.equations_ = self.model.equations_
     def test_best(self):
         self.assertEqual(self.model.sympy(), sympy.cos(sympy.Symbol("x0")) ** 2)
         self.assertEqual(self.model.latex(), "\\cos^{2}{\\left(x_{0} \\right)}")
     def test_best_lambda(self):
+        X = self.X
+        y = self.y
+        for f in [self.model.predict, self.equations_.iloc[-1]["lambda_format"]]:
             np.testing.assert_almost_equal(f(X), y, decimal=4)
         This should give a warning, and sets the correct value.
         """
+        with self.assertWarns(FutureWarning):
             model = PySRRegressor(fractionReplaced=0.2)
         # This is a deprecated parameter, so we should get a warning.
         # The correct value should be set:
+        self.assertEqual(model.fraction_replaced, 0.2)
     def test_size_warning(self):
         """Ensure that a warning is given for a large input size."""
             with self.assertRaises(Exception) as context:
                 model.fit(X, y)
             self.assertIn("with 10 features or more", str(context.exception))
+    def test_deterministic_warnings(self):
+        """Ensure that warnings are given for determinism"""
+        model = PySRRegressor(random_state=0)
+        X = np.random.randn(100, 2)
+        y = np.random.randn(100)
+        with warnings.catch_warnings():
+            warnings.simplefilter("error")
+            with self.assertRaises(Exception) as context:
+                model.fit(X, y)
+            self.assertIn("`deterministic`", str(context.exception))
+    def test_deterministic_errors(self):
+        """Setting deterministic without random_state should error"""
+        model = PySRRegressor(deterministic=True)
+        X = np.random.randn(100, 2)
+        y = np.random.randn(100)
+        with self.assertRaises(ValueError):
+            model.fit(X, y)
+    def test_scikit_learn_compatibility(self):
+        """Test PySRRegressor compatibility with scikit-learn."""
+        model = PySRRegressor(
+            max_evals=1000,
+            verbosity=0,
+            progress=False,
+            random_state=0,
+            deterministic=True,
+            procs=0,
+            multithreading=False,
+            warm_start=False,
+        )  # Return early.
+        check_generator = check_estimator(model, generate_only=True)
+        exception_messages = []
+        for (_, check) in check_generator:
+            try:
+                with warnings.catch_warnings():
+                    warnings.simplefilter("ignore")
+                    # To ensure an equation file is written for each output in
+                    # nout, set stop condition to niterations=1
+                    if check.func.__name__ == "check_regressor_multioutput":
+                        model.set_params(niterations=1, max_evals=None)
+                    else:
+                        model.set_params(max_evals=10000)
+                    check(model)
+                print("Passed", check.func.__name__)
+            except Exception as e:
+                error_message = str(e)
+                exception_messages.append(f"{check.func.__name__}: {error_message}\n")
+                print("Failed", check.func.__name__, "with:")
+                # Add a leading tab to error message, which
+                # might be multi-line:
+                print("\n".join([(" " * 4) + row for row in error_message.split("\n")]))
+        # If any checks failed don't let the test pass.
+        self.assertEqual([], exception_messages)

test/test_jax.py CHANGED Viewed

@@ -4,8 +4,8 @@ from pysr import sympy2jax, PySRRegressor
 import pandas as pd
 from jax import numpy as jnp
 from jax import random
-from jax import grad
 import sympy
 class TestJAX(unittest.TestCase):
@@ -21,8 +21,16 @@ class TestJAX(unittest.TestCase):
         f, params = sympy2jax(cosx, [x, y, z])
         self.assertTrue(jnp.all(jnp.isclose(f(X, params), true)).item())
-    def test_pipeline(self):
-        X = np.random.randn(100, 10)
         equations = pd.DataFrame(
             {
                 "Equation": ["1.0", "cos(x1)", "square(cos(x1))"],
@@ -35,16 +43,34 @@ class TestJAX(unittest.TestCase):
             "equation_file.csv.bkup", sep="|"
         )
-        model = PySRRegressor(
-            equation_file="equation_file.csv",
-            output_jax_format=True,
-            variable_names="x1 x2 x3".split(" "),
         )
-        model.selection = [1, 2, 3]
-        model.n_features = 3
-        model.using_pandas = False
-        model.refresh()
         jformat = model.jax()
         np.testing.assert_almost_equal(
@@ -52,3 +78,24 @@ class TestJAX(unittest.TestCase):
             np.square(np.cos(X[:, 1])),  # Select feature 1
             decimal=4,
         )

 import pandas as pd
 from jax import numpy as jnp
 from jax import random
 import sympy
+from functools import partial
 class TestJAX(unittest.TestCase):
         f, params = sympy2jax(cosx, [x, y, z])
         self.assertTrue(jnp.all(jnp.isclose(f(X, params), true)).item())
+    def test_pipeline_pandas(self):
+        X = pd.DataFrame(np.random.randn(100, 10))
+        y = np.ones(X.shape[0])
+        model = PySRRegressor(
+            progress=False,
+            max_evals=10000,
+            output_jax_format=True,
+        )
+        model.fit(X, y)
         equations = pd.DataFrame(
             {
                 "Equation": ["1.0", "cos(x1)", "square(cos(x1))"],
             "equation_file.csv.bkup", sep="|"
         )
+        model.refresh(checkpoint_file="equation_file.csv")
+        jformat = model.jax()
+        np.testing.assert_almost_equal(
+            np.array(jformat["callable"](jnp.array(X), jformat["parameters"])),
+            np.square(np.cos(X.values[:, 1])),  # Select feature 1
+            decimal=4,
         )
+    def test_pipeline(self):
+        X = np.random.randn(100, 10)
+        y = np.ones(X.shape[0])
+        model = PySRRegressor(progress=False, max_evals=10000, output_jax_format=True)
+        model.fit(X, y)
+        equations = pd.DataFrame(
+            {
+                "Equation": ["1.0", "cos(x1)", "square(cos(x1))"],
+                "MSE": [1.0, 0.1, 1e-5],
+                "Complexity": [1, 2, 3],
+            }
+        )
+        equations["Complexity MSE Equation".split(" ")].to_csv(
+            "equation_file.csv.bkup", sep="|"
+        )
+        model.refresh(checkpoint_file="equation_file.csv")
         jformat = model.jax()
         np.testing.assert_almost_equal(
             np.square(np.cos(X[:, 1])),  # Select feature 1
             decimal=4,
         )
+    def test_feature_selection(self):
+        X = pd.DataFrame({f"k{i}": np.random.randn(1000) for i in range(10, 21)})
+        y = X["k15"] ** 2 + np.cos(X["k20"])
+        model = PySRRegressor(
+            progress=False,
+            unary_operators=["cos"],
+            select_k_features=3,
+            early_stop_condition=1e-5,
+        )
+        model.fit(X.values, y.values)
+        f, parameters = model.jax().values()
+        np_prediction = model.predict
+        jax_prediction = partial(f, parameters=parameters)
+        np_output = np_prediction(X.values)
+        jax_output = jax_prediction(X.values)
+        np.testing.assert_almost_equal(np_output, jax_output, decimal=4)

test/test_torch.py CHANGED Viewed

@@ -2,7 +2,20 @@ import unittest
 import numpy as np
 import pandas as pd
 from pysr import sympy2torch, PySRRegressor
-import torch
 import sympy
@@ -13,6 +26,7 @@ class TestTorch(unittest.TestCase):
     def test_sympy2torch(self):
         x, y, z = sympy.symbols("x y z")
         cosx = 1.0 * sympy.cos(x) + y
         X = torch.tensor(np.random.randn(1000, 3))
         true = 1.0 * torch.cos(X[:, 0]) + X[:, 1]
         torch_module = sympy2torch(cosx, [x, y, z])
@@ -20,8 +34,18 @@ class TestTorch(unittest.TestCase):
             np.all(np.isclose(torch_module(X).detach().numpy(), true.detach().numpy()))
         )
-    def test_pipeline(self):
-        X = np.random.randn(100, 10)
         equations = pd.DataFrame(
             {
                 "Equation": ["1.0", "cos(x1)", "square(cos(x1))"],
@@ -34,23 +58,47 @@ class TestTorch(unittest.TestCase):
             "equation_file.csv.bkup", sep="|"
         )
         model = PySRRegressor(
             model_selection="accuracy",
-            equation_file="equation_file.csv",
-            variable_names="x1 x2 x3".split(" "),
-            extra_sympy_mappings={},
             output_torch_format=True,
         )
-        model.selection = [1, 2, 3]
-        model.n_features = 2  # TODO: Why is this 2 and not 3?
-        model.using_pandas = False
-        model.refresh()
         tformat = model.pytorch()
         self.assertEqual(str(tformat), "_SingleSymPyModule(expression=cos(x1)**2)")
         np.testing.assert_almost_equal(
             tformat(torch.tensor(X)).detach().numpy(),
-            np.square(np.cos(X[:, 1])),  # Selection 1st feature
             decimal=4,
         )
@@ -73,6 +121,14 @@ class TestTorch(unittest.TestCase):
     def test_custom_operator(self):
         X = np.random.randn(100, 3)
         equations = pd.DataFrame(
             {
@@ -86,18 +142,12 @@ class TestTorch(unittest.TestCase):
             "equation_file_custom_operator.csv.bkup", sep="|"
         )
-        model = PySRRegressor(
-            model_selection="accuracy",
             equation_file="equation_file_custom_operator.csv",
-            variable_names="x1 x2 x3".split(" "),
             extra_sympy_mappings={"mycustomoperator": sympy.sin},
             extra_torch_mappings={"mycustomoperator": torch.sin},
-            output_torch_format=True,
         )
-        model.selection = [0, 1, 2]
-        model.n_features = 3
-        model.using_pandas = False
-        model.refresh()
         self.assertEqual(str(model.sympy()), "sin(x1)")
         # Will automatically use the set global state from get_hof.
@@ -105,6 +155,25 @@ class TestTorch(unittest.TestCase):
         self.assertEqual(str(tformat), "_SingleSymPyModule(expression=sin(x1))")
         np.testing.assert_almost_equal(
             tformat(torch.tensor(X)).detach().numpy(),
-            np.sin(X[:, 0]),  # Selection 1st feature
             decimal=4,
         )

 import numpy as np
 import pandas as pd
 from pysr import sympy2torch, PySRRegressor
+# Need to initialize Julia before importing torch...
+import platform
+if platform.system() == "Darwin":
+    # Import PyJulia, then Torch
+    from pysr.julia_helpers import init_julia
+    Main = init_julia()
+    import torch
+else:
+    # Import Torch, then PyJulia
+    # https://github.com/pytorch/pytorch/issues/78829
+    import torch
 import sympy
     def test_sympy2torch(self):
         x, y, z = sympy.symbols("x y z")
         cosx = 1.0 * sympy.cos(x) + y
         X = torch.tensor(np.random.randn(1000, 3))
         true = 1.0 * torch.cos(X[:, 0]) + X[:, 1]
         torch_module = sympy2torch(cosx, [x, y, z])
             np.all(np.isclose(torch_module(X).detach().numpy(), true.detach().numpy()))
         )
+    def test_pipeline_pandas(self):
+        X = pd.DataFrame(np.random.randn(100, 10))
+        y = np.ones(X.shape[0])
+        model = PySRRegressor(
+            progress=False,
+            max_evals=10000,
+            model_selection="accuracy",
+            extra_sympy_mappings={},
+            output_torch_format=True,
+        )
+        model.fit(X, y)
         equations = pd.DataFrame(
             {
                 "Equation": ["1.0", "cos(x1)", "square(cos(x1))"],
             "equation_file.csv.bkup", sep="|"
         )
+        model.refresh(checkpoint_file="equation_file.csv")
+        tformat = model.pytorch()
+        self.assertEqual(str(tformat), "_SingleSymPyModule(expression=cos(x1)**2)")
+        np.testing.assert_almost_equal(
+            tformat(torch.tensor(X.values)).detach().numpy(),
+            np.square(np.cos(X.values[:, 1])),  # Selection 1st feature
+            decimal=4,
+        )
+    def test_pipeline(self):
+        X = np.random.randn(100, 10)
+        y = np.ones(X.shape[0])
         model = PySRRegressor(
+            progress=False,
+            max_evals=10000,
             model_selection="accuracy",
             output_torch_format=True,
         )
+        model.fit(X, y)
+        equations = pd.DataFrame(
+            {
+                "Equation": ["1.0", "cos(x1)", "square(cos(x1))"],
+                "MSE": [1.0, 0.1, 1e-5],
+                "Complexity": [1, 2, 3],
+            }
+        )
+        equations["Complexity MSE Equation".split(" ")].to_csv(
+            "equation_file.csv.bkup", sep="|"
+        )
+        model.refresh(checkpoint_file="equation_file.csv")
         tformat = model.pytorch()
         self.assertEqual(str(tformat), "_SingleSymPyModule(expression=cos(x1)**2)")
         np.testing.assert_almost_equal(
             tformat(torch.tensor(X)).detach().numpy(),
+            np.square(np.cos(X[:, 1])),  # 2nd feature
             decimal=4,
         )
     def test_custom_operator(self):
         X = np.random.randn(100, 3)
+        y = np.ones(X.shape[0])
+        model = PySRRegressor(
+            progress=False,
+            max_evals=10000,
+            model_selection="accuracy",
+            output_torch_format=True,
+        )
+        model.fit(X, y)
         equations = pd.DataFrame(
             {
             "equation_file_custom_operator.csv.bkup", sep="|"
         )
+        model.set_params(
             equation_file="equation_file_custom_operator.csv",
             extra_sympy_mappings={"mycustomoperator": sympy.sin},
             extra_torch_mappings={"mycustomoperator": torch.sin},
         )
+        model.refresh(checkpoint_file="equation_file_custom_operator.csv")
         self.assertEqual(str(model.sympy()), "sin(x1)")
         # Will automatically use the set global state from get_hof.
         self.assertEqual(str(tformat), "_SingleSymPyModule(expression=sin(x1))")
         np.testing.assert_almost_equal(
             tformat(torch.tensor(X)).detach().numpy(),
+            np.sin(X[:, 1]),
             decimal=4,
         )
+    def test_feature_selection(self):
+        X = pd.DataFrame({f"k{i}": np.random.randn(1000) for i in range(10, 21)})
+        y = X["k15"] ** 2 + np.cos(X["k20"])
+        model = PySRRegressor(
+            progress=False,
+            unary_operators=["cos"],
+            select_k_features=3,
+            early_stop_condition=1e-5,
+        )
+        model.fit(X.values, y.values)
+        torch_module = model.pytorch()
+        np_output = model.predict(X.values)
+        torch_output = torch_module(torch.tensor(X.values)).detach().numpy()
+        np.testing.assert_almost_equal(np_output, torch_output, decimal=4)