Spaces:

MilesCranmer
/

PySR

Running

App Files Files Community

MilesCranmer committed on Jan 28, 2022

Commit

af14165

1 Parent(s): a47d265

Update parts of test to use ScikitLearn interface

Browse files

Files changed (3) hide show

pysr/__init__.py +1 -2
pysr/sr.py +48 -19
test/test.py +33 -35

pysr/__init__.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from .sr import (
     pysr,
-    get_hof,
     best,
     best_tex,
     best_callable,
@@ -11,4 +11,3 @@ from .sr import (
 from .feynman_problems import Problem, FeynmanProblem
 from .export_jax import sympy2jax
 from .export_torch import sympy2torch
-from .sklearn import PySRRegressor

 from .sr import (
     pysr,
+    PySRRegressor,
     best,
     best_tex,
     best_callable,
 from .feynman_problems import Problem, FeynmanProblem
 from .export_jax import sympy2jax
 from .export_torch import sympy2torch

pysr/sr.py CHANGED Viewed

@@ -179,24 +179,35 @@ def run_feature_selection(X, y, select_k_features):
     return selector.get_support(indices=True)
 def _escape_filename(filename):
     """Turns a file into a string representation with correctly escaped backslashes"""
     str_repr = str(filename)
     str_repr = str_repr.replace("\\", "\\\\")
     return str_repr
 def best(*args, **kwargs):
-    raise NotImplementedError("`best` has been deprecated. Please use the `PySRRegressor` interface. After fitting, you can return `.sympy()` to get the sympy representation of the best equation.")
 def best_row(*args, **kwargs):
-    raise NotImplementedError("`best_row` has been deprecated. Please use the `PySRRegressor` interface. After fitting, you can run `print(model)` to view the best equation.")
 def best_tex(*args, **kwargs):
-    raise NotImplementedError("`best_tex` has been deprecated. Please use the `PySRRegressor` interface. After fitting, you can return `.latex()` to get the sympy representation of the best equation.")
 def best_callable(*args, **kwargs):
-    raise NotImplementedError("`best_callable` has been deprecated. Please use the `PySRRegressor` interface. After fitting, you can use `.predict(X)` to use the best callable.")
 def _denoise(X, y, Xresampled=None):
@@ -647,7 +658,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
             "nout",
             "selection",
             "variable_names",
-            "julia_project"
         ]
     def __repr__(self):
@@ -668,9 +679,9 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
             dict(
                 pick=selected,
                 score=equations["score"],
-                Equation=equations["Equation"],
-                MSE=equations["MSE"],
-                Complexity=equations["Complexity"],
             )
         )
         output += repr_equations.__repr__()
@@ -1036,15 +1047,33 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         try:
             if self.multioutput:
-                all_outputs = [
-                    pd.read_csv(
                         str(self.equation_file) + f".out{i}" + ".bkup",
                         sep="|",
                     )
-                    for i in range(1, self.nout + 1)
-                ]
             else:
                 all_outputs = [pd.read_csv(str(self.equation_file) + ".bkup", sep="|")]
         except FileNotFoundError:
             raise RuntimeError(
                 "Couldn't find equation file! The equation search likely exited before a single iteration completed."
@@ -1079,7 +1108,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
                 ]
             for _, eqn_row in output.iterrows():
-                eqn = sympify(eqn_row["Equation"], locals=local_sympy_mappings)
                 sympy_format.append(eqn)
                 # Numpy:
@@ -1113,8 +1142,8 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
                     )
                     torch_format.append(module)
-                curMSE = eqn_row["MSE"]
-                curComplexity = eqn_row["Complexity"]
                 if lastMSE is None:
                     cur_score = 0.0
@@ -1134,10 +1163,10 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
             output["sympy_format"] = sympy_format
             output["lambda_format"] = lambda_format
             output_cols = [
-                "Complexity",
-                "MSE",
                 "score",
-                "Equation",
                 "sympy_format",
                 "lambda_format",
             ]

     return selector.get_support(indices=True)
 def _escape_filename(filename):
     """Turns a file into a string representation with correctly escaped backslashes"""
     str_repr = str(filename)
     str_repr = str_repr.replace("\\", "\\\\")
     return str_repr
 def best(*args, **kwargs):
+    raise NotImplementedError(
+        "`best` has been deprecated. Please use the `PySRRegressor` interface. After fitting, you can return `.sympy()` to get the sympy representation of the best equation."
+    )
 def best_row(*args, **kwargs):
+    raise NotImplementedError(
+        "`best_row` has been deprecated. Please use the `PySRRegressor` interface. After fitting, you can run `print(model)` to view the best equation."
+    )
 def best_tex(*args, **kwargs):
+    raise NotImplementedError(
+        "`best_tex` has been deprecated. Please use the `PySRRegressor` interface. After fitting, you can return `.latex()` to get the sympy representation of the best equation."
+    )
 def best_callable(*args, **kwargs):
+    raise NotImplementedError(
+        "`best_callable` has been deprecated. Please use the `PySRRegressor` interface. After fitting, you can use `.predict(X)` to use the best callable."
+    )
 def _denoise(X, y, Xresampled=None):
             "nout",
             "selection",
             "variable_names",
+            "julia_project",
         ]
     def __repr__(self):
             dict(
                 pick=selected,
                 score=equations["score"],
+                equation=equations["equation"],
+                loss=equations["loss"],
+                complexity=equations["complexity"],
             )
         )
         output += repr_equations.__repr__()
         try:
             if self.multioutput:
+                all_outputs = []
+                for i in range(1, self.nout + 1):
+                    df = pd.read_csv(
                         str(self.equation_file) + f".out{i}" + ".bkup",
                         sep="|",
                     )
+                    # Rename the Complexity, MSE, and Equation columns to complexity, loss, and equation:
+                    df.rename(
+                        columns={
+                            "Complexity": "complexity",
+                            "MSE": "loss",
+                            "Equation": "equation",
+                        },
+                        inplace=True,
+                    )
+                    all_outputs.append(df)
             else:
                 all_outputs = [pd.read_csv(str(self.equation_file) + ".bkup", sep="|")]
+                all_outputs[-1].rename(
+                    columns={
+                        "Complexity": "complexity",
+                        "MSE": "loss",
+                        "Equation": "equation",
+                    },
+                    inplace=True,
+                )
         except FileNotFoundError:
             raise RuntimeError(
                 "Couldn't find equation file! The equation search likely exited before a single iteration completed."
                 ]
             for _, eqn_row in output.iterrows():
+                eqn = sympify(eqn_row["equation"], locals=local_sympy_mappings)
                 sympy_format.append(eqn)
                 # Numpy:
                     )
                     torch_format.append(module)
+                curMSE = eqn_row["loss"]
+                curComplexity = eqn_row["complexity"]
                 if lastMSE is None:
                     cur_score = 0.0
             output["sympy_format"] = sympy_format
             output["lambda_format"] = lambda_format
             output_cols = [
+                "complexity",
+                "loss",
                 "score",
+                "equation",
                 "sympy_format",
                 "lambda_format",
             ]

test/test.py CHANGED Viewed

@@ -1,8 +1,8 @@
 import unittest
 from unittest.mock import patch
 import numpy as np
-from pysr import pysr, get_hof, best, best_tex, best_callable, best_row, PySRRegressor
-from pysr.sr import run_feature_selection, _handle_feature_selection, _yesno
 import sympy
 from sympy import lambdify
 import pandas as pd
@@ -21,32 +21,33 @@ class TestPipeline(unittest.TestCase):
     def test_linear_relation(self):
         y = self.X[:, 0]
-        equations = pysr(self.X, y, **self.default_test_kwargs)
-        print(equations)
-        self.assertLessEqual(equations.iloc[-1]["MSE"], 1e-4)
     def test_multiprocessing(self):
         y = self.X[:, 0]
-        equations = pysr(
-            self.X, y, **self.default_test_kwargs, procs=2, multithreading=False
-        )
-        print(equations)
-        self.assertLessEqual(equations.iloc[-1]["MSE"], 1e-4)
     def test_multioutput_custom_operator(self):
         y = self.X[:, [0, 1]] ** 2
-        equations = pysr(
-            self.X,
-            y,
             unary_operators=["sq(x) = x^2"],
-            binary_operators=["plus"],
             extra_sympy_mappings={"sq": lambda x: x ** 2},
             **self.default_test_kwargs,
             procs=0,
         )
         print(equations)
-        self.assertLessEqual(equations[0].iloc[-1]["MSE"], 1e-4)
-        self.assertLessEqual(equations[1].iloc[-1]["MSE"], 1e-4)
     def test_multioutput_weighted_with_callable_temp_equation(self):
         y = self.X[:, [0, 1]] ** 2
@@ -58,10 +59,7 @@ class TestPipeline(unittest.TestCase):
         y = (2 - w) * y
         # Thus, pysr needs to use the weights to find the right equation!
-        pysr(
-            self.X,
-            y,
-            weights=w,
             unary_operators=["sq(x) = x^2"],
             binary_operators=["plus"],
             extra_sympy_mappings={"sq": lambda x: x ** 2},
@@ -70,12 +68,13 @@ class TestPipeline(unittest.TestCase):
             temp_equation_file=True,
             delete_tempfiles=False,
         )
         np.testing.assert_almost_equal(
-            best_callable()[0](self.X), self.X[:, 0] ** 2, decimal=4
         )
         np.testing.assert_almost_equal(
-            best_callable()[1](self.X), self.X[:, 1] ** 2, decimal=4
         )
     def test_empty_operators_single_input_sklearn(self):
@@ -108,9 +107,7 @@ class TestPipeline(unittest.TestCase):
         np.random.seed(1)
         y = self.X[:, [0, 1]] ** 2 + np.random.randn(self.X.shape[0], 1) * 0.05
-        equations = pysr(
-            self.X,
-            y,
             # Test that passing a single operator works:
             unary_operators="sq(x) = x^2",
             binary_operators="plus",
@@ -119,8 +116,9 @@ class TestPipeline(unittest.TestCase):
             procs=0,
             denoise=True,
         )
-        self.assertLessEqual(best_row(equations=equations)[0]["MSE"], 1e-2)
-        self.assertLessEqual(best_row(equations=equations)[1]["MSE"], 1e-2)
     def test_pandas_resample(self):
         np.random.seed(1)
@@ -143,9 +141,7 @@ class TestPipeline(unittest.TestCase):
                 "T": np.random.randn(100),
             }
         )
-        equations = pysr(
-            X,
-            y,
             unary_operators=[],
             binary_operators=["+", "*", "/", "-"],
             **self.default_test_kwargs,
@@ -153,11 +149,12 @@ class TestPipeline(unittest.TestCase):
             denoise=True,
             select_k_features=2,
         )
-        self.assertNotIn("unused_feature", best_tex())
-        self.assertIn("T", best_tex())
-        self.assertIn("x", best_tex())
-        self.assertLessEqual(equations.iloc[-1]["MSE"], 1e-2)
-        fn = best_callable()
         self.assertListEqual(list(sorted(fn._selection)), [0, 1])
         X2 = pd.DataFrame(
             {
@@ -167,6 +164,7 @@ class TestPipeline(unittest.TestCase):
             }
         )
         self.assertLess(np.average((fn(X2) - true_fn(X2)) ** 2), 1e-2)
 class TestBest(unittest.TestCase):

 import unittest
 from unittest.mock import patch
 import numpy as np
+from pysr import PySRRegressor
+from pysr.sr import run_feature_selection, _handle_feature_selection
 import sympy
 from sympy import lambdify
 import pandas as pd
     def test_linear_relation(self):
         y = self.X[:, 0]
+        model = PySRRegressor(**self.default_test_kwargs)
+        model.fit(self.X, y)
+        model.set_params(model_selection="accuracy")
+        print(model.equations)
+        self.assertLessEqual(model.get_best()["loss"], 1e-4)
     def test_multiprocessing(self):
         y = self.X[:, 0]
+        model = PySRRegressor(**self.default_test_kwargs, procs=2, multithreading=False)
+        model.fit(self.X, y)
+        print(model.equations)
+        self.assertLessEqual(model.equations.iloc[-1]["loss"], 1e-4)
     def test_multioutput_custom_operator(self):
         y = self.X[:, [0, 1]] ** 2
+        model = PySRRegressor(
             unary_operators=["sq(x) = x^2"],
             extra_sympy_mappings={"sq": lambda x: x ** 2},
+            binary_operators=["plus"],
             **self.default_test_kwargs,
             procs=0,
         )
+        model.fit(self.X, y)
+        equations = model.equations
         print(equations)
+        self.assertLessEqual(equations[0].iloc[-1]["loss"], 1e-4)
+        self.assertLessEqual(equations[1].iloc[-1]["loss"], 1e-4)
     def test_multioutput_weighted_with_callable_temp_equation(self):
         y = self.X[:, [0, 1]] ** 2
         y = (2 - w) * y
         # Thus, pysr needs to use the weights to find the right equation!
+        model = PySRRegressor(
             unary_operators=["sq(x) = x^2"],
             binary_operators=["plus"],
             extra_sympy_mappings={"sq": lambda x: x ** 2},
             temp_equation_file=True,
             delete_tempfiles=False,
         )
+        model.fit(self.X, y, weights=w)
         np.testing.assert_almost_equal(
+            model.predict(self.X)[:, 0], self.X[:, 0] ** 2, decimal=4
         )
         np.testing.assert_almost_equal(
+            model.predict(self.X)[:, 1], self.X[:, 1] ** 2, decimal=4
         )
     def test_empty_operators_single_input_sklearn(self):
         np.random.seed(1)
         y = self.X[:, [0, 1]] ** 2 + np.random.randn(self.X.shape[0], 1) * 0.05
+        model = PySRRegressor(
             # Test that passing a single operator works:
             unary_operators="sq(x) = x^2",
             binary_operators="plus",
             procs=0,
             denoise=True,
         )
+        model.fit(self.X, y)
+        self.assertLessEqual(model.get_best()[1]["loss"], 1e-2)
+        self.assertLessEqual(model.get_best()[1]["loss"], 1e-2)
     def test_pandas_resample(self):
         np.random.seed(1)
                 "T": np.random.randn(100),
             }
         )
+        model = PySRRegressor(
             unary_operators=[],
             binary_operators=["+", "*", "/", "-"],
             **self.default_test_kwargs,
             denoise=True,
             select_k_features=2,
         )
+        model.fit(X, y)
+        self.assertNotIn("unused_feature", model.latex())
+        self.assertIn("T", model.latex())
+        self.assertIn("x", model.latex())
+        self.assertLessEqual(model.get_best()["loss"], 1e-2)
+        fn = model.get_best()['lambda_format']
         self.assertListEqual(list(sorted(fn._selection)), [0, 1])
         X2 = pd.DataFrame(
             {
             }
         )
         self.assertLess(np.average((fn(X2) - true_fn(X2)) ** 2), 1e-2)
+        self.assertLess(np.average((model.predict(X2) - true_fn(X2)) ** 2), 1e-2)
 class TestBest(unittest.TestCase):