Spaces:
Sleeping
Sleeping
MilesCranmer
committed on
Merge pull request #177 from MilesCranmer/improved-model-selection
Browse files
Change "best" model_selection to include a loss threshold
- pysr/sr.py +42 -23
- test/test.py +15 -0
pysr/sr.py
CHANGED
@@ -205,10 +205,18 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
205 |
Parameters
|
206 |
----------
|
207 |
model_selection : str, default="best"
|
208 |
-
Model selection criterion
|
209 |
-
|
210 |
-
|
211 |
-
the
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
212 |
|
213 |
binary_operators : list[str], default=["+", "-", "*", "/"]
|
214 |
List of strings giving the binary operators in Julia's Base.
|
@@ -469,7 +477,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
469 |
Whether to use a progress bar instead of printing to stdout.
|
470 |
|
471 |
equation_file : str, default=None
|
472 |
-
Where to save the files (
|
473 |
|
474 |
temp_equation_file : bool, default=False
|
475 |
Whether to put the hall of fame file in the temp directory.
|
@@ -943,12 +951,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
943 |
|
944 |
for i, equations in enumerate(all_equations):
|
945 |
selected = ["" for _ in range(len(equations))]
|
946 |
-
|
947 |
-
chosen_row = -1
|
948 |
-
elif self.model_selection == "best":
|
949 |
-
chosen_row = equations["score"].idxmax()
|
950 |
-
else:
|
951 |
-
raise NotImplementedError
|
952 |
selected[chosen_row] = ">>>>"
|
953 |
repr_equations = pd.DataFrame(
|
954 |
dict(
|
@@ -1091,18 +1094,14 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1091 |
return [eq.iloc[i] for eq, i in zip(self.equations_, index)]
|
1092 |
return self.equations_.iloc[index]
|
1093 |
|
1094 |
-
if self.
|
1095 |
-
|
1096 |
-
|
1097 |
-
|
1098 |
-
|
1099 |
-
|
1100 |
-
|
1101 |
-
|
1102 |
-
else:
|
1103 |
-
raise NotImplementedError(
|
1104 |
-
f"{self.model_selection} is not a valid model selection strategy."
|
1105 |
-
)
|
1106 |
|
1107 |
def _setup_equation_file(self):
|
1108 |
"""
|
@@ -2149,6 +2148,26 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
2149 |
return ret_outputs[0]
|
2150 |
|
2151 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2152 |
def _denoise(X, y, Xresampled=None, random_state=None):
|
2153 |
"""Denoise the dataset using a Gaussian process"""
|
2154 |
from sklearn.gaussian_process import GaussianProcessRegressor
|
|
|
205 |
Parameters
|
206 |
----------
|
207 |
model_selection : str, default="best"
|
208 |
+
Model selection criterion when selecting a final expression from
|
209 |
+
the list of best expression at each complexity.
|
210 |
+
Can be 'accuracy', 'best', or 'score'.
|
211 |
+
- `"accuracy"` selects the candidate model with the lowest loss
|
212 |
+
(highest accuracy).
|
213 |
+
- `"score"` selects the candidate model with the highest score.
|
214 |
+
Score is defined as the negated derivative of the log-loss with
|
215 |
+
respect to complexity - if an expression has a much better
|
216 |
+
loss at a slightly higher complexity, it is preferred.
|
217 |
+
- `"best"` selects the candidate model with the highest score
|
218 |
+
among expressions with a loss better than at least 1.5x the
|
219 |
+
most accurate model.
|
220 |
|
221 |
binary_operators : list[str], default=["+", "-", "*", "/"]
|
222 |
List of strings giving the binary operators in Julia's Base.
|
|
|
477 |
Whether to use a progress bar instead of printing to stdout.
|
478 |
|
479 |
equation_file : str, default=None
|
480 |
+
Where to save the files (.csv extension).
|
481 |
|
482 |
temp_equation_file : bool, default=False
|
483 |
Whether to put the hall of fame file in the temp directory.
|
|
|
951 |
|
952 |
for i, equations in enumerate(all_equations):
|
953 |
selected = ["" for _ in range(len(equations))]
|
954 |
+
chosen_row = idx_model_selection(equations, self.model_selection)
|
|
|
|
|
|
|
|
|
|
|
955 |
selected[chosen_row] = ">>>>"
|
956 |
repr_equations = pd.DataFrame(
|
957 |
dict(
|
|
|
1094 |
return [eq.iloc[i] for eq, i in zip(self.equations_, index)]
|
1095 |
return self.equations_.iloc[index]
|
1096 |
|
1097 |
+
if isinstance(self.equations_, list):
|
1098 |
+
return [
|
1099 |
+
eq.iloc[idx_model_selection(eq, self.model_selection)]
|
1100 |
+
for eq in self.equations_
|
1101 |
+
]
|
1102 |
+
return self.equations_.iloc[
|
1103 |
+
idx_model_selection(self.equations_, self.model_selection)
|
1104 |
+
]
|
|
|
|
|
|
|
|
|
1105 |
|
1106 |
def _setup_equation_file(self):
|
1107 |
"""
|
|
|
2148 |
return ret_outputs[0]
|
2149 |
|
2150 |
|
2151 |
+
def idx_model_selection(equations: pd.DataFrame, model_selection: str) -> int:
|
2152 |
+
"""
|
2153 |
+
Return the index of the selected expression, given a dataframe of
|
2154 |
+
equations and a model selection.
|
2155 |
+
"""
|
2156 |
+
if model_selection == "accuracy":
|
2157 |
+
chosen_idx = equations["loss"].idxmin()
|
2158 |
+
elif model_selection == "best":
|
2159 |
+
threshold = 1.5 * equations["loss"].min()
|
2160 |
+
filtered_equations = equations.query(f"loss <= {threshold}")
|
2161 |
+
chosen_idx = filtered_equations["score"].idxmax()
|
2162 |
+
elif model_selection == "score":
|
2163 |
+
chosen_idx = equations["score"].idxmax()
|
2164 |
+
else:
|
2165 |
+
raise NotImplementedError(
|
2166 |
+
f"{model_selection} is not a valid model selection strategy."
|
2167 |
+
)
|
2168 |
+
return chosen_idx
|
2169 |
+
|
2170 |
+
|
2171 |
def _denoise(X, y, Xresampled=None, random_state=None):
|
2172 |
"""Denoise the dataset using a Gaussian process"""
|
2173 |
from sklearn.gaussian_process import GaussianProcessRegressor
|
test/test.py
CHANGED
@@ -9,6 +9,7 @@ from pysr.sr import (
|
|
9 |
run_feature_selection,
|
10 |
_handle_feature_selection,
|
11 |
_csv_filename_to_pkl_filename,
|
|
|
12 |
)
|
13 |
from sklearn.utils.estimator_checks import check_estimator
|
14 |
import sympy
|
@@ -403,6 +404,20 @@ class TestBest(unittest.TestCase):
|
|
403 |
for f in [self.model.predict, self.equations_.iloc[-1]["lambda_format"]]:
|
404 |
np.testing.assert_almost_equal(f(X), y, decimal=3)
|
405 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
406 |
|
407 |
class TestFeatureSelection(unittest.TestCase):
|
408 |
def setUp(self):
|
|
|
9 |
run_feature_selection,
|
10 |
_handle_feature_selection,
|
11 |
_csv_filename_to_pkl_filename,
|
12 |
+
idx_model_selection,
|
13 |
)
|
14 |
from sklearn.utils.estimator_checks import check_estimator
|
15 |
import sympy
|
|
|
404 |
for f in [self.model.predict, self.equations_.iloc[-1]["lambda_format"]]:
|
405 |
np.testing.assert_almost_equal(f(X), y, decimal=3)
|
406 |
|
407 |
+
def test_all_selection_strategies(self):
|
408 |
+
equations = pd.DataFrame(
|
409 |
+
dict(
|
410 |
+
loss=[1.0, 0.1, 0.01, 0.001 * 1.4, 0.001],
|
411 |
+
score=[0.5, 1.0, 0.5, 0.5, 0.3],
|
412 |
+
)
|
413 |
+
)
|
414 |
+
idx_accuracy = idx_model_selection(equations, "accuracy")
|
415 |
+
self.assertEqual(idx_accuracy, 4)
|
416 |
+
idx_best = idx_model_selection(equations, "best")
|
417 |
+
self.assertEqual(idx_best, 3)
|
418 |
+
idx_score = idx_model_selection(equations, "score")
|
419 |
+
self.assertEqual(idx_score, 1)
|
420 |
+
|
421 |
|
422 |
class TestFeatureSelection(unittest.TestCase):
|
423 |
def setUp(self):
|