MilesCranmer committed on
Commit 35e6ab1
2 parents: a15823e, 73d0a98

Merge pull request #177 from MilesCranmer/improved-model-selection


Change "best" model_selection to include a loss threshold

Files changed (2)
  1. pysr/sr.py +42 -23
  2. test/test.py +15 -0
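
The user-facing effect of this change is in how a fitted model picks the single expression it reports and predicts with. The sketch below is illustrative rather than part of the commit: it relies only on the `model_selection` constructor argument documented in the diff (now accepting "accuracy", "best", or "score") and on the ">>>>" marker the printed model places on the chosen row; the toy data is made up.

import numpy as np
from pysr import PySRRegressor

# Hypothetical data, for illustration only.
X = np.random.randn(100, 2)
y = X[:, 0] ** 2 + np.cos(3 * X[:, 1])

# "best" is the default; after this change it means: highest score among
# expressions whose loss is within 1.5x of the lowest loss found.
model = PySRRegressor(model_selection="best")
model.fit(X, y)

# Printing the model marks the selected row with ">>>>"; predictions use
# that same row when no explicit index is given.
print(model)
print(model.predict(X[:5]))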
pysr/sr.py CHANGED
@@ -205,10 +205,18 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
     Parameters
     ----------
     model_selection : str, default="best"
-        Model selection criterion. Can be 'accuracy' or 'best'.
-        `"accuracy"` selects the candidate model with the lowest loss
-        (highest accuracy). `"best"` selects the candidate model with
-        the lowest sum of normalized loss and complexity.
+        Model selection criterion when selecting a final expression from
+        the list of best expression at each complexity.
+        Can be 'accuracy', 'best', or 'score'.
+        - `"accuracy"` selects the candidate model with the lowest loss
+          (highest accuracy).
+        - `"score"` selects the candidate model with the highest score.
+          Score is defined as the negated derivative of the log-loss with
+          respect to complexity - if an expression has a much better
+          loss at a slightly higher complexity, it is preferred.
+        - `"best"` selects the candidate model with the highest score
+          among expressions with a loss better than at least 1.5x the
+          most accurate model.
 
     binary_operators : list[str], default=["+", "-", "*", "/"]
         List of strings giving the binary operators in Julia's Base.
@@ -469,7 +477,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         Whether to use a progress bar instead of printing to stdout.
 
     equation_file : str, default=None
-        Where to save the files (with `.csv` extension).
+        Where to save the files (.csv extension).
 
     temp_equation_file : bool, default=False
         Whether to put the hall of fame file in the temp directory.
@@ -943,12 +951,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
 
         for i, equations in enumerate(all_equations):
             selected = ["" for _ in range(len(equations))]
-            if self.model_selection == "accuracy":
-                chosen_row = -1
-            elif self.model_selection == "best":
-                chosen_row = equations["score"].idxmax()
-            else:
-                raise NotImplementedError
+            chosen_row = idx_model_selection(equations, self.model_selection)
             selected[chosen_row] = ">>>>"
             repr_equations = pd.DataFrame(
                 dict(
@@ -1091,18 +1094,14 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
                 return [eq.iloc[i] for eq, i in zip(self.equations_, index)]
             return self.equations_.iloc[index]
 
-        if self.model_selection == "accuracy":
-            if isinstance(self.equations_, list):
-                return [eq.iloc[-1] for eq in self.equations_]
-            return self.equations_.iloc[-1]
-        elif self.model_selection == "best":
-            if isinstance(self.equations_, list):
-                return [eq.iloc[eq["score"].idxmax()] for eq in self.equations_]
-            return self.equations_.iloc[self.equations_["score"].idxmax()]
-        else:
-            raise NotImplementedError(
-                f"{self.model_selection} is not a valid model selection strategy."
-            )
+        if isinstance(self.equations_, list):
+            return [
+                eq.iloc[idx_model_selection(eq, self.model_selection)]
+                for eq in self.equations_
+            ]
+        return self.equations_.iloc[
+            idx_model_selection(self.equations_, self.model_selection)
+        ]
 
     def _setup_equation_file(self):
         """
@@ -2149,6 +2148,26 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         return ret_outputs[0]
 
 
+def idx_model_selection(equations: pd.DataFrame, model_selection: str) -> int:
+    """
+    Return the index of the selected expression, given a dataframe of
+    equations and a model selection.
+    """
+    if model_selection == "accuracy":
+        chosen_idx = equations["loss"].idxmin()
+    elif model_selection == "best":
+        threshold = 1.5 * equations["loss"].min()
+        filtered_equations = equations.query(f"loss <= {threshold}")
+        chosen_idx = filtered_equations["score"].idxmax()
+    elif model_selection == "score":
+        chosen_idx = equations["score"].idxmax()
+    else:
+        raise NotImplementedError(
+            f"{model_selection} is not a valid model selection strategy."
+        )
+    return chosen_idx
+
+
 def _denoise(X, y, Xresampled=None, random_state=None):
     """Denoise the dataset using a Gaussian process"""
     from sklearn.gaussian_process import GaussianProcessRegressor
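
To make the new `"best"` behaviour concrete, here is a small standalone sketch of the selection logic on a toy table of losses and scores (the same values used by the new unit test below). It re-implements the idea inline instead of importing `idx_model_selection`, so the threshold arithmetic is easy to follow; it is an illustration, not the code path itself.

import pandas as pd

equations = pd.DataFrame(
    dict(
        loss=[1.0, 0.1, 0.01, 0.0014, 0.001],
        score=[0.5, 1.0, 0.5, 0.5, 0.3],
    )
)

# "accuracy": lowest loss wins -> row 4
print(equations["loss"].idxmin())

# "score": highest score wins, regardless of loss -> row 1
print(equations["score"].idxmax())

# "best": keep only rows whose loss is within 1.5x of the minimum
# (threshold = 1.5 * 0.001 = 0.0015, keeping rows 3 and 4), then take
# the highest score among them -> row 3
threshold = 1.5 * equations["loss"].min()
candidates = equations.query("loss <= @threshold")
print(candidates["score"].idxmax())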
test/test.py CHANGED
@@ -9,6 +9,7 @@ from pysr.sr import (
     run_feature_selection,
     _handle_feature_selection,
     _csv_filename_to_pkl_filename,
+    idx_model_selection,
 )
 from sklearn.utils.estimator_checks import check_estimator
 import sympy
@@ -403,6 +404,20 @@ class TestBest(unittest.TestCase):
         for f in [self.model.predict, self.equations_.iloc[-1]["lambda_format"]]:
             np.testing.assert_almost_equal(f(X), y, decimal=3)
 
+    def test_all_selection_strategies(self):
+        equations = pd.DataFrame(
+            dict(
+                loss=[1.0, 0.1, 0.01, 0.001 * 1.4, 0.001],
+                score=[0.5, 1.0, 0.5, 0.5, 0.3],
+            )
+        )
+        idx_accuracy = idx_model_selection(equations, "accuracy")
+        self.assertEqual(idx_accuracy, 4)
+        idx_best = idx_model_selection(equations, "best")
+        self.assertEqual(idx_best, 3)
+        idx_score = idx_model_selection(equations, "score")
+        self.assertEqual(idx_score, 1)
+
 
 class TestFeatureSelection(unittest.TestCase):
     def setUp(self):
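
The `score` column referenced throughout this change is described in the updated docstring as the negated derivative of the log-loss with respect to complexity. The snippet below is a back-of-the-envelope version of that definition, for intuition only; PySR computes the actual column elsewhere in `sr.py`, and details such as clipping or handling of the first row may differ. The `complexity` and `loss` column names follow the equation table used above, and `toy_scores` is a hypothetical helper.

import numpy as np
import pandas as pd

def toy_scores(equations: pd.DataFrame) -> pd.Series:
    # Finite-difference approximation of -d(log loss)/d(complexity)
    # along the rows of the equation table.
    d_log_loss = np.log(equations["loss"]).diff()
    d_complexity = equations["complexity"].diff()
    return (-d_log_loss / d_complexity).fillna(0.0)  # first row has no predecessor

equations = pd.DataFrame(
    dict(complexity=[1, 3, 5, 9], loss=[1.0, 0.5, 0.4, 0.01])
)
print(toy_scores(equations))
# A large drop in loss for a small increase in complexity gives a high
# score, which is why "score" (and now "best") favour such expressions.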