MilesCranmer committed on
Commit
9351408
·
1 Parent(s): c41cf33

Change "best" model_selection to apply loss threshold

Browse files
Files changed (1) hide show
  1. pysr/sr.py +40 -23
pysr/sr.py CHANGED
@@ -205,10 +205,16 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
205
  Parameters
206
  ----------
207
  model_selection : str, default="best"
208
- Model selection criterion. Can be 'accuracy' or 'best'.
209
- `"accuracy"` selects the candidate model with the lowest loss
210
- (highest accuracy). `"best"` selects the candidate model with
211
- the lowest sum of normalized loss and complexity.
 
 
 
 
 
 
212
 
213
  binary_operators : list[str], default=["+", "-", "*", "/"]
214
  List of strings giving the binary operators in Julia's Base.
@@ -469,7 +475,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
469
  Whether to use a progress bar instead of printing to stdout.
470
 
471
  equation_file : str, default=None
472
- Where to save the files (with `.csv` extension).
473
 
474
  temp_equation_file : bool, default=False
475
  Whether to put the hall of fame file in the temp directory.
@@ -943,12 +949,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
943
 
944
  for i, equations in enumerate(all_equations):
945
  selected = ["" for _ in range(len(equations))]
946
- if self.model_selection == "accuracy":
947
- chosen_row = -1
948
- elif self.model_selection == "best":
949
- chosen_row = equations["score"].idxmax()
950
- else:
951
- raise NotImplementedError
952
  selected[chosen_row] = ">>>>"
953
  repr_equations = pd.DataFrame(
954
  dict(
@@ -1091,18 +1092,14 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1091
  return [eq.iloc[i] for eq, i in zip(self.equations_, index)]
1092
  return self.equations_.iloc[index]
1093
 
1094
- if self.model_selection == "accuracy":
1095
- if isinstance(self.equations_, list):
1096
- return [eq.iloc[-1] for eq in self.equations_]
1097
- return self.equations_.iloc[-1]
1098
- elif self.model_selection == "best":
1099
- if isinstance(self.equations_, list):
1100
- return [eq.iloc[eq["score"].idxmax()] for eq in self.equations_]
1101
- return self.equations_.iloc[self.equations_["score"].idxmax()]
1102
- else:
1103
- raise NotImplementedError(
1104
- f"{self.model_selection} is not a valid model selection strategy."
1105
- )
1106
 
1107
  def _setup_equation_file(self):
1108
  """
@@ -2149,6 +2146,26 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
2149
  return ret_outputs[0]
2150
 
2151
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2152
  def _denoise(X, y, Xresampled=None, random_state=None):
2153
  """Denoise the dataset using a Gaussian process"""
2154
  from sklearn.gaussian_process import GaussianProcessRegressor
 
205
  Parameters
206
  ----------
207
  model_selection : str, default="best"
208
+ Model selection criterion. Can be 'accuracy', 'best', or 'score'.
209
+ - `"accuracy"` selects the candidate model with the lowest loss
210
+ (highest accuracy).
211
+ - `"score"` selects the candidate model with the highest score.
212
+ Score is defined as the derivative of the log-loss with
213
+ respect to complexity - if an expression has a much better
214
+ loss at a slightly higher complexity, it is preferred.
215
+ - `"best"` selects the candidate model with the highest score
216
+ among expressions whose loss is within a factor of 1.5 of the
217
+ most accurate model.
218
 
219
  binary_operators : list[str], default=["+", "-", "*", "/"]
220
  List of strings giving the binary operators in Julia's Base.
 
475
  Whether to use a progress bar instead of printing to stdout.
476
 
477
  equation_file : str, default=None
478
+ Where to save the files (.csv extension).
479
 
480
  temp_equation_file : bool, default=False
481
  Whether to put the hall of fame file in the temp directory.
 
949
 
950
  for i, equations in enumerate(all_equations):
951
  selected = ["" for _ in range(len(equations))]
952
+ chosen_row = idx_model_selection(equations, self.model_selection)
 
 
 
 
 
953
  selected[chosen_row] = ">>>>"
954
  repr_equations = pd.DataFrame(
955
  dict(
 
1092
  return [eq.iloc[i] for eq, i in zip(self.equations_, index)]
1093
  return self.equations_.iloc[index]
1094
 
1095
+ if isinstance(self.equations_, list):
1096
+ return [
1097
+ eq.iloc[idx_model_selection(eq, self.model_selection)]
1098
+ for eq in self.equations_
1099
+ ]
1100
+ return self.equations_.iloc[
1101
+ idx_model_selection(self.equations_, self.model_selection)
1102
+ ]
 
 
 
 
1103
 
1104
  def _setup_equation_file(self):
1105
  """
 
2146
  return ret_outputs[0]
2147
 
2148
 
2149
def idx_model_selection(equations: pd.DataFrame, model_selection: str) -> int:
    """
    Return the index of the selected expression, given a dataframe of
    equations and a model selection.

    Parameters
    ----------
    equations : pd.DataFrame
        Candidate expressions. Must contain a `"loss"` column, and a
        `"score"` column when `model_selection` is `"best"` or `"score"`.
    model_selection : str
        Selection strategy:

        - `"accuracy"`: the row with the lowest loss.
        - `"score"`: the row with the highest score.
        - `"best"`: the row with the highest score among rows whose loss
          is at most 1.5x the lowest loss.

    Returns
    -------
    int
        Index label (as returned by `idxmin`/`idxmax`) of the chosen row.
        NOTE(review): callers appear to use this positionally, which
        presumably relies on a default `RangeIndex` - confirm.

    Raises
    ------
    NotImplementedError
        If `model_selection` is not one of the strategies above.
    """
    if model_selection == "accuracy":
        return equations["loss"].idxmin()

    if model_selection == "score":
        return equations["score"].idxmax()

    if model_selection == "best":
        losses = equations["loss"]
        lowest_loss = losses.min()
        # A boolean mask is used instead of a formatted `query` string so
        # that thresholds such as `inf` are compared numerically rather
        # than parsed as identifiers.
        # `<=` together with the explicit `== lowest_loss` term guarantees
        # that the most accurate expression is always a candidate; with a
        # strict `<`, a lowest loss of zero (or a negative loss) would leave
        # no candidates and `idxmax` would fail on an empty selection.
        is_candidate = (losses <= 1.5 * lowest_loss) | (losses == lowest_loss)
        return equations.loc[is_candidate, "score"].idxmax()

    raise NotImplementedError(
        f"{model_selection} is not a valid model selection strategy."
    )
2167
+
2168
+
2169
  def _denoise(X, y, Xresampled=None, random_state=None):
2170
  """Denoise the dataset using a Gaussian process"""
2171
  from sklearn.gaussian_process import GaussianProcessRegressor