Spaces:
Sleeping
Sleeping
MilesCranmer
committed on
Merge pull request #177 from MilesCranmer/improved-model-selection
Browse files
Change "best" model_selection to include a loss threshold
- pysr/sr.py +42 -23
- test/test.py +15 -0
pysr/sr.py
CHANGED
@@ -205,10 +205,18 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
205 |
Parameters
|
206 |
----------
|
207 |
model_selection : str, default="best"
|
208 |
-
Model selection criterion
|
209 |
-
|
210 |
-
|
211 |
-
the
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
212 |
|
213 |
binary_operators : list[str], default=["+", "-", "*", "/"]
|
214 |
List of strings giving the binary operators in Julia's Base.
|
@@ -469,7 +477,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
469 |
Whether to use a progress bar instead of printing to stdout.
|
470 |
|
471 |
equation_file : str, default=None
|
472 |
-
Where to save the files (
|
473 |
|
474 |
temp_equation_file : bool, default=False
|
475 |
Whether to put the hall of fame file in the temp directory.
|
@@ -943,12 +951,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
943 |
|
944 |
for i, equations in enumerate(all_equations):
|
945 |
selected = ["" for _ in range(len(equations))]
|
946 |
-
|
947 |
-
chosen_row = -1
|
948 |
-
elif self.model_selection == "best":
|
949 |
-
chosen_row = equations["score"].idxmax()
|
950 |
-
else:
|
951 |
-
raise NotImplementedError
|
952 |
selected[chosen_row] = ">>>>"
|
953 |
repr_equations = pd.DataFrame(
|
954 |
dict(
|
@@ -1091,18 +1094,14 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1091 |
return [eq.iloc[i] for eq, i in zip(self.equations_, index)]
|
1092 |
return self.equations_.iloc[index]
|
1093 |
|
1094 |
-
if self.
|
1095 |
-
|
1096 |
-
|
1097 |
-
|
1098 |
-
|
1099 |
-
|
1100 |
-
|
1101 |
-
|
1102 |
-
else:
|
1103 |
-
raise NotImplementedError(
|
1104 |
-
f"{self.model_selection} is not a valid model selection strategy."
|
1105 |
-
)
|
1106 |
|
1107 |
def _setup_equation_file(self):
|
1108 |
"""
|
@@ -2149,6 +2148,26 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
2149 |
return ret_outputs[0]
|
2150 |
|
2151 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2152 |
def _denoise(X, y, Xresampled=None, random_state=None):
|
2153 |
"""Denoise the dataset using a Gaussian process"""
|
2154 |
from sklearn.gaussian_process import GaussianProcessRegressor
|
|
|
205 |
Parameters
|
206 |
----------
|
207 |
model_selection : str, default="best"
|
208 |
+
Model selection criterion when selecting a final expression from
|
209 |
+
the list of best expression at each complexity.
|
210 |
+
Can be 'accuracy', 'best', or 'score'.
|
211 |
+
- `"accuracy"` selects the candidate model with the lowest loss
|
212 |
+
(highest accuracy).
|
213 |
+
- `"score"` selects the candidate model with the highest score.
|
214 |
+
Score is defined as the negated derivative of the log-loss with
|
215 |
+
respect to complexity - if an expression has a much better
|
216 |
+
loss at a slightly higher complexity, it is preferred.
|
217 |
+
- `"best"` selects the candidate model with the highest score
|
218 |
+
among expressions with a loss better than at least 1.5x the
|
219 |
+
most accurate model.
|
220 |
|
221 |
binary_operators : list[str], default=["+", "-", "*", "/"]
|
222 |
List of strings giving the binary operators in Julia's Base.
|
|
|
477 |
Whether to use a progress bar instead of printing to stdout.
|
478 |
|
479 |
equation_file : str, default=None
|
480 |
+
Where to save the files (.csv extension).
|
481 |
|
482 |
temp_equation_file : bool, default=False
|
483 |
Whether to put the hall of fame file in the temp directory.
|
|
|
951 |
|
952 |
for i, equations in enumerate(all_equations):
|
953 |
selected = ["" for _ in range(len(equations))]
|
954 |
+
chosen_row = idx_model_selection(equations, self.model_selection)
|
|
|
|
|
|
|
|
|
|
|
955 |
selected[chosen_row] = ">>>>"
|
956 |
repr_equations = pd.DataFrame(
|
957 |
dict(
|
|
|
1094 |
return [eq.iloc[i] for eq, i in zip(self.equations_, index)]
|
1095 |
return self.equations_.iloc[index]
|
1096 |
|
1097 |
+
if isinstance(self.equations_, list):
|
1098 |
+
return [
|
1099 |
+
eq.iloc[idx_model_selection(eq, self.model_selection)]
|
1100 |
+
for eq in self.equations_
|
1101 |
+
]
|
1102 |
+
return self.equations_.iloc[
|
1103 |
+
idx_model_selection(self.equations_, self.model_selection)
|
1104 |
+
]
|
|
|
|
|
|
|
|
|
1105 |
|
1106 |
def _setup_equation_file(self):
|
1107 |
"""
|
|
|
2148 |
return ret_outputs[0]
|
2149 |
|
2150 |
|
2151 |
+
def idx_model_selection(equations: pd.DataFrame, model_selection: str) -> int:
|
2152 |
+
"""
|
2153 |
+
Return the index of the selected expression, given a dataframe of
|
2154 |
+
equations and a model selection.
|
2155 |
+
"""
|
2156 |
+
if model_selection == "accuracy":
|
2157 |
+
chosen_idx = equations["loss"].idxmin()
|
2158 |
+
elif model_selection == "best":
|
2159 |
+
threshold = 1.5 * equations["loss"].min()
|
2160 |
+
filtered_equations = equations.query(f"loss <= {threshold}")
|
2161 |
+
chosen_idx = filtered_equations["score"].idxmax()
|
2162 |
+
elif model_selection == "score":
|
2163 |
+
chosen_idx = equations["score"].idxmax()
|
2164 |
+
else:
|
2165 |
+
raise NotImplementedError(
|
2166 |
+
f"{model_selection} is not a valid model selection strategy."
|
2167 |
+
)
|
2168 |
+
return chosen_idx
|
2169 |
+
|
2170 |
+
|
2171 |
def _denoise(X, y, Xresampled=None, random_state=None):
|
2172 |
"""Denoise the dataset using a Gaussian process"""
|
2173 |
from sklearn.gaussian_process import GaussianProcessRegressor
|
test/test.py
CHANGED
@@ -9,6 +9,7 @@ from pysr.sr import (
|
|
9 |
run_feature_selection,
|
10 |
_handle_feature_selection,
|
11 |
_csv_filename_to_pkl_filename,
|
|
|
12 |
)
|
13 |
from sklearn.utils.estimator_checks import check_estimator
|
14 |
import sympy
|
@@ -403,6 +404,20 @@ class TestBest(unittest.TestCase):
|
|
403 |
for f in [self.model.predict, self.equations_.iloc[-1]["lambda_format"]]:
|
404 |
np.testing.assert_almost_equal(f(X), y, decimal=3)
|
405 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
406 |
|
407 |
class TestFeatureSelection(unittest.TestCase):
|
408 |
def setUp(self):
|
|
|
9 |
run_feature_selection,
|
10 |
_handle_feature_selection,
|
11 |
_csv_filename_to_pkl_filename,
|
12 |
+
idx_model_selection,
|
13 |
)
|
14 |
from sklearn.utils.estimator_checks import check_estimator
|
15 |
import sympy
|
|
|
404 |
for f in [self.model.predict, self.equations_.iloc[-1]["lambda_format"]]:
|
405 |
np.testing.assert_almost_equal(f(X), y, decimal=3)
|
406 |
|
407 |
+
def test_all_selection_strategies(self):
|
408 |
+
equations = pd.DataFrame(
|
409 |
+
dict(
|
410 |
+
loss=[1.0, 0.1, 0.01, 0.001 * 1.4, 0.001],
|
411 |
+
score=[0.5, 1.0, 0.5, 0.5, 0.3],
|
412 |
+
)
|
413 |
+
)
|
414 |
+
idx_accuracy = idx_model_selection(equations, "accuracy")
|
415 |
+
self.assertEqual(idx_accuracy, 4)
|
416 |
+
idx_best = idx_model_selection(equations, "best")
|
417 |
+
self.assertEqual(idx_best, 3)
|
418 |
+
idx_score = idx_model_selection(equations, "score")
|
419 |
+
self.assertEqual(idx_score, 1)
|
420 |
+
|
421 |
|
422 |
class TestFeatureSelection(unittest.TestCase):
|
423 |
def setUp(self):
|