Tom Jelen MilesCranmer committed on
Commit
fb5f0a1
1 Parent(s): efffd9b

Fix TypeError when a variable name matches a builtin python function (#558)

Browse files

* Fix TypeError thrown when a variable name matches a builtin Python function

Example:

A dataset with a column named 'exec' failed with:

ValueError: Error from parse_expr with transformed code: "(Float ('86.76248' )-exec )"
... snip ...
TypeError: unsupported operand type(s) for -: 'Float' and 'builtin_function_or_method'

* Ensure backwards compatibility for `pysr2sympy` and use same method

* Fix potential issue with list ordering

* Combine builtin variable names test with noisy data test

* Fix builtin variable names test

---------

Co-authored-by: MilesCranmer <[email protected]>

Files changed (3) hide show
  1. pysr/export_sympy.py +14 -2
  2. pysr/sr.py +1 -0
  3. pysr/test/test.py +5 -2
pysr/export_sympy.py CHANGED
@@ -57,6 +57,12 @@ sympy_mappings = {
57
  }
58
 
59
 
 
 
 
 
 
 
60
  def create_sympy_symbols(
61
  feature_names_in: List[str],
62
  ) -> List[sympy.Symbol]:
@@ -64,10 +70,16 @@ def create_sympy_symbols(
64
 
65
 
66
  def pysr2sympy(
67
- equation: str, *, extra_sympy_mappings: Optional[Dict[str, Callable]] = None
 
 
 
68
  ):
 
 
69
  local_sympy_mappings = {
70
- **(extra_sympy_mappings if extra_sympy_mappings else {}),
 
71
  **sympy_mappings,
72
  }
73
 
 
57
  }
58
 
59
 
60
+ def create_sympy_symbols_map(
61
+ feature_names_in: List[str],
62
+ ) -> Dict[str, sympy.Symbol]:
63
+ return {variable: sympy.Symbol(variable) for variable in feature_names_in}
64
+
65
+
66
  def create_sympy_symbols(
67
  feature_names_in: List[str],
68
  ) -> List[sympy.Symbol]:
 
70
 
71
 
72
  def pysr2sympy(
73
+ equation: str,
74
+ *,
75
+ feature_names_in: Optional[List[str]] = None,
76
+ extra_sympy_mappings: Optional[Dict[str, Callable]] = None,
77
  ):
78
+ if feature_names_in is None:
79
+ feature_names_in = []
80
  local_sympy_mappings = {
81
+ **create_sympy_symbols_map(feature_names_in),
82
+ **(extra_sympy_mappings if extra_sympy_mappings is not None else {}),
83
  **sympy_mappings,
84
  }
85
 
pysr/sr.py CHANGED
@@ -2226,6 +2226,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
2226
  for _, eqn_row in output.iterrows():
2227
  eqn = pysr2sympy(
2228
  eqn_row["equation"],
 
2229
  extra_sympy_mappings=self.extra_sympy_mappings,
2230
  )
2231
  sympy_format.append(eqn)
 
2226
  for _, eqn_row in output.iterrows():
2227
  eqn = pysr2sympy(
2228
  eqn_row["equation"],
2229
+ feature_names_in=self.feature_names_in_,
2230
  extra_sympy_mappings=self.extra_sympy_mappings,
2231
  )
2232
  sympy_format.append(eqn)
pysr/test/test.py CHANGED
@@ -272,7 +272,7 @@ class TestPipeline(unittest.TestCase):
272
  regressor = PySRRegressor(warm_start=True, max_evals=10)
273
  regressor.fit(self.X, y)
274
 
275
- def test_noisy(self):
276
  y = self.X[:, [0, 1]] ** 2 + self.rstate.randn(self.X.shape[0], 1) * 0.05
277
  model = PySRRegressor(
278
  # Test that passing a single operator works:
@@ -289,9 +289,12 @@ class TestPipeline(unittest.TestCase):
289
  model.set_params(model_selection="best")
290
  # Also try without a temp equation file:
291
  model.set_params(temp_equation_file=False)
292
- model.fit(self.X, y)
 
293
  self.assertLessEqual(model.get_best()[1]["loss"], 1e-2)
294
  self.assertLessEqual(model.get_best()[1]["loss"], 1e-2)
 
 
295
 
296
  def test_pandas_resample_with_nested_constraints(self):
297
  X = pd.DataFrame(
 
272
  regressor = PySRRegressor(warm_start=True, max_evals=10)
273
  regressor.fit(self.X, y)
274
 
275
+ def test_noisy_builtin_variable_names(self):
276
  y = self.X[:, [0, 1]] ** 2 + self.rstate.randn(self.X.shape[0], 1) * 0.05
277
  model = PySRRegressor(
278
  # Test that passing a single operator works:
 
289
  model.set_params(model_selection="best")
290
  # Also try without a temp equation file:
291
  model.set_params(temp_equation_file=False)
292
+ # We also test builtin variable names
293
+ model.fit(self.X, y, variable_names=["exec", "hash", "x3", "x4", "x5"])
294
  self.assertLessEqual(model.get_best()[1]["loss"], 1e-2)
295
  self.assertLessEqual(model.get_best()[1]["loss"], 1e-2)
296
+ self.assertIn("exec", model.latex()[0])
297
+ self.assertIn("hash", model.latex()[1])
298
 
299
  def test_pandas_resample_with_nested_constraints(self):
300
  X = pd.DataFrame(