MilesCranmer committed on
Commit
ffd9cd1
·
1 Parent(s): acce2c2

Add test for Xresampled as pd.DataFrame

Browse files
Files changed (2) hide show
  1. pysr/sr.py +26 -15
  2. test/test.py +49 -2
pysr/sr.py CHANGED
@@ -323,6 +323,8 @@ def pysr(
323
  if len(X.shape) == 1:
324
  X = X[:, None]
325
 
 
 
326
  if len(variable_names) == 0:
327
  variable_names = [f"x{i}" for i in range(X.shape[1])]
328
 
@@ -364,9 +366,7 @@ def pysr(
364
  if maxsize < 7:
365
  raise NotImplementedError("PySR requires a maxsize of at least 7")
366
 
367
- X, variable_names, selection = _handle_feature_selection(
368
- X, select_k_features, use_custom_variable_names, variable_names, y
369
- )
370
 
371
  if maxdepth is None:
372
  maxdepth = maxsize
@@ -390,9 +390,18 @@ def pysr(
390
  raise NotImplementedError(
391
  "No weights for denoising - the weights are learned."
392
  )
393
- if Xresampled is not None and selection is not None:
394
  # Select among only the selected features:
395
- Xresampled = Xresampled[:, selection]
 
 
 
 
 
 
 
 
 
396
  if multioutput:
397
  y = np.stack(
398
  [_denoise(X, y[:, i], Xresampled=Xresampled)[1] for i in range(nout)],
@@ -531,7 +540,7 @@ Tried to activate project {julia_project} but failed."""
531
  Main.y,
532
  weights=Main.weights,
533
  niterations=int(niterations),
534
- varMap=variable_names,
535
  options=options,
536
  numprocs=int(cprocs),
537
  multithreading=bool(multithreading),
@@ -645,19 +654,15 @@ def _create_inline_operators(binary_operators, unary_operators):
645
  op_list[i] = function_name
646
 
647
 
648
- def _handle_feature_selection(
649
- X, select_k_features, use_custom_variable_names, variable_names, y
650
- ):
651
  if select_k_features is not None:
652
  selection = run_feature_selection(X, y, select_k_features)
653
- print(f"Using features {selection}")
654
  X = X[:, selection]
655
 
656
- if use_custom_variable_names:
657
- variable_names = [variable_names[i] for i in selection]
658
  else:
659
  selection = None
660
- return X, variable_names, selection
661
 
662
 
663
  def _check_assertions(
@@ -791,7 +796,9 @@ def get_hof(
791
  sympy_format.append(eqn)
792
 
793
  # Numpy:
794
- lambda_format.append(CallableEquation(sympy_symbols, eqn, selection))
 
 
795
 
796
  # JAX:
797
  if output_jax_format:
@@ -942,16 +949,20 @@ def _denoise(X, y, Xresampled=None):
942
  class CallableEquation:
943
  """Simple wrapper for numpy lambda functions built with sympy"""
944
 
945
- def __init__(self, sympy_symbols, eqn, selection=None):
946
  self._sympy = eqn
947
  self._sympy_symbols = sympy_symbols
948
  self._selection = selection
 
949
  self._lambda = lambdify(sympy_symbols, eqn)
950
 
951
  def __repr__(self):
952
  return f"PySRFunction(X=>{self._sympy})"
953
 
954
  def __call__(self, X):
 
 
 
955
  if self._selection is not None:
956
  return self._lambda(*X[:, self._selection].T)
957
  return self._lambda(*X.T)
 
323
  if len(X.shape) == 1:
324
  X = X[:, None]
325
 
326
+ assert not isinstance(y, pd.DataFrame)
327
+
328
  if len(variable_names) == 0:
329
  variable_names = [f"x{i}" for i in range(X.shape[1])]
330
 
 
366
  if maxsize < 7:
367
  raise NotImplementedError("PySR requires a maxsize of at least 7")
368
 
369
+ X, selection = _handle_feature_selection(X, select_k_features, y, variable_names)
 
 
370
 
371
  if maxdepth is None:
372
  maxdepth = maxsize
 
390
  raise NotImplementedError(
391
  "No weights for denoising - the weights are learned."
392
  )
393
+ if Xresampled is not None:
394
  # Select among only the selected features:
395
+ if isinstance(Xresampled, pd.DataFrame):
396
+ # Handle Xresampled is pandas dataframe
397
+ if selection is not None:
398
+ Xresampled = Xresampled[[variable_names[i] for i in selection]]
399
+ else:
400
+ Xresampled = Xresampled[variable_names]
401
+ Xresampled = np.array(Xresampled)
402
+ else:
403
+ if selection is not None:
404
+ Xresampled = Xresampled[:, selection]
405
  if multioutput:
406
  y = np.stack(
407
  [_denoise(X, y[:, i], Xresampled=Xresampled)[1] for i in range(nout)],
 
540
  Main.y,
541
  weights=Main.weights,
542
  niterations=int(niterations),
543
+ varMap=[variable_names[i] for i in selection],
544
  options=options,
545
  numprocs=int(cprocs),
546
  multithreading=bool(multithreading),
 
654
  op_list[i] = function_name
655
 
656
 
657
+ def _handle_feature_selection(X, select_k_features, y, variable_names):
 
 
658
  if select_k_features is not None:
659
  selection = run_feature_selection(X, y, select_k_features)
660
+ print(f"Using features {[variable_names[i] for i in selection]}")
661
  X = X[:, selection]
662
 
 
 
663
  else:
664
  selection = None
665
+ return X, selection
666
 
667
 
668
  def _check_assertions(
 
796
  sympy_format.append(eqn)
797
 
798
  # Numpy:
799
+ lambda_format.append(
800
+ CallableEquation(sympy_symbols, eqn, selection, variable_names)
801
+ )
802
 
803
  # JAX:
804
  if output_jax_format:
 
949
  class CallableEquation:
950
  """Simple wrapper for numpy lambda functions built with sympy"""
951
 
952
+ def __init__(self, sympy_symbols, eqn, selection=None, variable_names=None):
953
  self._sympy = eqn
954
  self._sympy_symbols = sympy_symbols
955
  self._selection = selection
956
+ self._variable_names = variable_names
957
  self._lambda = lambdify(sympy_symbols, eqn)
958
 
959
  def __repr__(self):
960
  return f"PySRFunction(X=>{self._sympy})"
961
 
962
  def __call__(self, X):
963
+ if isinstance(X, pd.DataFrame):
964
+ X = np.array(X[self._variable_names])
965
+
966
  if self._selection is not None:
967
  return self._lambda(*X[:, self._selection].T)
968
  return self._lambda(*X.T)
test/test.py CHANGED
@@ -98,8 +98,9 @@ class TestPipeline(unittest.TestCase):
98
  equations = pysr(
99
  self.X,
100
  y,
101
- unary_operators=["sq(x) = x^2"],
102
- binary_operators=["plus"],
 
103
  extra_sympy_mappings={"sq": lambda x: x ** 2},
104
  **self.default_test_kwargs,
105
  procs=0,
@@ -108,6 +109,52 @@ class TestPipeline(unittest.TestCase):
108
  self.assertLessEqual(best_row(equations=equations)[0]["MSE"], 1e-2)
109
  self.assertLessEqual(best_row(equations=equations)[1]["MSE"], 1e-2)
110
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
 
112
  class TestBest(unittest.TestCase):
113
  def setUp(self):
 
98
  equations = pysr(
99
  self.X,
100
  y,
101
+ # Test that passing a single operator works:
102
+ unary_operators="sq(x) = x^2",
103
+ binary_operators="plus",
104
  extra_sympy_mappings={"sq": lambda x: x ** 2},
105
  **self.default_test_kwargs,
106
  procs=0,
 
109
  self.assertLessEqual(best_row(equations=equations)[0]["MSE"], 1e-2)
110
  self.assertLessEqual(best_row(equations=equations)[1]["MSE"], 1e-2)
111
 
112
+ def test_pandas_resample(self):
113
+ np.random.seed(1)
114
+ X = pd.DataFrame(
115
+ {
116
+ "T": np.random.randn(500),
117
+ "x": np.random.randn(500),
118
+ "unused_feature": np.random.randn(500),
119
+ }
120
+ )
121
+ true_fn = lambda x: np.array(x["T"] + x["x"] ** 2 + 1.323837)
122
+ y = true_fn(X)
123
+ noise = np.random.randn(500) * 0.01
124
+ y = y + noise
125
+ # Resampled array is a different order of features:
126
+ Xresampled = pd.DataFrame(
127
+ {
128
+ "unused_feature": np.random.randn(100),
129
+ "x": np.random.randn(100),
130
+ "T": np.random.randn(100),
131
+ }
132
+ )
133
+ equations = pysr(
134
+ X,
135
+ y,
136
+ unary_operators=[],
137
+ binary_operators=["+", "*", "/", "-"],
138
+ **self.default_test_kwargs,
139
+ Xresampled=Xresampled,
140
+ denoise=True,
141
+ select_k_features=2,
142
+ )
143
+ self.assertNotIn("unused_feature", best_tex())
144
+ self.assertIn("T", best_tex())
145
+ self.assertIn("x", best_tex())
146
+ self.assertLessEqual(equations.iloc[-1]["MSE"], 1e-2)
147
+ fn = best_callable()
148
+ self.assertListEqual(list(sorted(fn._selection)), [0, 1])
149
+ X2 = pd.DataFrame(
150
+ {
151
+ "T": np.random.randn(100),
152
+ "unused_feature": np.random.randn(100),
153
+ "x": np.random.randn(100),
154
+ }
155
+ )
156
+ self.assertLess(np.average((fn(X2) - true_fn(X2)) ** 2), 1e-2)
157
+
158
 
159
  class TestBest(unittest.TestCase):
160
  def setUp(self):