Spaces:
Sleeping
Sleeping
MilesCranmer
committed on
Commit
•
ffd9cd1
1
Parent(s):
acce2c2
Add test for Xresampled as pd.DataFrame
Browse files
- pysr/sr.py +26 -15
- test/test.py +49 -2
pysr/sr.py
CHANGED
@@ -323,6 +323,8 @@ def pysr(
|
|
323 |
if len(X.shape) == 1:
|
324 |
X = X[:, None]
|
325 |
|
|
|
|
|
326 |
if len(variable_names) == 0:
|
327 |
variable_names = [f"x{i}" for i in range(X.shape[1])]
|
328 |
|
@@ -364,9 +366,7 @@ def pysr(
|
|
364 |
if maxsize < 7:
|
365 |
raise NotImplementedError("PySR requires a maxsize of at least 7")
|
366 |
|
367 |
-
X, variable_names, selection = _handle_feature_selection(
|
368 |
-
X, select_k_features, use_custom_variable_names, variable_names, y
|
369 |
-
)
|
370 |
|
371 |
if maxdepth is None:
|
372 |
maxdepth = maxsize
|
@@ -390,9 +390,18 @@ def pysr(
|
|
390 |
raise NotImplementedError(
|
391 |
"No weights for denoising - the weights are learned."
|
392 |
)
|
393 |
-
if Xresampled is not None and selection is not None:
|
394 |
# Select among only the selected features:
|
395 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
396 |
if multioutput:
|
397 |
y = np.stack(
|
398 |
[_denoise(X, y[:, i], Xresampled=Xresampled)[1] for i in range(nout)],
|
@@ -531,7 +540,7 @@ Tried to activate project {julia_project} but failed."""
|
|
531 |
Main.y,
|
532 |
weights=Main.weights,
|
533 |
niterations=int(niterations),
|
534 |
-
varMap=variable_names,
|
535 |
options=options,
|
536 |
numprocs=int(cprocs),
|
537 |
multithreading=bool(multithreading),
|
@@ -645,19 +654,15 @@ def _create_inline_operators(binary_operators, unary_operators):
|
|
645 |
op_list[i] = function_name
|
646 |
|
647 |
|
648 |
-
def _handle_feature_selection(
|
649 |
-
X, select_k_features, use_custom_variable_names, variable_names, y
|
650 |
-
):
|
651 |
if select_k_features is not None:
|
652 |
selection = run_feature_selection(X, y, select_k_features)
|
653 |
-
print(f"Using features {selection}")
|
654 |
X = X[:, selection]
|
655 |
|
656 |
-
if use_custom_variable_names:
|
657 |
-
variable_names = [variable_names[i] for i in selection]
|
658 |
else:
|
659 |
selection = None
|
660 |
-
return X, variable_names, selection
|
661 |
|
662 |
|
663 |
def _check_assertions(
|
@@ -791,7 +796,9 @@ def get_hof(
|
|
791 |
sympy_format.append(eqn)
|
792 |
|
793 |
# Numpy:
|
794 |
-
lambda_format.append(CallableEquation(sympy_symbols, eqn, selection))
|
|
|
|
|
795 |
|
796 |
# JAX:
|
797 |
if output_jax_format:
|
@@ -942,16 +949,20 @@ def _denoise(X, y, Xresampled=None):
|
|
942 |
class CallableEquation:
|
943 |
"""Simple wrapper for numpy lambda functions built with sympy"""
|
944 |
|
945 |
-
def __init__(self, sympy_symbols, eqn, selection=None):
|
946 |
self._sympy = eqn
|
947 |
self._sympy_symbols = sympy_symbols
|
948 |
self._selection = selection
|
|
|
949 |
self._lambda = lambdify(sympy_symbols, eqn)
|
950 |
|
951 |
def __repr__(self):
|
952 |
return f"PySRFunction(X=>{self._sympy})"
|
953 |
|
954 |
def __call__(self, X):
|
|
|
|
|
|
|
955 |
if self._selection is not None:
|
956 |
return self._lambda(*X[:, self._selection].T)
|
957 |
return self._lambda(*X.T)
|
|
|
323 |
if len(X.shape) == 1:
|
324 |
X = X[:, None]
|
325 |
|
326 |
+
assert not isinstance(y, pd.DataFrame)
|
327 |
+
|
328 |
if len(variable_names) == 0:
|
329 |
variable_names = [f"x{i}" for i in range(X.shape[1])]
|
330 |
|
|
|
366 |
if maxsize < 7:
|
367 |
raise NotImplementedError("PySR requires a maxsize of at least 7")
|
368 |
|
369 |
+
X, selection = _handle_feature_selection(X, select_k_features, y, variable_names)
|
|
|
|
|
370 |
|
371 |
if maxdepth is None:
|
372 |
maxdepth = maxsize
|
|
|
390 |
raise NotImplementedError(
|
391 |
"No weights for denoising - the weights are learned."
|
392 |
)
|
393 |
+
if Xresampled is not None:
|
394 |
# Select among only the selected features:
|
395 |
+
if isinstance(Xresampled, pd.DataFrame):
|
396 |
+
# Handle Xresampled is pandas dataframe
|
397 |
+
if selection is not None:
|
398 |
+
Xresampled = Xresampled[[variable_names[i] for i in selection]]
|
399 |
+
else:
|
400 |
+
Xresampled = Xresampled[variable_names]
|
401 |
+
Xresampled = np.array(Xresampled)
|
402 |
+
else:
|
403 |
+
if selection is not None:
|
404 |
+
Xresampled = Xresampled[:, selection]
|
405 |
if multioutput:
|
406 |
y = np.stack(
|
407 |
[_denoise(X, y[:, i], Xresampled=Xresampled)[1] for i in range(nout)],
|
|
|
540 |
Main.y,
|
541 |
weights=Main.weights,
|
542 |
niterations=int(niterations),
|
543 |
+
varMap=[variable_names[i] for i in selection],
|
544 |
options=options,
|
545 |
numprocs=int(cprocs),
|
546 |
multithreading=bool(multithreading),
|
|
|
654 |
op_list[i] = function_name
|
655 |
|
656 |
|
657 |
+
def _handle_feature_selection(X, select_k_features, y, variable_names):
|
|
|
|
|
658 |
if select_k_features is not None:
|
659 |
selection = run_feature_selection(X, y, select_k_features)
|
660 |
+
print(f"Using features {[variable_names[i] for i in selection]}")
|
661 |
X = X[:, selection]
|
662 |
|
|
|
|
|
663 |
else:
|
664 |
selection = None
|
665 |
+
return X, selection
|
666 |
|
667 |
|
668 |
def _check_assertions(
|
|
|
796 |
sympy_format.append(eqn)
|
797 |
|
798 |
# Numpy:
|
799 |
+
lambda_format.append(
|
800 |
+
CallableEquation(sympy_symbols, eqn, selection, variable_names)
|
801 |
+
)
|
802 |
|
803 |
# JAX:
|
804 |
if output_jax_format:
|
|
|
949 |
class CallableEquation:
|
950 |
"""Simple wrapper for numpy lambda functions built with sympy"""
|
951 |
|
952 |
+
def __init__(self, sympy_symbols, eqn, selection=None, variable_names=None):
|
953 |
self._sympy = eqn
|
954 |
self._sympy_symbols = sympy_symbols
|
955 |
self._selection = selection
|
956 |
+
self._variable_names = variable_names
|
957 |
self._lambda = lambdify(sympy_symbols, eqn)
|
958 |
|
959 |
def __repr__(self):
|
960 |
return f"PySRFunction(X=>{self._sympy})"
|
961 |
|
962 |
def __call__(self, X):
|
963 |
+
if isinstance(X, pd.DataFrame):
|
964 |
+
X = np.array(X[self._variable_names])
|
965 |
+
|
966 |
if self._selection is not None:
|
967 |
return self._lambda(*X[:, self._selection].T)
|
968 |
return self._lambda(*X.T)
|
test/test.py
CHANGED
@@ -98,8 +98,9 @@ class TestPipeline(unittest.TestCase):
|
|
98 |
equations = pysr(
|
99 |
self.X,
|
100 |
y,
|
101 |
-
|
102 |
-
|
|
|
103 |
extra_sympy_mappings={"sq": lambda x: x ** 2},
|
104 |
**self.default_test_kwargs,
|
105 |
procs=0,
|
@@ -108,6 +109,52 @@ class TestPipeline(unittest.TestCase):
|
|
108 |
self.assertLessEqual(best_row(equations=equations)[0]["MSE"], 1e-2)
|
109 |
self.assertLessEqual(best_row(equations=equations)[1]["MSE"], 1e-2)
|
110 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
111 |
|
112 |
class TestBest(unittest.TestCase):
|
113 |
def setUp(self):
|
|
|
98 |
equations = pysr(
|
99 |
self.X,
|
100 |
y,
|
101 |
+
# Test that passing a single operator works:
|
102 |
+
unary_operators="sq(x) = x^2",
|
103 |
+
binary_operators="plus",
|
104 |
extra_sympy_mappings={"sq": lambda x: x ** 2},
|
105 |
**self.default_test_kwargs,
|
106 |
procs=0,
|
|
|
109 |
self.assertLessEqual(best_row(equations=equations)[0]["MSE"], 1e-2)
|
110 |
self.assertLessEqual(best_row(equations=equations)[1]["MSE"], 1e-2)
|
111 |
|
112 |
+
def test_pandas_resample(self):
|
113 |
+
np.random.seed(1)
|
114 |
+
X = pd.DataFrame(
|
115 |
+
{
|
116 |
+
"T": np.random.randn(500),
|
117 |
+
"x": np.random.randn(500),
|
118 |
+
"unused_feature": np.random.randn(500),
|
119 |
+
}
|
120 |
+
)
|
121 |
+
true_fn = lambda x: np.array(x["T"] + x["x"] ** 2 + 1.323837)
|
122 |
+
y = true_fn(X)
|
123 |
+
noise = np.random.randn(500) * 0.01
|
124 |
+
y = y + noise
|
125 |
+
# Resampled array is a different order of features:
|
126 |
+
Xresampled = pd.DataFrame(
|
127 |
+
{
|
128 |
+
"unused_feature": np.random.randn(100),
|
129 |
+
"x": np.random.randn(100),
|
130 |
+
"T": np.random.randn(100),
|
131 |
+
}
|
132 |
+
)
|
133 |
+
equations = pysr(
|
134 |
+
X,
|
135 |
+
y,
|
136 |
+
unary_operators=[],
|
137 |
+
binary_operators=["+", "*", "/", "-"],
|
138 |
+
**self.default_test_kwargs,
|
139 |
+
Xresampled=Xresampled,
|
140 |
+
denoise=True,
|
141 |
+
select_k_features=2,
|
142 |
+
)
|
143 |
+
self.assertNotIn("unused_feature", best_tex())
|
144 |
+
self.assertIn("T", best_tex())
|
145 |
+
self.assertIn("x", best_tex())
|
146 |
+
self.assertLessEqual(equations.iloc[-1]["MSE"], 1e-2)
|
147 |
+
fn = best_callable()
|
148 |
+
self.assertListEqual(list(sorted(fn._selection)), [0, 1])
|
149 |
+
X2 = pd.DataFrame(
|
150 |
+
{
|
151 |
+
"T": np.random.randn(100),
|
152 |
+
"unused_feature": np.random.randn(100),
|
153 |
+
"x": np.random.randn(100),
|
154 |
+
}
|
155 |
+
)
|
156 |
+
self.assertLess(np.average((fn(X2) - true_fn(X2)) ** 2), 1e-2)
|
157 |
+
|
158 |
|
159 |
class TestBest(unittest.TestCase):
|
160 |
def setUp(self):
|