Spaces:

MilesCranmer
/

PySR

Running

App Files Files Community

PySR / test /test.py

MilesCranmer

Update parts of test to use ScikitLearn interface

af14165 over 3 years ago

raw

history blame

8.77 kB

	import unittest
	from unittest.mock import patch
	import numpy as np
	from pysr import PySRRegressor
	from pysr.sr import run_feature_selection, _handle_feature_selection
	import sympy
	from sympy import lambdify
	import pandas as pd


	class TestPipeline(unittest.TestCase):
	    """End-to-end tests that fit ``PySRRegressor`` on small synthetic data."""

	    def setUp(self):
	        """Create the shared search settings and a seeded 100x5 input matrix."""
	        self.default_test_kwargs = dict(
	            niterations=10,
	            populations=4,
	            annealing=True,
	            useFrequency=False,
	        )
	        np.random.seed(0)
	        self.X = np.random.randn(100, 5)

	    def test_linear_relation(self):
	        """Fit ``y = x0`` and require a best-equation loss of at most 1e-4."""
	        y = self.X[:, 0]
	        model = PySRRegressor(**self.default_test_kwargs)
	        model.fit(self.X, y)
	        model.set_params(model_selection="accuracy")
	        print(model.equations)
	        self.assertLessEqual(model.get_best()["loss"], 1e-4)

	    def test_multiprocessing(self):
	        """Fit ``y = x0`` with ``procs=2`` and ``multithreading=False``."""
	        y = self.X[:, 0]
	        model = PySRRegressor(
	            **self.default_test_kwargs, procs=2, multithreading=False
	        )
	        model.fit(self.X, y)
	        print(model.equations)
	        self.assertLessEqual(model.equations.iloc[-1]["loss"], 1e-4)

	    def test_multioutput_custom_operator(self):
	        """Fit two squared targets using a user-defined ``sq`` operator."""
	        y = self.X[:, [0, 1]] ** 2
	        model = PySRRegressor(
	            unary_operators=["sq(x) = x^2"],
	            extra_sympy_mappings={"sq": lambda x: x ** 2},
	            binary_operators=["plus"],
	            **self.default_test_kwargs,
	            procs=0,
	        )
	        model.fit(self.X, y)
	        equations = model.equations
	        print(equations)
	        # One equation table per output column; check the last row of each.
	        for output_index in (0, 1):
	            self.assertLessEqual(
	                equations[output_index].iloc[-1]["loss"], 1e-4
	            )

	    def test_multioutput_weighted_with_callable_temp_equation(self):
	        """Fit two weighted targets while writing to a temporary equation file."""
	        y = self.X[:, [0, 1]] ** 2
	        # Binarize the random weights to exactly 0.0 or 1.0.
	        w = np.random.rand(*y.shape)
	        w[w < 0.5] = 0.0
	        w[w >= 0.5] = 1.0

	        # Double the target wherever the weight is 0:
	        y = (2 - w) * y
	        # Thus, pysr needs to use the weights to find the right equation!

	        model = PySRRegressor(
	            unary_operators=["sq(x) = x^2"],
	            binary_operators=["plus"],
	            extra_sympy_mappings={"sq": lambda x: x ** 2},
	            **self.default_test_kwargs,
	            procs=0,
	            temp_equation_file=True,
	            delete_tempfiles=False,
	        )
	        model.fit(self.X, y, weights=w)

	        # Predict once and compare each output column with the undoubled target.
	        predictions = model.predict(self.X)
	        for column in (0, 1):
	            np.testing.assert_almost_equal(
	                predictions[:, column], self.X[:, column] ** 2, decimal=4
	            )

	    def test_empty_operators_single_input_sklearn(self):
	        """Fit ``y = x + 3`` with no unary operators and inspect the repr."""
	        X = np.random.randn(100, 1)
	        y = X[:, 0] + 3.0
	        regressor = PySRRegressor(
	            model_selection="accuracy",
	            unary_operators=[],
	            binary_operators=["plus"],
	            **self.default_test_kwargs,
	        )
	        # The repr contains "None" before fitting and must not afterwards.
	        self.assertIn("None", repr(regressor))
	        regressor.fit(X, y)
	        self.assertNotIn("None", repr(regressor))
	        self.assertIn(">>>>", repr(regressor))

	        # NOTE(review): the other tests in this class read a "loss" column;
	        # confirm that "MSE" is still the right column name here.
	        self.assertLessEqual(regressor.equations.iloc[-1]["MSE"], 1e-4)
	        np.testing.assert_almost_equal(regressor.predict(X), y, decimal=1)

	        # Tweak model selection:
	        regressor.set_params(model_selection="best")
	        self.assertEqual(regressor.get_params()["model_selection"], "best")
	        self.assertNotIn("None", repr(regressor))
	        self.assertIn(">>>>", repr(regressor))

	        # "best" model_selection should also give a decent loss:
	        np.testing.assert_almost_equal(regressor.predict(X), y, decimal=1)

	    def test_noisy(self):
	        """Fit two noisy squared targets with ``denoise=True``."""
	        np.random.seed(1)
	        y = self.X[:, [0, 1]] ** 2 + np.random.randn(self.X.shape[0], 1) * 0.05
	        model = PySRRegressor(
	            # Test that passing a single operator works:
	            unary_operators="sq(x) = x^2",
	            binary_operators="plus",
	            extra_sympy_mappings={"sq": lambda x: x ** 2},
	            **self.default_test_kwargs,
	            procs=0,
	            denoise=True,
	        )
	        model.fit(self.X, y)
	        # Check BOTH outputs; previously index 1 was asserted twice and
	        # index 0 was never examined.
	        best_rows = model.get_best()
	        for output_index in (0, 1):
	            self.assertLessEqual(best_rows[output_index]["loss"], 1e-2)

	    def test_pandas_resample(self):
	        """Fit on a DataFrame with resampling, denoising and feature selection."""
	        np.random.seed(1)
	        X = pd.DataFrame(
	            {
	                "T": np.random.randn(500),
	                "x": np.random.randn(500),
	                "unused_feature": np.random.randn(500),
	            }
	        )

	        def true_fn(frame):
	            """Ground-truth target computed from the "T" and "x" columns."""
	            return np.array(frame["T"] + frame["x"] ** 2 + 1.323837)

	        y = true_fn(X)
	        noise = np.random.randn(500) * 0.01
	        y = y + noise
	        # Resampled array is a different order of features:
	        Xresampled = pd.DataFrame(
	            {
	                "unused_feature": np.random.randn(100),
	                "x": np.random.randn(100),
	                "T": np.random.randn(100),
	            }
	        )
	        model = PySRRegressor(
	            unary_operators=[],
	            binary_operators=["+", "*", "/", "-"],
	            **self.default_test_kwargs,
	            Xresampled=Xresampled,
	            denoise=True,
	            select_k_features=2,
	        )
	        model.fit(X, y)

	        # Only the two informative columns may appear in the LaTeX output.
	        latex = model.latex()
	        self.assertNotIn("unused_feature", latex)
	        self.assertIn("T", latex)
	        self.assertIn("x", latex)
	        self.assertLessEqual(model.get_best()["loss"], 1e-2)

	        fn = model.get_best()["lambda_format"]
	        self.assertListEqual(sorted(fn._selection), [0, 1])

	        # Evaluate on fresh data whose columns are in yet another order.
	        X2 = pd.DataFrame(
	            {
	                "T": np.random.randn(100),
	                "unused_feature": np.random.randn(100),
	                "x": np.random.randn(100),
	            }
	        )
	        expected = true_fn(X2)
	        self.assertLess(np.average((fn(X2) - expected) ** 2), 1e-2)
	        self.assertLess(np.average((model.predict(X2) - expected) ** 2), 1e-2)


	class TestBest(unittest.TestCase):
	    """Tests for the helpers that report the best equation of a result table."""

	    def setUp(self):
	        """Write a three-row equation table to disk and build the fixtures."""
	        equations = pd.DataFrame(
	            {
	                "Equation": ["1.0", "cos(x0)", "square(cos(x0))"],
	                "MSE": [1.0, 0.1, 1e-5],
	                "Complexity": [1, 2, 3],
	            }
	        )

	        # The delimiter must be the single character "|". The previous "\|"
	        # was an invalid escape sequence producing a two-character separator.
	        equations[["Complexity", "MSE", "Equation"]].to_csv(
	            "equation_file.csv.bkup", sep="|"
	        )

	        # NOTE(review): get_hof, best, best_tex and best_callable are not among
	        # the imports visible at the top of this file -- confirm they are in
	        # scope. Presumably get_hof reads the ".bkup" file written above when
	        # given "equation_file.csv"; verify against its implementation.
	        self.equations = get_hof(
	            "equation_file.csv",
	            n_features=2,
	            variables_names=["x0", "x1"],
	            extra_sympy_mappings={},
	            output_jax_format=False,
	            multioutput=False,
	            nout=1,
	        )

	        self.model = PySRRegressor()
	        self.model.equations = self.equations

	    def test_best(self):
	        """All three access paths should yield ``cos(x0)**2`` as a sympy expr."""
	        expected = sympy.cos(sympy.Symbol("x0")) ** 2
	        self.assertEqual(best(self.equations), expected)
	        self.assertEqual(best(), expected)
	        self.assertEqual(self.model.sympy(), expected)

	    def test_best_tex(self):
	        """All three access paths should yield the same LaTeX string."""
	        expected = "\\cos^{2}{\\left(x_{0} \\right)}"
	        self.assertEqual(best_tex(self.equations), expected)
	        self.assertEqual(best_tex(), expected)
	        self.assertEqual(self.model.latex(), expected)

	    def test_best_lambda(self):
	        """The callables from ``best_callable`` should reproduce ``cos(x0)**2``."""
	        X = np.random.randn(10, 2)
	        y = np.cos(X[:, 0]) ** 2
	        for f in [best_callable(), best_callable(self.equations)]:
	            np.testing.assert_almost_equal(f(X), y, decimal=4)


	class TestFeatureSelection(unittest.TestCase):
	    """Tests for the feature-selection helpers imported from ``pysr.sr``."""

	    def setUp(self):
	        """Seed NumPy's global RNG so the random inputs are reproducible."""
	        np.random.seed(0)

	    def test_feature_selection(self):
	        """Selecting 2 of 5 features should pick columns 2 and 3."""
	        X = np.random.randn(20000, 5)
	        # Only columns 2 and 3 contribute to the target. The "**" operators
	        # were missing here, which made the line a syntax error.
	        y = X[:, 2] ** 2 + X[:, 3] ** 2
	        selected = run_feature_selection(X, y, select_k_features=2)
	        self.assertEqual(sorted(selected), [2, 3])

	    def test_feature_selection_handler(self):
	        """The handler should return columns 2 and 3 and their indices."""
	        X = np.random.randn(20000, 5)
	        y = X[:, 2] ** 2 + X[:, 3] ** 2
	        var_names = [f"x{i}" for i in range(5)]
	        selected_X, selection = _handle_feature_selection(
	            X,
	            select_k_features=2,
	            variable_names=var_names,
	            y=y,
	        )
	        self.assertIn(2, selection)
	        self.assertIn(3, selection)
	        selected_var_names = [var_names[i] for i in selection]
	        self.assertEqual(set(selected_var_names), {"x2", "x3"})
	        # Sort within each row so the comparison ignores column order.
	        np.testing.assert_array_equal(
	            np.sort(selected_X, axis=1), np.sort(X[:, [2, 3]], axis=1)
	        )


	class TestHelperFunctions(unittest.TestCase):
	    """Tests for small helper functions."""

	    @patch("builtins.input", side_effect=["y", "n"])
	    def test_yesno(self, mock_input):
	        """Check ``_yesno`` against a mocked "y" answer, then a mocked "n"."""
	        # NOTE(review): _yesno is not among the imports visible at the top of
	        # this file -- confirm it is in scope.
	        # The patched input() yields "y" first and "n" second, so the expected
	        # results are True and then False (assuming _yesno reads via input()).
	        for expected in (True, False):
	            self.assertEqual(_yesno("Test"), expected)