MilesCranmer committed · commit 3a3b168 · 1 parent: a548719

feat: allow list-like `complexity_of_variables`

pysr/sr.py CHANGED (+98 −12)
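The change in a nutshell: `complexity_of_variables` may now be a per-feature list passed to `fit`, while a scalar global default can still be set at `__init__`; setting both raises an error. A minimal usage sketch (hypothetical data and values, not part of this commit; keyword names follow the diff below):

    import numpy as np
    from pysr import PySRRegressor

    X = np.random.randn(100, 2)
    y = 2.0 * np.cos(X[:, 0]) + X[:, 1] ** 2

    # Scalar: one global complexity for every variable, set at __init__.
    model = PySRRegressor(complexity_of_variables=2)
    model.fit(X, y)

    # List: per-feature complexities, passed to fit() instead. Here the
    # second feature is "cheaper", biasing the search toward using it.
    model = PySRRegressor()
    model.fit(X, y, complexity_of_variables=[2, 1])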
@@ -137,6 +137,7 @@ def _check_assertions(
     X,
     use_custom_variable_names,
     variable_names,
+    complexity_of_variables,
     weights,
     y,
     X_units,
@@ -161,6 +162,13 @@ def _check_assertions(
                 "and underscores are allowed."
             )
         assert_valid_sympy_symbol(var_name)
+    if (
+        isinstance(complexity_of_variables, list)
+        and len(complexity_of_variables) != X.shape[1]
+    ):
+        raise ValueError(
+            "The number of elements in `complexity_of_variables` must equal the number of features in `X`."
+        )
     if X_units is not None and len(X_units) != X.shape[1]:
         raise ValueError(
             "The number of units in `X_units` must equal the number of features in `X`."
@@ -331,7 +339,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         `idx` argument to the function, which is `nothing`
         for non-batched, and a 1D array of indices for batched.
         Default is `None`.
-    complexity_of_operators : dict[str, float]
+    complexity_of_operators : dict[str, Union[int, float]]
         If you would like to use a complexity other than 1 for an
         operator, specify the complexity here. For example,
         `{"sin": 2, "+": 1}` would give a complexity of 2 for each use
@@ -340,10 +348,13 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         numbers for a complexity, and the total complexity of a tree
         will be rounded to the nearest integer after computing.
         Default is `None`.
-    complexity_of_constants : float
+    complexity_of_constants : int | float
         Complexity of constants. Default is `1`.
-    complexity_of_variables : float
-        Complexity of variables. Default is `1`.
+    complexity_of_variables : int | float
+        Global complexity of variables. To set different complexities for
+        different variables, pass a list of complexities to the `fit` method
+        with keyword `complexity_of_variables`. You cannot use both.
+        Default is `1`.
     parsimony : float
         Multiplicative factor for how much to punish complexity.
         Default is `0.0032`.
@@ -689,6 +700,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
     n_features_in_: int
     feature_names_in_: ArrayLike[str]
     display_feature_names_in_: ArrayLike[str]
+    complexity_of_variables_: Union[int, float, List[Union[int, float]]]
     X_units_: Union[ArrayLike[str], None]
     y_units_: Union[str, ArrayLike[str], None]
     nout_: int
@@ -720,7 +732,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         loss_function: Optional[str] = None,
         complexity_of_operators: Optional[Dict[str, Union[int, float]]] = None,
         complexity_of_constants: Union[int, float] = 1,
-        complexity_of_variables: Union[int, float] = 1,
+        complexity_of_variables: Optional[Union[int, float]] = None,
         parsimony: float = 0.0032,
         dimensional_constraint_penalty: Optional[float] = None,
         dimensionless_constants_only: bool = False,
@@ -1338,13 +1350,22 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         return param_container

     def _validate_and_set_fit_params(
-        self, X, y, Xresampled, weights, variable_names, X_units, y_units
+        self,
+        X,
+        y,
+        Xresampled,
+        weights,
+        variable_names,
+        complexity_of_variables,
+        X_units,
+        y_units,
     ) -> Tuple[
         ndarray,
         ndarray,
         Optional[ndarray],
         Optional[ndarray],
         ArrayLike[str],
+        Union[int, float, List[Union[int, float]]],
         Optional[ArrayLike[str]],
         Optional[Union[str, ArrayLike[str]]],
     ]:
@@ -1369,6 +1390,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             for that particular element of y.
         variable_names : ndarray of length n_features
             Names of each variable in the training dataset, `X`.
+        complexity_of_variables : int | float | list[int | float]
+            Complexity of each variable in the training dataset, `X`.
         X_units : list[str] of length n_features
             Units of each variable in the training dataset, `X`.
         y_units : str | list[str] of length n_out
@@ -1416,6 +1439,22 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
                 "Please use valid names instead."
             )

+        if (
+            complexity_of_variables is not None
+            and self.complexity_of_variables is not None
+        ):
+            raise ValueError(
+                "You cannot set `complexity_of_variables` at both `fit` and `__init__`. "
+                "Pass it at `__init__` to set it to global default, OR use `fit` to set it for "
+                "each variable individually."
+            )
+        elif complexity_of_variables is not None:
+            complexity_of_variables = complexity_of_variables
+        elif self.complexity_of_variables is not None:
+            complexity_of_variables = self.complexity_of_variables
+        else:
+            complexity_of_variables = 1
+
         # Data validation and feature name fetching via sklearn
         # This method sets the n_features_in_ attribute
         if Xresampled is not None:
@@ -1446,10 +1485,20 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         else:
             raise NotImplementedError("y shape not supported!")

+        self.complexity_of_variables_ = copy.deepcopy(complexity_of_variables)
         self.X_units_ = copy.deepcopy(X_units)
         self.y_units_ = copy.deepcopy(y_units)

-        return X, y, Xresampled, weights, variable_names, X_units, y_units
+        return (
+            X,
+            y,
+            Xresampled,
+            weights,
+            variable_names,
+            complexity_of_variables,
+            X_units,
+            y_units,
+        )

     def _validate_data_X_y(self, X, y) -> Tuple[ndarray, ndarray]:
         raw_out = self._validate_data(X=X, y=y, reset=True, multi_output=True)  # type: ignore
@@ -1465,6 +1514,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         y: ndarray,
         Xresampled: Union[ndarray, None],
         variable_names: ArrayLike[str],
+        complexity_of_variables: Union[int, float, List[Union[int, float]]],
         X_units: Union[ArrayLike[str], None],
         y_units: Union[ArrayLike[str], str, None],
         random_state: np.random.RandomState,
@@ -1487,6 +1537,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         variable_names : list[str]
             Names of each variable in the training dataset, `X`.
             Of length `n_features`.
+        complexity_of_variables : int | float | list[int | float]
+            Complexity of each variable in the training dataset, `X`.
         X_units : list[str]
             Units of each variable in the training dataset, `X`.
         y_units : str | list[str]
@@ -1537,6 +1589,14 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             ],
         )

+        if isinstance(complexity_of_variables, list):
+            complexity_of_variables = [
+                complexity_of_variables[i]
+                for i in range(len(complexity_of_variables))
+                if selection_mask[i]
+            ]
+            self.complexity_of_variables_ = copy.deepcopy(complexity_of_variables)
+
         if X_units is not None:
             X_units = cast(
                 ArrayLike[str],
@@ -1561,7 +1621,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         else:
             X, y = denoise(X, y, Xresampled=Xresampled, random_state=random_state)

-        return X, y, variable_names, X_units, y_units
+        return X, y, variable_names, complexity_of_variables, X_units, y_units

     def _run(
         self,
@@ -1618,6 +1678,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):

         nested_constraints = self.nested_constraints
         complexity_of_operators = self.complexity_of_operators
+        complexity_of_variables = self.complexity_of_variables_
         cluster_manager = self.cluster_manager

         # Start julia backend processes
@@ -1662,6 +1723,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             complexity_of_operators = jl.seval(complexity_of_operators_str)
         # TODO: Refactor this into helper function

+        if isinstance(complexity_of_variables, list):
+            complexity_of_variables = jl_array(complexity_of_variables)
+
         custom_loss = jl.seval(
             str(self.elementwise_loss)
             if self.elementwise_loss is not None
@@ -1720,7 +1784,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             una_constraints=jl_array(una_constraints),
             complexity_of_operators=complexity_of_operators,
             complexity_of_constants=self.complexity_of_constants,
-            complexity_of_variables=self.complexity_of_variables,
+            complexity_of_variables=complexity_of_variables,
             nested_constraints=nested_constraints,
             elementwise_loss=custom_loss,
             loss_function=custom_full_objective,
@@ -1865,6 +1929,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         Xresampled=None,
         weights=None,
         variable_names: Optional[ArrayLike[str]] = None,
+        complexity_of_variables: Optional[
+            Union[int, float, List[Union[int, float]]]
+        ] = None,
         X_units: Optional[ArrayLike[str]] = None,
         y_units: Optional[Union[str, ArrayLike[str]]] = None,
     ) -> "PySRRegressor":
@@ -1925,6 +1992,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         self.selection_mask_ = None
         self.julia_state_stream_ = None
         self.julia_options_stream_ = None
+        self.complexity_of_variables_ = None
         self.X_units_ = None
         self.y_units_ = None

@@ -1938,10 +2006,18 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             Xresampled,
             weights,
             variable_names,
+            complexity_of_variables,
             X_units,
             y_units,
         ) = self._validate_and_set_fit_params(
-            X, y, Xresampled, weights, variable_names, X_units, y_units
+            X,
+            y,
+            Xresampled,
+            weights,
+            variable_names,
+            complexity_of_variables,
+            X_units,
+            y_units,
         )

         if X.shape[0] > 10000 and not self.batching:
@@ -1959,8 +2035,17 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         seed = cast(int, random_state.randint(0, 2**31 - 1))  # For julia random

         # Pre transformations (feature selection and denoising)
-        X, y, variable_names, X_units, y_units = self._pre_transform_training_data(
-            X, y, Xresampled, variable_names, X_units, y_units, random_state
-        )
+        X, y, variable_names, complexity_of_variables, X_units, y_units = (
+            self._pre_transform_training_data(
+                X,
+                y,
+                Xresampled,
+                variable_names,
+                complexity_of_variables,
+                X_units,
+                y_units,
+                random_state,
+            )
+        )

         # Warn about large feature counts (still warn if feature count is large
@@ -1987,6 +2072,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             X,
             use_custom_variable_names,
             variable_names,
+            complexity_of_variables,
             weights,
             y,
             X_units,
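For completeness, a sketch of the two validation paths added above (error messages quoted from the diff; the data is hypothetical):

    import numpy as np
    from pysr import PySRRegressor

    X = np.random.randn(100, 2)
    y = X[:, 0] + X[:, 1]

    # Setting the keyword at both __init__ and fit raises:
    try:
        PySRRegressor(complexity_of_variables=3).fit(
            X, y, complexity_of_variables=[1, 2]
        )
    except ValueError as err:
        print(err)  # "You cannot set `complexity_of_variables` at both `fit` and `__init__`. ..."

    # A list whose length differs from the number of features raises:
    try:
        PySRRegressor().fit(X, y, complexity_of_variables=[1, 2, 3])
    except ValueError as err:
        print(err)  # "The number of elements in `complexity_of_variables` must equal the number of features in `X`."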