MilesCranmer committed
Commit 3a3b168
1 Parent(s): a548719

feat: allow list-like `complexity_of_variables`
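
This makes it possible to assign a different complexity to each input feature, alongside the existing global setting. A minimal sketch of the intended usage (data and complexity values are illustrative):

    import numpy as np
    from pysr import PySRRegressor

    X = np.random.randn(100, 2)
    y = X[:, 0] ** 2 + X[:, 1]

    # Per-variable complexities are passed at `fit` time:
    model = PySRRegressor(binary_operators=["+", "*"])
    model.fit(X, y, complexity_of_variables=[2, 3])

    # Or one global value at `__init__`; the two are mutually exclusive:
    model = PySRRegressor(complexity_of_variables=2)
    model.fit(X, y)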

Files changed (1): pysr/sr.py (+98 −12)
pysr/sr.py CHANGED
@@ -137,6 +137,7 @@ def _check_assertions(
     X,
     use_custom_variable_names,
     variable_names,
+    complexity_of_variables,
     weights,
     y,
     X_units,
@@ -161,6 +162,13 @@ def _check_assertions(
                     "and underscores are allowed."
                 )
             assert_valid_sympy_symbol(var_name)
+    if (
+        isinstance(complexity_of_variables, list)
+        and len(complexity_of_variables) != X.shape[1]
+    ):
+        raise ValueError(
+            "The number of elements in `complexity_of_variables` must equal the number of features in `X`."
+        )
     if X_units is not None and len(X_units) != X.shape[1]:
         raise ValueError(
             "The number of units in `X_units` must equal the number of features in `X`."
@@ -331,7 +339,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         `idx` argument to the function, which is `nothing`
         for non-batched, and a 1D array of indices for batched.
         Default is `None`.
-    complexity_of_operators : dict[str, float]
+    complexity_of_operators : dict[str, Union[int, float]]
         If you would like to use a complexity other than 1 for an
         operator, specify the complexity here. For example,
         `{"sin": 2, "+": 1}` would give a complexity of 2 for each use
@@ -340,10 +348,13 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         numbers for a complexity, and the total complexity of a tree
         will be rounded to the nearest integer after computing.
         Default is `None`.
-    complexity_of_constants : float
+    complexity_of_constants : int | float
         Complexity of constants. Default is `1`.
-    complexity_of_variables : float
-        Complexity of variables. Default is `1`.
+    complexity_of_variables : int | float
+        Global complexity of variables. To set different complexities for
+        different variables, pass a list of complexities to the `fit` method
+        with keyword `complexity_of_variables`. You cannot use both.
+        Default is `1`.
     parsimony : float
         Multiplicative factor for how much to punish complexity.
         Default is `0.0032`.
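
To make the bookkeeping concrete: the total complexity of an expression tree is the sum of its node complexities, so per-variable settings feed directly into that sum. A worked example, assuming the default complexity of 1 for operators and constants:

    # With complexity_of_variables=[2, 3] for features x0, x1, the
    # expression  x0 * (x0 + x1)  scores
    #   2 (x0) + 1 (*) + 2 (x0) + 1 (+) + 3 (x1) = 9,
    # versus 5 under the default per-variable complexity of 1.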
@@ -689,6 +700,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
     n_features_in_: int
     feature_names_in_: ArrayLike[str]
     display_feature_names_in_: ArrayLike[str]
+    complexity_of_variables_: Union[int, float, List[Union[int, float]]]
     X_units_: Union[ArrayLike[str], None]
     y_units_: Union[str, ArrayLike[str], None]
     nout_: int
@@ -720,7 +732,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         loss_function: Optional[str] = None,
         complexity_of_operators: Optional[Dict[str, Union[int, float]]] = None,
         complexity_of_constants: Union[int, float] = 1,
-        complexity_of_variables: Union[int, float] = 1,
+        complexity_of_variables: Optional[Union[int, float]] = None,
         parsimony: float = 0.0032,
         dimensional_constraint_penalty: Optional[float] = None,
         dimensionless_constants_only: bool = False,
@@ -1338,13 +1350,22 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         return param_container
 
     def _validate_and_set_fit_params(
-        self, X, y, Xresampled, weights, variable_names, X_units, y_units
+        self,
+        X,
+        y,
+        Xresampled,
+        weights,
+        variable_names,
+        complexity_of_variables,
+        X_units,
+        y_units,
     ) -> Tuple[
         ndarray,
         ndarray,
         Optional[ndarray],
         Optional[ndarray],
         ArrayLike[str],
+        Union[int, float, List[Union[int, float]]],
         Optional[ArrayLike[str]],
         Optional[Union[str, ArrayLike[str]]],
     ]:
@@ -1369,6 +1390,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             for that particular element of y.
         variable_names : ndarray of length n_features
             Names of each variable in the training dataset, `X`.
+        complexity_of_variables : int | float | list[int | float]
+            Complexity of each variable in the training dataset, `X`.
         X_units : list[str] of length n_features
             Units of each variable in the training dataset, `X`.
         y_units : str | list[str] of length n_out
@@ -1416,6 +1439,22 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
                     "Please use valid names instead."
                 )
 
+        if (
+            complexity_of_variables is not None
+            and self.complexity_of_variables is not None
+        ):
+            raise ValueError(
+                "You cannot set `complexity_of_variables` at both `fit` and `__init__`. "
+                "Pass it at `__init__` to set it to global default, OR use `fit` to set it for "
+                "each variable individually."
+            )
+        elif complexity_of_variables is not None:
+            complexity_of_variables = complexity_of_variables
+        elif self.complexity_of_variables is not None:
+            complexity_of_variables = self.complexity_of_variables
+        else:
+            complexity_of_variables = 1
+
         # Data validation and feature name fetching via sklearn
         # This method sets the n_features_in_ attribute
         if Xresampled is not None:
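
Taken together, the branch above implements a simple precedence: an explicit `fit`-time value wins, otherwise the `__init__`-time value, otherwise the default of `1`; supplying both is rejected. Sketch of the conflicting case, reusing `X` and `y` from the sketch above:

    model = PySRRegressor(complexity_of_variables=2)
    model.fit(X, y, complexity_of_variables=[2, 3, 5])
    # ValueError: You cannot set `complexity_of_variables` at both
    # `fit` and `__init__`. ...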
@@ -1446,10 +1485,20 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         else:
             raise NotImplementedError("y shape not supported!")
 
+        self.complexity_of_variables_ = copy.deepcopy(complexity_of_variables)
         self.X_units_ = copy.deepcopy(X_units)
         self.y_units_ = copy.deepcopy(y_units)
 
-        return X, y, Xresampled, weights, variable_names, X_units, y_units
+        return (
+            X,
+            y,
+            Xresampled,
+            weights,
+            variable_names,
+            complexity_of_variables,
+            X_units,
+            y_units,
+        )
 
     def _validate_data_X_y(self, X, y) -> Tuple[ndarray, ndarray]:
         raw_out = self._validate_data(X=X, y=y, reset=True, multi_output=True)  # type: ignore
@@ -1465,6 +1514,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         y: ndarray,
         Xresampled: Union[ndarray, None],
         variable_names: ArrayLike[str],
+        complexity_of_variables: Union[int, float, List[Union[int, float]]],
         X_units: Union[ArrayLike[str], None],
         y_units: Union[ArrayLike[str], str, None],
         random_state: np.random.RandomState,
@@ -1487,6 +1537,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         variable_names : list[str]
             Names of each variable in the training dataset, `X`.
             Of length `n_features`.
+        complexity_of_variables : int | float | list[int | float]
+            Complexity of each variable in the training dataset, `X`.
         X_units : list[str]
             Units of each variable in the training dataset, `X`.
         y_units : str | list[str]
@@ -1537,6 +1589,14 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
                 ],
             )
 
+            if isinstance(complexity_of_variables, list):
+                complexity_of_variables = [
+                    complexity_of_variables[i]
+                    for i in range(len(complexity_of_variables))
+                    if selection_mask[i]
+                ]
+                self.complexity_of_variables_ = copy.deepcopy(complexity_of_variables)
+
             if X_units is not None:
                 X_units = cast(
                     ArrayLike[str],
@@ -1561,7 +1621,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             else:
                 X, y = denoise(X, y, Xresampled=Xresampled, random_state=random_state)
 
-        return X, y, variable_names, X_units, y_units
+        return X, y, variable_names, complexity_of_variables, X_units, y_units
 
     def _run(
         self,
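
When `select_k_features` is active, a list-valued complexity is reduced with the same `selection_mask` already applied to `variable_names`, so complexities stay aligned with the surviving features. A standalone sketch of that filtering (mask values made up for illustration):

    complexities = [2, 3, 5]                 # one entry per original feature
    selection_mask = [True, False, True]     # e.g. from run_feature_selection
    kept = [c for c, keep in zip(complexities, selection_mask) if keep]
    assert kept == [2, 5]                    # still aligned with kept features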
@@ -1618,6 +1678,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
 
         nested_constraints = self.nested_constraints
         complexity_of_operators = self.complexity_of_operators
+        complexity_of_variables = self.complexity_of_variables_
         cluster_manager = self.cluster_manager
 
         # Start julia backend processes
@@ -1662,6 +1723,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             complexity_of_operators = jl.seval(complexity_of_operators_str)
         # TODO: Refactor this into helper function
 
+        if isinstance(complexity_of_variables, list):
+            complexity_of_variables = jl_array(complexity_of_variables)
+
         custom_loss = jl.seval(
             str(self.elementwise_loss)
             if self.elementwise_loss is not None
@@ -1720,7 +1784,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             una_constraints=jl_array(una_constraints),
             complexity_of_operators=complexity_of_operators,
             complexity_of_constants=self.complexity_of_constants,
-            complexity_of_variables=self.complexity_of_variables,
+            complexity_of_variables=complexity_of_variables,
             nested_constraints=nested_constraints,
             elementwise_loss=custom_loss,
             loss_function=custom_full_objective,
@@ -1865,6 +1929,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         Xresampled=None,
         weights=None,
         variable_names: Optional[ArrayLike[str]] = None,
+        complexity_of_variables: Optional[
+            Union[int, float, List[Union[int, float]]]
+        ] = None,
         X_units: Optional[ArrayLike[str]] = None,
         y_units: Optional[Union[str, ArrayLike[str]]] = None,
     ) -> "PySRRegressor":
@@ -1925,6 +1992,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         self.selection_mask_ = None
         self.julia_state_stream_ = None
         self.julia_options_stream_ = None
+        self.complexity_of_variables_ = None
         self.X_units_ = None
         self.y_units_ = None
 
@@ -1938,10 +2006,18 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             Xresampled,
             weights,
             variable_names,
+            complexity_of_variables,
             X_units,
             y_units,
         ) = self._validate_and_set_fit_params(
-            X, y, Xresampled, weights, variable_names, X_units, y_units
+            X,
+            y,
+            Xresampled,
+            weights,
+            variable_names,
+            complexity_of_variables,
+            X_units,
+            y_units,
         )
 
         if X.shape[0] > 10000 and not self.batching:
@@ -1959,8 +2035,17 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         seed = cast(int, random_state.randint(0, 2**31 - 1))  # For julia random
 
         # Pre transformations (feature selection and denoising)
-        X, y, variable_names, X_units, y_units = self._pre_transform_training_data(
-            X, y, Xresampled, variable_names, X_units, y_units, random_state
+        X, y, variable_names, complexity_of_variables, X_units, y_units = (
+            self._pre_transform_training_data(
+                X,
+                y,
+                Xresampled,
+                variable_names,
+                complexity_of_variables,
+                X_units,
+                y_units,
+                random_state,
+            )
         )
 
         # Warn about large feature counts (still warn if feature count is large
@@ -1987,6 +2072,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             X,
             use_custom_variable_names,
             variable_names,
+            complexity_of_variables,
             weights,
             y,
             X_units,
 