MilesCranmer committed on
Commit
7909e90
1 Parent(s): 2bd7782

refactor: more type declarations

Browse files
Files changed (1) hide show
  1. pysr/sr.py +51 -22
pysr/sr.py CHANGED
@@ -21,9 +21,12 @@ else:
21
 
22
  import numpy as np
23
  import pandas as pd
 
 
24
  from sklearn.base import BaseEstimator, MultiOutputMixin, RegressorMixin
25
  from sklearn.utils import check_array, check_consistent_length, check_random_state
26
- from sklearn.utils.validation import _check_feature_names_in, check_is_fitted
 
27
 
28
  from .denoising import denoise, multi_denoise
29
  from .deprecated import DEPRECATED_KWARGS
@@ -179,6 +182,21 @@ VALID_OPTIMIZER_ALGORITHMS = ["BFGS", "NelderMead"]
179
 
180
 
181
  class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
  """
183
  High-performance symbolic regression algorithm.
184
 
@@ -603,22 +621,17 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
603
  Units of each variable in the training dataset, `y`.
604
  nout_ : int
605
  Number of output dimensions.
606
- selection_mask_ : list[int] of length `select_k_features`
607
- List of indices for input features that are selected when
608
- `select_k_features` is set.
609
  tempdir_ : Path
610
  Path to the temporary equations directory.
611
- equation_file_ : str
612
  Output equation file name produced by the julia backend.
613
  julia_state_stream_ : ndarray
614
  The serialized state for the julia SymbolicRegression.jl backend (after fitting),
615
  stored as an array of uint8, produced by Julia's Serialization.serialize function.
616
- julia_state_
617
- The deserialized state.
618
  julia_options_stream_ : ndarray
619
  The serialized julia options, stored as an array of uint8.
620
- julia_options_
621
- The deserialized julia options.
622
  equation_file_contents_ : list[pandas.DataFrame]
623
  Contents of the equation file output by the Julia backend.
624
  show_pickle_warnings_ : bool
@@ -926,7 +939,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
926
  Names of the features passed to the model.
927
  Not needed if loading from a pickle file.
928
  selection_mask : list[bool]
929
- If using select_k_features, you must pass `model.selection_mask_` here.
930
  Not needed if loading from a pickle file.
931
  nout : int
932
  Number of outputs of the model.
@@ -1124,10 +1137,12 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1124
 
1125
  @property
1126
  def julia_options_(self):
 
1127
  return jl_deserialize(self.julia_options_stream_)
1128
 
1129
  @property
1130
  def julia_state_(self):
 
1131
  return jl_deserialize(self.julia_state_stream_)
1132
 
1133
  @property
@@ -1140,7 +1155,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1140
  )
1141
  return self.julia_state_
1142
 
1143
- def get_best(self, index=None):
1144
  """
1145
  Get best equation using `model_selection`.
1146
 
@@ -1316,7 +1331,15 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1316
 
1317
  def _validate_and_set_fit_params(
1318
  self, X, y, Xresampled, weights, variable_names, X_units, y_units
1319
- ):
 
 
 
 
 
 
 
 
1320
  """
1321
  Validate the parameters passed to the :term:`fit` method.
1322
 
@@ -1336,7 +1359,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1336
  Weight array of the same shape as `y`.
1337
  Each element is how to weight the mean-square-error loss
1338
  for that particular element of y.
1339
- variable_names : list[str] of length n_features
1340
  Names of each variable in the training dataset, `X`.
1341
  X_units : list[str] of length n_features
1342
  Units of each variable in the training dataset, `X`.
@@ -1392,7 +1415,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1392
  if weights is not None:
1393
  weights = check_array(weights, ensure_2d=False)
1394
  check_consistent_length(weights, y)
1395
- X, y = self._validate_data(X=X, y=y, reset=True, multi_output=True)
1396
  self.feature_names_in_ = _safe_check_feature_names_in(
1397
  self, variable_names, generate_names=False
1398
  )
@@ -1402,10 +1425,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1402
  self.display_feature_names_in_ = np.array(
1403
  [f"x{_subscriptify(i)}" for i in range(X.shape[1])]
1404
  )
 
1405
  else:
1406
  self.display_feature_names_in_ = self.feature_names_in_
1407
-
1408
- variable_names = self.feature_names_in_
1409
 
1410
  # Handle multioutput data
1411
  if len(y.shape) == 1 or (len(y.shape) == 2 and y.shape[1] == 1):
@@ -1420,6 +1443,12 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1420
 
1421
  return X, y, Xresampled, weights, variable_names, X_units, y_units
1422
 
 
 
 
 
 
 
1423
  def _pre_transform_training_data(
1424
  self, X, y, Xresampled, variable_names, X_units, y_units, random_state
1425
  ):
@@ -1489,7 +1518,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1489
  self.X_units_ = copy.deepcopy(X_units)
1490
 
1491
  # Re-perform data validation and feature name updating
1492
- X, y = self._validate_data(X=X, y=y, reset=True, multi_output=True)
1493
  # Update feature names with selected variable names
1494
  self.feature_names_in_ = _check_feature_names_in(self, variable_names)
1495
  self.display_feature_names_in_ = self.feature_names_in_
@@ -1506,7 +1535,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1506
 
1507
  return X, y, variable_names, X_units, y_units
1508
 
1509
- def _run(self, X, y, mutated_params, weights, seed):
1510
  """
1511
  Run the symbolic regression fitting process on the julia backend.
1512
 
@@ -1784,9 +1813,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1784
  y,
1785
  Xresampled=None,
1786
  weights=None,
1787
- variable_names: Optional[List[str]] = None,
1788
- X_units: Optional[List[str]] = None,
1789
- y_units: Optional[List[str]] = None,
1790
  ) -> "PySRRegressor":
1791
  """
1792
  Search for equations to fit the dataset and store them in `self.equations_`.
@@ -2003,7 +2032,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
2003
  # reordered/reindexed to match those of the transformed (denoised and
2004
  # feature selected) X in fit.
2005
  X = X.reindex(columns=self.feature_names_in_)
2006
- X = self._validate_data(X, reset=False)
2007
 
2008
  try:
2009
  if isinstance(best_equation, list):
 
21
 
22
  import numpy as np
23
  import pandas as pd
24
+ from numpy import ndarray
25
+ from numpy.typing import NDArray
26
  from sklearn.base import BaseEstimator, MultiOutputMixin, RegressorMixin
27
  from sklearn.utils import check_array, check_consistent_length, check_random_state
28
+ from sklearn.utils.validation import _check_feature_names_in # type: ignore
29
+ from sklearn.utils.validation import check_is_fitted
30
 
31
  from .denoising import denoise, multi_denoise
32
  from .deprecated import DEPRECATED_KWARGS
 
182
 
183
 
184
  class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
185
+ equations_: Optional[Union[pd.DataFrame, List[pd.DataFrame]]]
186
+ n_features_in_: int
187
+ feature_names_in_: ArrayLike[str]
188
+ display_feature_names_in_: ArrayLike[str]
189
+ X_units_: Optional[ArrayLike[str]]
190
+ y_units_: Optional[Union[str, ArrayLike[str]]]
191
+ nout_: int
192
+ selection_mask_: Optional[NDArray[np.bool_]]
193
+ tempdir_: Path
194
+ equation_file_: Union[str, Path]
195
+ julia_state_stream_: Optional[NDArray[np.uint8]]
196
+ julia_options_stream_: Optional[NDArray[np.uint8]]
197
+ equation_file_contents_: Optional[List[pd.DataFrame]]
198
+ show_pickle_warnings_: bool
199
+
200
  """
201
  High-performance symbolic regression algorithm.
202
 
 
621
  Units of each variable in the training dataset, `y`.
622
  nout_ : int
623
  Number of output dimensions.
624
+ selection_mask_ : ndarray of shape (`n_features_in_`,)
625
+ Mask of which features of `X` to use when `select_k_features` is set.
 
626
  tempdir_ : Path
627
  Path to the temporary equations directory.
628
+ equation_file_ : Union[str, Path]
629
  Output equation file name produced by the julia backend.
630
  julia_state_stream_ : ndarray
631
  The serialized state for the julia SymbolicRegression.jl backend (after fitting),
632
  stored as an array of uint8, produced by Julia's Serialization.serialize function.
 
 
633
  julia_options_stream_ : ndarray
634
  The serialized julia options, stored as an array of uint8.
 
 
635
  equation_file_contents_ : list[pandas.DataFrame]
636
  Contents of the equation file output by the Julia backend.
637
  show_pickle_warnings_ : bool
 
939
  Names of the features passed to the model.
940
  Not needed if loading from a pickle file.
941
  selection_mask : list[bool]
942
+ If using `select_k_features`, you must pass `model.selection_mask_` here.
943
  Not needed if loading from a pickle file.
944
  nout : int
945
  Number of outputs of the model.
 
1137
 
1138
  @property
1139
  def julia_options_(self):
1140
+ """The deserialized julia options."""
1141
  return jl_deserialize(self.julia_options_stream_)
1142
 
1143
  @property
1144
  def julia_state_(self):
1145
+ """The deserialized state."""
1146
  return jl_deserialize(self.julia_state_stream_)
1147
 
1148
  @property
 
1155
  )
1156
  return self.julia_state_
1157
 
1158
+ def get_best(self, index=None) -> Union[pd.Series, List[pd.Series]]:
1159
  """
1160
  Get best equation using `model_selection`.
1161
 
 
1331
 
1332
  def _validate_and_set_fit_params(
1333
  self, X, y, Xresampled, weights, variable_names, X_units, y_units
1334
+ ) -> Tuple[
1335
+ ndarray,
1336
+ ndarray,
1337
+ Optional[ndarray],
1338
+ Optional[ndarray],
1339
+ ndarray,
1340
+ Optional[ArrayLike[str]],
1341
+ Optional[Union[str, ArrayLike[str]]],
1342
+ ]:
1343
  """
1344
  Validate the parameters passed to the :term:`fit` method.
1345
 
 
1359
  Weight array of the same shape as `y`.
1360
  Each element is how to weight the mean-square-error loss
1361
  for that particular element of y.
1362
+ variable_names : ndarray of length n_features
1363
  Names of each variable in the training dataset, `X`.
1364
  X_units : list[str] of length n_features
1365
  Units of each variable in the training dataset, `X`.
 
1415
  if weights is not None:
1416
  weights = check_array(weights, ensure_2d=False)
1417
  check_consistent_length(weights, y)
1418
+ X, y = self._validate_data_X_y(X, y)
1419
  self.feature_names_in_ = _safe_check_feature_names_in(
1420
  self, variable_names, generate_names=False
1421
  )
 
1425
  self.display_feature_names_in_ = np.array(
1426
  [f"x{_subscriptify(i)}" for i in range(X.shape[1])]
1427
  )
1428
+ variable_names = self.feature_names_in_
1429
  else:
1430
  self.display_feature_names_in_ = self.feature_names_in_
1431
+ variable_names = self.feature_names_in_
 
1432
 
1433
  # Handle multioutput data
1434
  if len(y.shape) == 1 or (len(y.shape) == 2 and y.shape[1] == 1):
 
1443
 
1444
  return X, y, Xresampled, weights, variable_names, X_units, y_units
1445
 
1446
+ def _validate_data_X_y(self, X, y) -> Tuple[ndarray, ndarray]:
1447
+ return self._validate_data(X=X, y=y, reset=True, multi_output=True) # type: ignore
1448
+
1449
+ def _validate_data_X(self, X) -> Tuple[ndarray]:
1450
+ return self._validate_data(X=X, reset=False) # type: ignore
1451
+
1452
  def _pre_transform_training_data(
1453
  self, X, y, Xresampled, variable_names, X_units, y_units, random_state
1454
  ):
 
1518
  self.X_units_ = copy.deepcopy(X_units)
1519
 
1520
  # Re-perform data validation and feature name updating
1521
+ X, y = self._validate_data_X_y(X, y)
1522
  # Update feature names with selected variable names
1523
  self.feature_names_in_ = _check_feature_names_in(self, variable_names)
1524
  self.display_feature_names_in_ = self.feature_names_in_
 
1535
 
1536
  return X, y, variable_names, X_units, y_units
1537
 
1538
+ def _run(self, X, y, mutated_params, weights, seed: int):
1539
  """
1540
  Run the symbolic regression fitting process on the julia backend.
1541
 
 
1813
  y,
1814
  Xresampled=None,
1815
  weights=None,
1816
+ variable_names: Optional[ArrayLike[str]] = None,
1817
+ X_units: Optional[ArrayLike[str]] = None,
1818
+ y_units: Optional[Union[str, ArrayLike[str]]] = None,
1819
  ) -> "PySRRegressor":
1820
  """
1821
  Search for equations to fit the dataset and store them in `self.equations_`.
 
2032
  # reordered/reindexed to match those of the transformed (denoised and
2033
  # feature selected) X in fit.
2034
  X = X.reindex(columns=self.feature_names_in_)
2035
+ X = self._validate_data_X(X)
2036
 
2037
  try:
2038
  if isinstance(best_equation, list):