MilesCranmer committed on
Commit
fd4c500
1 Parent(s): b958ebf

fix: variety of typing information

Browse files
pysr/denoising.py CHANGED
@@ -1,9 +1,17 @@
1
  """Functions for denoising data during preprocessing."""
2
 
 
 
3
  import numpy as np
 
4
 
5
 
6
- def denoise(X, y, Xresampled=None, random_state=None):
 
 
 
 
 
7
  """Denoise the dataset using a Gaussian process."""
8
  from sklearn.gaussian_process import GaussianProcessRegressor
9
  from sklearn.gaussian_process.kernels import RBF, ConstantKernel, WhiteKernel
@@ -15,12 +23,17 @@ def denoise(X, y, Xresampled=None, random_state=None):
15
  gpr.fit(X, y)
16
 
17
  if Xresampled is not None:
18
- return Xresampled, gpr.predict(Xresampled)
19
 
20
- return X, gpr.predict(X)
21
 
22
 
23
- def multi_denoise(X, y, Xresampled=None, random_state=None):
 
 
 
 
 
24
  """Perform `denoise` along each column of `y` independently."""
25
  y = np.stack(
26
  [
 
1
  """Functions for denoising data during preprocessing."""
2
 
3
+ from typing import Optional, Tuple, cast
4
+
5
  import numpy as np
6
+ from numpy import ndarray
7
 
8
 
9
+ def denoise(
10
+ X: ndarray,
11
+ y: ndarray,
12
+ Xresampled: Optional[ndarray] = None,
13
+ random_state: Optional[np.random.RandomState] = None,
14
+ ) -> Tuple[ndarray, ndarray]:
15
  """Denoise the dataset using a Gaussian process."""
16
  from sklearn.gaussian_process import GaussianProcessRegressor
17
  from sklearn.gaussian_process.kernels import RBF, ConstantKernel, WhiteKernel
 
23
  gpr.fit(X, y)
24
 
25
  if Xresampled is not None:
26
+ return Xresampled, cast(ndarray, gpr.predict(Xresampled))
27
 
28
+ return X, cast(ndarray, gpr.predict(X))
29
 
30
 
31
+ def multi_denoise(
32
+ X: ndarray,
33
+ y: ndarray,
34
+ Xresampled: Optional[ndarray] = None,
35
+ random_state: Optional[np.random.RandomState] = None,
36
+ ):
37
  """Perform `denoise` along each column of `y` independently."""
38
  y = np.stack(
39
  [
pysr/feature_selection.py CHANGED
@@ -1,9 +1,20 @@
1
  """Functions for doing feature selection during preprocessing."""
2
 
 
 
3
  import numpy as np
 
 
 
 
4
 
5
 
6
- def run_feature_selection(X, y, select_k_features, random_state=None):
 
 
 
 
 
7
  """
8
  Find most important features.
9
 
@@ -21,11 +32,16 @@ def run_feature_selection(X, y, select_k_features, random_state=None):
21
  selector = SelectFromModel(
22
  clf, threshold=-np.inf, max_features=select_k_features, prefit=True
23
  )
24
- return selector.get_support(indices=True)
25
 
26
 
27
  # Function has not been removed only due to usage in module tests
28
- def _handle_feature_selection(X, select_k_features, y, variable_names):
 
 
 
 
 
29
  if select_k_features is not None:
30
  selection = run_feature_selection(X, y, select_k_features)
31
  print(f"Using features {[variable_names[i] for i in selection]}")
 
1
  """Functions for doing feature selection during preprocessing."""
2
 
3
+ from typing import Optional, cast
4
+
5
  import numpy as np
6
+ from numpy import ndarray
7
+ from numpy.typing import NDArray
8
+
9
+ from .utils import ArrayLike
10
 
11
 
12
+ def run_feature_selection(
13
+ X: ndarray,
14
+ y: ndarray,
15
+ select_k_features: int,
16
+ random_state: Optional[np.random.RandomState] = None,
17
+ ) -> NDArray[np.intp]:
18
  """
19
  Find most important features.
20
 
 
32
  selector = SelectFromModel(
33
  clf, threshold=-np.inf, max_features=select_k_features, prefit=True
34
  )
35
+ return cast(NDArray[np.intp], selector.get_support(indices=True))
36
 
37
 
38
  # Function has not been removed only due to usage in module tests
39
+ def _handle_feature_selection(
40
+ X: ndarray,
41
+ select_k_features: Optional[int],
42
+ y: ndarray,
43
+ variable_names: ArrayLike[str],
44
+ ):
45
  if select_k_features is not None:
46
  selection = run_feature_selection(X, y, select_k_features)
47
  print(f"Using features {[variable_names[i] for i in selection]}")
pysr/julia_helpers.py CHANGED
@@ -1,11 +1,15 @@
1
  """Functions for initializing the Julia environment and installing deps."""
2
 
 
 
3
  import numpy as np
4
  from juliacall import convert as jl_convert # type: ignore
5
 
6
  from .deprecated import init_julia, install
7
  from .julia_import import jl
8
 
 
 
9
  jl.seval("using Serialization: Serialization")
10
  jl.seval("using PythonCall: PythonCall")
11
 
 
1
  """Functions for initializing the Julia environment and installing deps."""
2
 
3
+ from typing import Any, Callable, cast
4
+
5
  import numpy as np
6
  from juliacall import convert as jl_convert # type: ignore
7
 
8
  from .deprecated import init_julia, install
9
  from .julia_import import jl
10
 
11
+ jl_convert = cast(Callable[[Any, Any], Any], jl_convert)
12
+
13
  jl.seval("using Serialization: Serialization")
14
  jl.seval("using PythonCall: PythonCall")
15
 
pysr/julia_import.py CHANGED
@@ -1,7 +1,8 @@
1
  import os
2
  import sys
3
  import warnings
4
- from typing import Any
 
5
 
6
  # Check if JuliaCall is already loaded, and if so, warn the user
7
  # about the relevant environment variables. If not loaded,
@@ -43,7 +44,7 @@ if autoload_extensions is not None:
43
 
44
  from juliacall import Main as jl # type: ignore
45
 
46
- jl: Any = jl # type: ignore
47
 
48
 
49
  jl_version = (jl.VERSION.major, jl.VERSION.minor, jl.VERSION.patch)
 
1
  import os
2
  import sys
3
  import warnings
4
+ from types import ModuleType
5
+ from typing import cast
6
 
7
  # Check if JuliaCall is already loaded, and if so, warn the user
8
  # about the relevant environment variables. If not loaded,
 
44
 
45
  from juliacall import Main as jl # type: ignore
46
 
47
+ jl = cast(ModuleType, jl)
48
 
49
 
50
  jl_version = (jl.VERSION.major, jl.VERSION.minor, jl.VERSION.patch)
pysr/sr.py CHANGED
@@ -679,7 +679,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
679
  X_units_: Optional[ArrayLike[str]]
680
  y_units_: Optional[Union[str, ArrayLike[str]]]
681
  nout_: int
682
- selection_mask_: Optional[NDArray[np.bool_]]
683
  tempdir_: Path
684
  equation_file_: Union[str, Path]
685
  julia_state_stream_: Optional[NDArray[np.uint8]]
@@ -921,12 +921,12 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
921
  cls,
922
  equation_file,
923
  *,
924
- binary_operators=None,
925
- unary_operators=None,
926
- n_features_in=None,
927
- feature_names_in=None,
928
- selection_mask=None,
929
- nout=1,
930
  **pysr_kwargs,
931
  ):
932
  """
@@ -949,7 +949,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
949
  feature_names_in : list[str]
950
  Names of the features passed to the model.
951
  Not needed if loading from a pickle file.
952
- selection_mask : list[bool]
953
  If using `select_k_features`, you must pass `model.selection_mask_` here.
954
  Not needed if loading from a pickle file.
955
  nout : int
@@ -1021,7 +1021,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1021
  model.display_feature_names_in_ = feature_names_in
1022
 
1023
  if selection_mask is None:
1024
- model.selection_mask_ = np.ones(n_features_in, dtype=bool)
1025
  else:
1026
  model.selection_mask_ = selection_mask
1027
 
@@ -1197,19 +1197,22 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1197
  ), "With multiple output features, index must be a list."
1198
  return [eq.iloc[i] for eq, i in zip(self.equations_, index)]
1199
  elif isinstance(self.equations_, pd.DataFrame):
1200
- return self.equations_.iloc[index]
1201
  else:
1202
  raise ValueError("No equations have been generated yet.")
1203
 
1204
  if isinstance(self.equations_, list):
1205
  return [
1206
- eq.loc[idx_model_selection(eq, self.model_selection)]
1207
  for eq in self.equations_
1208
  ]
1209
  elif isinstance(self.equations_, pd.DataFrame):
1210
- return self.equations_.loc[
1211
- idx_model_selection(self.equations_, self.model_selection)
1212
- ]
 
 
 
1213
  else:
1214
  raise ValueError("No equations have been generated yet.")
1215
 
@@ -1351,7 +1354,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1351
  ndarray,
1352
  Optional[ndarray],
1353
  Optional[ndarray],
1354
- ndarray,
1355
  Optional[ArrayLike[str]],
1356
  Optional[Union[str, ArrayLike[str]]],
1357
  ]:
@@ -1459,13 +1462,22 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1459
  return X, y, Xresampled, weights, variable_names, X_units, y_units
1460
 
1461
  def _validate_data_X_y(self, X, y) -> Tuple[ndarray, ndarray]:
1462
- return self._validate_data(X=X, y=y, reset=True, multi_output=True) # type: ignore
 
1463
 
1464
  def _validate_data_X(self, X) -> Tuple[ndarray]:
1465
- return self._validate_data(X=X, reset=False) # type: ignore
 
1466
 
1467
  def _pre_transform_training_data(
1468
- self, X, y, Xresampled, variable_names, X_units, y_units, random_state
 
 
 
 
 
 
 
1469
  ):
1470
  """
1471
  Transform the training data before fitting the symbolic regressor.
@@ -1474,12 +1486,12 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1474
 
1475
  Parameters
1476
  ----------
1477
- X : ndarray | pandas.DataFrame
1478
  Training data of shape (n_samples, n_features).
1479
- y : ndarray | pandas.DataFrame
1480
  Target values of shape (n_samples,) or (n_samples, n_targets).
1481
  Will be cast to X's dtype if necessary.
1482
- Xresampled : ndarray | pandas.DataFrame
1483
  Resampled training data, of shape `(n_resampled, n_features)`,
1484
  used for denoising.
1485
  variable_names : list[str]
@@ -1517,24 +1529,27 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1517
  """
1518
  # Feature selection transformation
1519
  if self.select_k_features:
1520
- self.selection_mask_ = run_feature_selection(
1521
  X, y, self.select_k_features, random_state=random_state
1522
  )
1523
- X = X[:, self.selection_mask_]
1524
 
1525
  if Xresampled is not None:
1526
- Xresampled = Xresampled[:, self.selection_mask_]
1527
 
1528
  # Reduce variable_names to selection
1529
- variable_names = [variable_names[i] for i in self.selection_mask_]
 
 
1530
 
1531
  if X_units is not None:
1532
- X_units = [X_units[i] for i in self.selection_mask_]
1533
  self.X_units_ = copy.deepcopy(X_units)
1534
 
1535
  # Re-perform data validation and feature name updating
1536
  X, y = self._validate_data_X_y(X, y)
1537
  # Update feature names with selected variable names
 
1538
  self.feature_names_in_ = _check_feature_names_in(self, variable_names)
1539
  self.display_feature_names_in_ = self.feature_names_in_
1540
  print(f"Using features {self.feature_names_in_}")
 
679
  X_units_: Optional[ArrayLike[str]]
680
  y_units_: Optional[Union[str, ArrayLike[str]]]
681
  nout_: int
682
+ selection_mask_: Optional[NDArray[np.intp]]
683
  tempdir_: Path
684
  equation_file_: Union[str, Path]
685
  julia_state_stream_: Optional[NDArray[np.uint8]]
 
921
  cls,
922
  equation_file,
923
  *,
924
+ binary_operators: Optional[List[str]] = None,
925
+ unary_operators: Optional[List[str]] = None,
926
+ n_features_in: Optional[int] = None,
927
+ feature_names_in: Optional[ArrayLike[str]] = None,
928
+ selection_mask: Optional[NDArray[np.intp]] = None,
929
+ nout: int = 1,
930
  **pysr_kwargs,
931
  ):
932
  """
 
949
  feature_names_in : list[str]
950
  Names of the features passed to the model.
951
  Not needed if loading from a pickle file.
952
+ selection_mask : NDArray[np.intp]
953
  If using `select_k_features`, you must pass `model.selection_mask_` here.
954
  Not needed if loading from a pickle file.
955
  nout : int
 
1021
  model.display_feature_names_in_ = feature_names_in
1022
 
1023
  if selection_mask is None:
1024
+ model.selection_mask_ = np.arange(n_features_in, dtype=np.intp)
1025
  else:
1026
  model.selection_mask_ = selection_mask
1027
 
 
1197
  ), "With multiple output features, index must be a list."
1198
  return [eq.iloc[i] for eq, i in zip(self.equations_, index)]
1199
  elif isinstance(self.equations_, pd.DataFrame):
1200
+ return cast(pd.Series, self.equations_.iloc[index])
1201
  else:
1202
  raise ValueError("No equations have been generated yet.")
1203
 
1204
  if isinstance(self.equations_, list):
1205
  return [
1206
+ cast(pd.Series, eq.loc[idx_model_selection(eq, self.model_selection)])
1207
  for eq in self.equations_
1208
  ]
1209
  elif isinstance(self.equations_, pd.DataFrame):
1210
+ return cast(
1211
+ pd.Series,
1212
+ self.equations_.loc[
1213
+ idx_model_selection(self.equations_, self.model_selection)
1214
+ ],
1215
+ )
1216
  else:
1217
  raise ValueError("No equations have been generated yet.")
1218
 
 
1354
  ndarray,
1355
  Optional[ndarray],
1356
  Optional[ndarray],
1357
+ ArrayLike[str],
1358
  Optional[ArrayLike[str]],
1359
  Optional[Union[str, ArrayLike[str]]],
1360
  ]:
 
1462
  return X, y, Xresampled, weights, variable_names, X_units, y_units
1463
 
1464
  def _validate_data_X_y(self, X, y) -> Tuple[ndarray, ndarray]:
1465
+ raw_out = self._validate_data(X=X, y=y, reset=True, multi_output=True) # type: ignore
1466
+ return cast(Tuple[ndarray, ndarray], raw_out)
1467
 
1468
  def _validate_data_X(self, X) -> Tuple[ndarray]:
1469
+ raw_out = self._validate_data(X=X, reset=False) # type: ignore
1470
+ return cast(Tuple[ndarray], raw_out)
1471
 
1472
  def _pre_transform_training_data(
1473
+ self,
1474
+ X: ndarray,
1475
+ y: ndarray,
1476
+ Xresampled: Union[ndarray, None],
1477
+ variable_names: ArrayLike[str],
1478
+ X_units: Union[ArrayLike[str], None],
1479
+ y_units: Union[ArrayLike[str], str, None],
1480
+ random_state: np.random.RandomState,
1481
  ):
1482
  """
1483
  Transform the training data before fitting the symbolic regressor.
 
1486
 
1487
  Parameters
1488
  ----------
1489
+ X : ndarray
1490
  Training data of shape (n_samples, n_features).
1491
+ y : ndarray
1492
  Target values of shape (n_samples,) or (n_samples, n_targets).
1493
  Will be cast to X's dtype if necessary.
1494
+ Xresampled : ndarray | None
1495
  Resampled training data, of shape `(n_resampled, n_features)`,
1496
  used for denoising.
1497
  variable_names : list[str]
 
1529
  """
1530
  # Feature selection transformation
1531
  if self.select_k_features:
1532
+ selection_mask = run_feature_selection(
1533
  X, y, self.select_k_features, random_state=random_state
1534
  )
1535
+ X = X[:, selection_mask]
1536
 
1537
  if Xresampled is not None:
1538
+ Xresampled = Xresampled[:, selection_mask]
1539
 
1540
  # Reduce variable_names to selection
1541
+ variable_names = cast(
1542
+ ArrayLike[str], [variable_names[i] for i in selection_mask]
1543
+ )
1544
 
1545
  if X_units is not None:
1546
+ X_units = cast(ArrayLike[str], [X_units[i] for i in selection_mask])
1547
  self.X_units_ = copy.deepcopy(X_units)
1548
 
1549
  # Re-perform data validation and feature name updating
1550
  X, y = self._validate_data_X_y(X, y)
1551
  # Update feature names with selected variable names
1552
+ self.selection_mask_ = selection_mask
1553
  self.feature_names_in_ = _check_feature_names_in(self, variable_names)
1554
  self.display_feature_names_in_ = self.feature_names_in_
1555
  print(f"Using features {self.feature_names_in_}")