Spaces:

MilesCranmer
/

PySR

Sleeping

App Files Files Community

MilesCranmer committed on May 5, 2024

Commit

fd4c500

unverified ·

1 Parent(s): b958ebf

fix: variety of typing information

Browse files

Files changed (5) hide show

pysr/denoising.py +17 -4
pysr/feature_selection.py +19 -3
pysr/julia_helpers.py +4 -0
pysr/julia_import.py +3 -2
pysr/sr.py +41 -26

pysr/denoising.py CHANGED Viewed

@@ -1,9 +1,17 @@
 """Functions for denoising data during preprocessing."""
 import numpy as np
-def denoise(X, y, Xresampled=None, random_state=None):
     """Denoise the dataset using a Gaussian process."""
     from sklearn.gaussian_process import GaussianProcessRegressor
     from sklearn.gaussian_process.kernels import RBF, ConstantKernel, WhiteKernel
@@ -15,12 +23,17 @@ def denoise(X, y, Xresampled=None, random_state=None):
     gpr.fit(X, y)
     if Xresampled is not None:
-        return Xresampled, gpr.predict(Xresampled)
-    return X, gpr.predict(X)
-def multi_denoise(X, y, Xresampled=None, random_state=None):
     """Perform `denoise` along each column of `y` independently."""
     y = np.stack(
         [

 """Functions for denoising data during preprocessing."""
+from typing import Optional, Tuple, cast
 import numpy as np
+from numpy import ndarray
+def denoise(
+    X: ndarray,
+    y: ndarray,
+    Xresampled: Optional[ndarray] = None,
+    random_state: Optional[np.random.RandomState] = None,
+) -> Tuple[ndarray, ndarray]:
     """Denoise the dataset using a Gaussian process."""
     from sklearn.gaussian_process import GaussianProcessRegressor
     from sklearn.gaussian_process.kernels import RBF, ConstantKernel, WhiteKernel
     gpr.fit(X, y)
     if Xresampled is not None:
+        return Xresampled, cast(ndarray, gpr.predict(Xresampled))
+    return X, cast(ndarray, gpr.predict(X))
+def multi_denoise(
+    X: ndarray,
+    y: ndarray,
+    Xresampled: Optional[ndarray] = None,
+    random_state: Optional[np.random.RandomState] = None,
+):
     """Perform `denoise` along each column of `y` independently."""
     y = np.stack(
         [

pysr/feature_selection.py CHANGED Viewed

@@ -1,9 +1,20 @@
 """Functions for doing feature selection during preprocessing."""
 import numpy as np
-def run_feature_selection(X, y, select_k_features, random_state=None):
     """
     Find most important features.
@@ -21,11 +32,16 @@ def run_feature_selection(X, y, select_k_features, random_state=None):
     selector = SelectFromModel(
         clf, threshold=-np.inf, max_features=select_k_features, prefit=True
     )
-    return selector.get_support(indices=True)
 # Function has not been removed only due to usage in module tests
-def _handle_feature_selection(X, select_k_features, y, variable_names):
     if select_k_features is not None:
         selection = run_feature_selection(X, y, select_k_features)
         print(f"Using features {[variable_names[i] for i in selection]}")

 """Functions for doing feature selection during preprocessing."""
+from typing import Optional, cast
 import numpy as np
+from numpy import ndarray
+from numpy.typing import NDArray
+from .utils import ArrayLike
+def run_feature_selection(
+    X: ndarray,
+    y: ndarray,
+    select_k_features: int,
+    random_state: Optional[np.random.RandomState] = None,
+) -> NDArray[np.intp]:
     """
     Find most important features.
     selector = SelectFromModel(
         clf, threshold=-np.inf, max_features=select_k_features, prefit=True
     )
+    return cast(NDArray[np.intp], selector.get_support(indices=True))
 # Function has not been removed only due to usage in module tests
+def _handle_feature_selection(
+    X: ndarray,
+    select_k_features: Optional[int],
+    y: ndarray,
+    variable_names: ArrayLike[str],
+):
     if select_k_features is not None:
         selection = run_feature_selection(X, y, select_k_features)
         print(f"Using features {[variable_names[i] for i in selection]}")

pysr/julia_helpers.py CHANGED Viewed

@@ -1,11 +1,15 @@
 """Functions for initializing the Julia environment and installing deps."""
 import numpy as np
 from juliacall import convert as jl_convert  # type: ignore
 from .deprecated import init_julia, install
 from .julia_import import jl
 jl.seval("using Serialization: Serialization")
 jl.seval("using PythonCall: PythonCall")

 """Functions for initializing the Julia environment and installing deps."""
+from typing import Any, Callable, cast
 import numpy as np
 from juliacall import convert as jl_convert  # type: ignore
 from .deprecated import init_julia, install
 from .julia_import import jl
+jl_convert = cast(Callable[[Any, Any], Any], jl_convert)
 jl.seval("using Serialization: Serialization")
 jl.seval("using PythonCall: PythonCall")

pysr/julia_import.py CHANGED Viewed

@@ -1,7 +1,8 @@
 import os
 import sys
 import warnings
-from typing import Any
 # Check if JuliaCall is already loaded, and if so, warn the user
 # about the relevant environment variables. If not loaded,
@@ -43,7 +44,7 @@ if autoload_extensions is not None:
 from juliacall import Main as jl  # type: ignore
-jl: Any = jl  # type: ignore
 jl_version = (jl.VERSION.major, jl.VERSION.minor, jl.VERSION.patch)

 import os
 import sys
 import warnings
+from types import ModuleType
+from typing import cast
 # Check if JuliaCall is already loaded, and if so, warn the user
 # about the relevant environment variables. If not loaded,
 from juliacall import Main as jl  # type: ignore
+jl = cast(ModuleType, jl)
 jl_version = (jl.VERSION.major, jl.VERSION.minor, jl.VERSION.patch)

pysr/sr.py CHANGED Viewed

@@ -679,7 +679,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
     X_units_: Optional[ArrayLike[str]]
     y_units_: Optional[Union[str, ArrayLike[str]]]
     nout_: int
-    selection_mask_: Optional[NDArray[np.bool_]]
     tempdir_: Path
     equation_file_: Union[str, Path]
     julia_state_stream_: Optional[NDArray[np.uint8]]
@@ -921,12 +921,12 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         cls,
         equation_file,
         *,
-        binary_operators=None,
-        unary_operators=None,
-        n_features_in=None,
-        feature_names_in=None,
-        selection_mask=None,
-        nout=1,
         **pysr_kwargs,
     ):
         """
@@ -949,7 +949,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         feature_names_in : list[str]
             Names of the features passed to the model.
             Not needed if loading from a pickle file.
-        selection_mask : list[bool]
             If using `select_k_features`, you must pass `model.selection_mask_` here.
             Not needed if loading from a pickle file.
         nout : int
@@ -1021,7 +1021,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             model.display_feature_names_in_ = feature_names_in
         if selection_mask is None:
-            model.selection_mask_ = np.ones(n_features_in, dtype=bool)
         else:
             model.selection_mask_ = selection_mask
@@ -1197,19 +1197,22 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
                 ), "With multiple output features, index must be a list."
                 return [eq.iloc[i] for eq, i in zip(self.equations_, index)]
             elif isinstance(self.equations_, pd.DataFrame):
-                return self.equations_.iloc[index]
             else:
                 raise ValueError("No equations have been generated yet.")
         if isinstance(self.equations_, list):
             return [
-                eq.loc[idx_model_selection(eq, self.model_selection)]
                 for eq in self.equations_
             ]
         elif isinstance(self.equations_, pd.DataFrame):
-            return self.equations_.loc[
-                idx_model_selection(self.equations_, self.model_selection)
-            ]
         else:
             raise ValueError("No equations have been generated yet.")
@@ -1351,7 +1354,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         ndarray,
         Optional[ndarray],
         Optional[ndarray],
-        ndarray,
         Optional[ArrayLike[str]],
         Optional[Union[str, ArrayLike[str]]],
     ]:
@@ -1459,13 +1462,22 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         return X, y, Xresampled, weights, variable_names, X_units, y_units
     def _validate_data_X_y(self, X, y) -> Tuple[ndarray, ndarray]:
-        return self._validate_data(X=X, y=y, reset=True, multi_output=True)  # type: ignore
     def _validate_data_X(self, X) -> Tuple[ndarray]:
-        return self._validate_data(X=X, reset=False)  # type: ignore
     def _pre_transform_training_data(
-        self, X, y, Xresampled, variable_names, X_units, y_units, random_state
     ):
         """
         Transform the training data before fitting the symbolic regressor.
@@ -1474,12 +1486,12 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         Parameters
         ----------
-        X : ndarray | pandas.DataFrame
             Training data of shape (n_samples, n_features).
-        y : ndarray | pandas.DataFrame
             Target values of shape (n_samples,) or (n_samples, n_targets).
             Will be cast to X's dtype if necessary.
-        Xresampled : ndarray | pandas.DataFrame
             Resampled training data, of shape `(n_resampled, n_features)`,
             used for denoising.
         variable_names : list[str]
@@ -1517,24 +1529,27 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         """
         # Feature selection transformation
         if self.select_k_features:
-            self.selection_mask_ = run_feature_selection(
                 X, y, self.select_k_features, random_state=random_state
             )
-            X = X[:, self.selection_mask_]
             if Xresampled is not None:
-                Xresampled = Xresampled[:, self.selection_mask_]
             # Reduce variable_names to selection
-            variable_names = [variable_names[i] for i in self.selection_mask_]
             if X_units is not None:
-                X_units = [X_units[i] for i in self.selection_mask_]
                 self.X_units_ = copy.deepcopy(X_units)
             # Re-perform data validation and feature name updating
             X, y = self._validate_data_X_y(X, y)
             # Update feature names with selected variable names
             self.feature_names_in_ = _check_feature_names_in(self, variable_names)
             self.display_feature_names_in_ = self.feature_names_in_
             print(f"Using features {self.feature_names_in_}")

     X_units_: Optional[ArrayLike[str]]
     y_units_: Optional[Union[str, ArrayLike[str]]]
     nout_: int
+    selection_mask_: Optional[NDArray[np.intp]]
     tempdir_: Path
     equation_file_: Union[str, Path]
     julia_state_stream_: Optional[NDArray[np.uint8]]
         cls,
         equation_file,
         *,
+        binary_operators: Optional[List[str]] = None,
+        unary_operators: Optional[List[str]] = None,
+        n_features_in: Optional[int] = None,
+        feature_names_in: Optional[ArrayLike[str]] = None,
+        selection_mask: Optional[NDArray[np.intp]] = None,
+        nout: int = 1,
         **pysr_kwargs,
     ):
         """
         feature_names_in : list[str]
             Names of the features passed to the model.
             Not needed if loading from a pickle file.
+        selection_mask : NDArray[np.intp]
             If using `select_k_features`, you must pass `model.selection_mask_` here.
             Not needed if loading from a pickle file.
         nout : int
             model.display_feature_names_in_ = feature_names_in
         if selection_mask is None:
+            model.selection_mask_ = np.arange(n_features_in, dtype=np.intp)
         else:
             model.selection_mask_ = selection_mask
                 ), "With multiple output features, index must be a list."
                 return [eq.iloc[i] for eq, i in zip(self.equations_, index)]
             elif isinstance(self.equations_, pd.DataFrame):
+                return cast(pd.Series, self.equations_.iloc[index])
             else:
                 raise ValueError("No equations have been generated yet.")
         if isinstance(self.equations_, list):
             return [
+                cast(pd.Series, eq.loc[idx_model_selection(eq, self.model_selection)])
                 for eq in self.equations_
             ]
         elif isinstance(self.equations_, pd.DataFrame):
+            return cast(
+                pd.Series,
+                self.equations_.loc[
+                    idx_model_selection(self.equations_, self.model_selection)
+                ],
+            )
         else:
             raise ValueError("No equations have been generated yet.")
         ndarray,
         Optional[ndarray],
         Optional[ndarray],
+        ArrayLike[str],
         Optional[ArrayLike[str]],
         Optional[Union[str, ArrayLike[str]]],
     ]:
         return X, y, Xresampled, weights, variable_names, X_units, y_units
     def _validate_data_X_y(self, X, y) -> Tuple[ndarray, ndarray]:
+        raw_out = self._validate_data(X=X, y=y, reset=True, multi_output=True)  # type: ignore
+        return cast(Tuple[ndarray, ndarray], raw_out)
     def _validate_data_X(self, X) -> Tuple[ndarray]:
+        raw_out = self._validate_data(X=X, reset=False)  # type: ignore
+        return cast(Tuple[ndarray], raw_out)
     def _pre_transform_training_data(
+        self,
+        X: ndarray,
+        y: ndarray,
+        Xresampled: Union[ndarray, None],
+        variable_names: ArrayLike[str],
+        X_units: Union[ArrayLike[str], None],
+        y_units: Union[ArrayLike[str], str, None],
+        random_state: np.random.RandomState,
     ):
         """
         Transform the training data before fitting the symbolic regressor.
         Parameters
         ----------
+        X : ndarray
             Training data of shape (n_samples, n_features).
+        y : ndarray
             Target values of shape (n_samples,) or (n_samples, n_targets).
             Will be cast to X's dtype if necessary.
+        Xresampled : ndarray | None
             Resampled training data, of shape `(n_resampled, n_features)`,
             used for denoising.
         variable_names : list[str]
         """
         # Feature selection transformation
         if self.select_k_features:
+            selection_mask = run_feature_selection(
                 X, y, self.select_k_features, random_state=random_state
             )
+            X = X[:, selection_mask]
             if Xresampled is not None:
+                Xresampled = Xresampled[:, selection_mask]
             # Reduce variable_names to selection
+            variable_names = cast(
+                ArrayLike[str], [variable_names[i] for i in selection_mask]
+            )
             if X_units is not None:
+                X_units = cast(ArrayLike[str], [X_units[i] for i in selection_mask])
                 self.X_units_ = copy.deepcopy(X_units)
             # Re-perform data validation and feature name updating
             X, y = self._validate_data_X_y(X, y)
             # Update feature names with selected variable names
+            self.selection_mask_ = selection_mask
             self.feature_names_in_ = _check_feature_names_in(self, variable_names)
             self.display_feature_names_in_ = self.feature_names_in_
             print(f"Using features {self.feature_names_in_}")