tttc3 committed on
Commit
83d8e67
1 Parent(s): 2d0032e

Fixed issues outlined in pull request review

Browse files
Files changed (2) hide show
  1. pysr/export_numpy.py +9 -1
  2. pysr/sr.py +53 -30
pysr/export_numpy.py CHANGED
@@ -2,6 +2,7 @@
2
  import numpy as np
3
  import pandas as pd
4
  from sympy import lambdify
 
5
 
6
 
7
  class CallableEquation:
@@ -25,5 +26,12 @@ class CallableEquation:
25
  **{k: X[k].values for k in self._variable_names}
26
  ) * np.ones(expected_shape)
27
  if self._selection is not None:
28
- X = X[:, self._selection]
 
 
 
 
 
 
 
29
  return self._lambda(*X.T) * np.ones(expected_shape)
 
2
  import numpy as np
3
  import pandas as pd
4
  from sympy import lambdify
5
+ import warnings
6
 
7
 
8
  class CallableEquation:
 
26
  **{k: X[k].values for k in self._variable_names}
27
  ) * np.ones(expected_shape)
28
  if self._selection is not None:
29
+ if X.shape[1] != len(self._selection):
30
+ warnings.warn(
31
+ "`X` should be of shape (n_samples, len(self._selection)). "
32
+ "Automatically filtering `X` to selection. "
33
+ "Note: Filtered `X` column order may not match column order in fit "
34
+ "this may lead to incorrect predictions and other errors."
35
+ )
36
+ X = X[:, self._selection]
37
  return self._lambda(*X.T) * np.ones(expected_shape)
pysr/sr.py CHANGED
@@ -190,7 +190,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
190
  binary_operators : list[str], default=["+", "-", "*", "/"]
191
  List of strings giving the binary operators in Julia's Base.
192
 
193
- unary_operators : list[str], default=[]
194
  Same as :param`binary_operators` but for operators taking a
195
  single scalar.
196
 
@@ -226,7 +226,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
226
  timeout_in_seconds : float, default=None
227
  Make the search return early once this many seconds have passed.
228
 
229
- constraints : dict[str, int | tuple[int,int]], default={}
230
  Dictionary of int (unary) or 2-tuples (binary), this enforces
231
  maxsize constraints on the individual arguments of operators.
232
  E.g., `'pow': (-1, 1)` says that power laws can have any
@@ -462,7 +462,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
462
  Whether to create a 'torch_format' column in the output,
463
  containing a torch module with trainable parameters.
464
 
465
- extra_sympy_mappings : dict[str, Callable], default={}
466
  Provides mappings between custom :param`binary_operators` or
467
  :param`unary_operators` defined in julia strings, to those same
468
  operators defined in sympy.
@@ -470,13 +470,13 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
470
  model to be export to sympy, :param`extra_sympy_mappings`
471
  would be `{"inv": lambda x: 1/x}`.
472
 
473
- extra_jax_mappings : dict[Callable, str], default={}
474
  Similar to :param`extra_sympy_mappings` but for model export
475
  to jax. The dictionary maps sympy functions to jax functions.
476
  For example: `extra_jax_mappings={sympy.sin: "jnp.sin"}` maps
477
  the `sympy.sin` function to the equivalent jax expression `jnp.sin`.
478
 
479
- extra_torch_mappings : dict[Callable, Callable], default={}
480
  The same as :param`extra_jax_mappings` but for model export
481
  to pytorch. Note that the dictionary keys should be callable
482
  pytorch expressions.
@@ -571,13 +571,8 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
571
  self,
572
  model_selection="best",
573
  *,
574
- binary_operators=[
575
- "+",
576
- "-",
577
- "*",
578
- "/",
579
- ],
580
- unary_operators=[],
581
  niterations=40,
582
  populations=15,
583
  population_size=33,
@@ -586,7 +581,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
586
  maxdepth=None,
587
  warmup_maxsize_by=0.0,
588
  timeout_in_seconds=None,
589
- constraints={},
590
  nested_constraints=None,
591
  loss="L2DistLoss()",
592
  complexity_of_operators=None,
@@ -640,9 +635,9 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
640
  update=True,
641
  output_jax_format=False,
642
  output_torch_format=False,
643
- extra_sympy_mappings={},
644
- extra_torch_mappings={},
645
- extra_jax_mappings={},
646
  denoise=False,
647
  select_k_features=None,
648
  **kwargs,
@@ -888,6 +883,14 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
888
 
889
  """
890
  # Handle None values for instance parameters:
 
 
 
 
 
 
 
 
891
  if self.multithreading is None:
892
  # Default is multithreading=True, unless explicitly set,
893
  # or procs is set to 0 (serial mode).
@@ -1018,11 +1021,12 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
1018
 
1019
  """
1020
  if isinstance(X, pd.DataFrame):
1021
- variable_names = None
1022
- warnings.warn(
1023
- ":param`variable_names` has been reset to `None` as `X` is a DataFrame. "
1024
- "Will use DataFrame column names instead."
1025
- )
 
1026
 
1027
  if X.columns.is_object() and X.columns.str.contains(" ").any():
1028
  X.columns = X.columns.str.replace(" ", "_")
@@ -1395,7 +1399,6 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
1395
  self : object
1396
  Fitted Estimator.
1397
  """
1398
-
1399
  # Init attributes that are not specified in BaseEstimator
1400
  self.equations_ = None
1401
  self.nout_ = 1
@@ -1482,14 +1485,35 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
1482
  ValueError
1483
  Raises if the `best_equation` cannot be evaluated.
1484
  """
1485
- check_is_fitted(self)
1486
-
1487
- if isinstance(X, pd.DataFrame):
1488
- X = X[self.feature_names_in_]
1489
- elif self.selection_mask_ is not None:
1490
- X = X[:, self.selection_mask_]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1491
 
1492
  X = self._validate_data(X, reset=False)
 
1493
  try:
1494
  if self.nout_ > 1:
1495
  return np.stack(
@@ -1685,8 +1709,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
1685
  }
1686
 
1687
  sympy_symbols = [
1688
- sympy.Symbol(self.feature_names_in_[i])
1689
- for i in range(self.n_features_in_)
1690
  ]
1691
 
1692
  for _, eqn_row in output.iterrows():
 
190
  binary_operators : list[str], default=["+", "-", "*", "/"]
191
  List of strings giving the binary operators in Julia's Base.
192
 
193
+ unary_operators : list[str], default=None
194
  Same as :param`binary_operators` but for operators taking a
195
  single scalar.
196
 
 
226
  timeout_in_seconds : float, default=None
227
  Make the search return early once this many seconds have passed.
228
 
229
+ constraints : dict[str, int | tuple[int,int]], default=None
230
  Dictionary of int (unary) or 2-tuples (binary), this enforces
231
  maxsize constraints on the individual arguments of operators.
232
  E.g., `'pow': (-1, 1)` says that power laws can have any
 
462
  Whether to create a 'torch_format' column in the output,
463
  containing a torch module with trainable parameters.
464
 
465
+ extra_sympy_mappings : dict[str, Callable], default=None
466
  Provides mappings between custom :param`binary_operators` or
467
  :param`unary_operators` defined in julia strings, to those same
468
  operators defined in sympy.
 
470
  model to be export to sympy, :param`extra_sympy_mappings`
471
  would be `{"inv": lambda x: 1/x}`.
472
 
473
+ extra_jax_mappings : dict[Callable, str], default=None
474
  Similar to :param`extra_sympy_mappings` but for model export
475
  to jax. The dictionary maps sympy functions to jax functions.
476
  For example: `extra_jax_mappings={sympy.sin: "jnp.sin"}` maps
477
  the `sympy.sin` function to the equivalent jax expression `jnp.sin`.
478
 
479
+ extra_torch_mappings : dict[Callable, Callable], default=None
480
  The same as :param`extra_jax_mappings` but for model export
481
  to pytorch. Note that the dictionary keys should be callable
482
  pytorch expressions.
 
571
  self,
572
  model_selection="best",
573
  *,
574
+ binary_operators=None,
575
+ unary_operators=None,
 
 
 
 
 
576
  niterations=40,
577
  populations=15,
578
  population_size=33,
 
581
  maxdepth=None,
582
  warmup_maxsize_by=0.0,
583
  timeout_in_seconds=None,
584
+ constraints=None,
585
  nested_constraints=None,
586
  loss="L2DistLoss()",
587
  complexity_of_operators=None,
 
635
  update=True,
636
  output_jax_format=False,
637
  output_torch_format=False,
638
+ extra_sympy_mappings=None,
639
+ extra_torch_mappings=None,
640
+ extra_jax_mappings=None,
641
  denoise=False,
642
  select_k_features=None,
643
  **kwargs,
 
883
 
884
  """
885
  # Handle None values for instance parameters:
886
+ if self.binary_operators is None:
887
+ self.binary_operators = "+ * - /".split(" ")
888
+ if self.unary_operators is None:
889
+ self.unary_operators = []
890
+ if self.extra_sympy_mappings is None:
891
+ self.extra_sympy_mappings = {}
892
+ if self.constraints is None:
893
+ self.constraints = {}
894
  if self.multithreading is None:
895
  # Default is multithreading=True, unless explicitly set,
896
  # or procs is set to 0 (serial mode).
 
1021
 
1022
  """
1023
  if isinstance(X, pd.DataFrame):
1024
+ if variable_names:
1025
+ variable_names = None
1026
+ warnings.warn(
1027
+ ":param`variable_names` has been reset to `None` as `X` is a DataFrame. "
1028
+ "Will use DataFrame column names instead."
1029
+ )
1030
 
1031
  if X.columns.is_object() and X.columns.str.contains(" ").any():
1032
  X.columns = X.columns.str.replace(" ", "_")
 
1399
  self : object
1400
  Fitted Estimator.
1401
  """
 
1402
  # Init attributes that are not specified in BaseEstimator
1403
  self.equations_ = None
1404
  self.nout_ = 1
 
1485
  ValueError
1486
  Raises if the `best_equation` cannot be evaluated.
1487
  """
1488
+ check_is_fitted(self, attributes=["equations_", "feature_names_in_"])
1489
+
1490
+ # When X is a numpy array or a pandas dataframe with a RangeIndex,
1491
+ # the self.feature_names_in_ generated during fit, for the same X,
1492
+ # will cause a warning to be thrown during _validate_data.
1493
+ # To avoid this, convert X to a dataframe, apply the selection mask,
1494
+ # and then set the column/feature_names of X to be equal to those
1495
+ # generated during fit.
1496
+ if isinstance(X, np.ndarray):
1497
+ X = pd.DataFrame(X)
1498
+
1499
+ if isinstance(X.columns, pd.RangeIndex):
1500
+ if self.selection_mask_:
1501
+ # RangeIndex enforces column order allowing columns to
1502
+ # be correctly filtered with self.selection_mask_
1503
+ X = X.iloc[:, self.selection_mask_]
1504
+ X.columns = self.feature_names_in_
1505
+
1506
+ # Without feature information, CallableEquation/lambda_format equations
1507
+ # require that the column order of X matches that of the X used during
1508
+ # the fitting process. _validate_data removes this feature information
1509
+ # when it converts the dataframe to an np array. Thus, to ensure feature
1510
+ # order is preserved after conversion, the dataframe columns must be
1511
+ # reordered/reindexed to match those of the transformed (denoised and
1512
+ # feature selected) X in fit.
1513
+ X = X.reindex(columns=self.feature_names_in_)
1514
 
1515
  X = self._validate_data(X, reset=False)
1516
+
1517
  try:
1518
  if self.nout_ > 1:
1519
  return np.stack(
 
1709
  }
1710
 
1711
  sympy_symbols = [
1712
+ sympy.Symbol(variable) for variable in self.feature_names_in_
 
1713
  ]
1714
 
1715
  for _, eqn_row in output.iterrows():