MilesCranmer committed on
Commit b7e75e1
1 Parent(s): 65a4248

Update all docstrings

Files changed (1)
  1. pysr/sr.py +55 -20
pysr/sr.py CHANGED
@@ -17,6 +17,9 @@ is_julia_warning_silenced = False
 
 
 def install(julia_project=None): # pragma: no cover
+    """Install PyCall.jl and all required dependencies for SymbolicRegression.jl.
+
+    Also updates the local Julia registry."""
     import julia
 
     julia.install()
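As a minimal usage sketch of the function documented in this hunk (assumes PySR is already installed from PyPI):

```python
# One-time setup of the Julia backend after `pip install pysr`.
import pysr

# Installs PyCall.jl and SymbolicRegression.jl, and updates the local Julia registry.
pysr.install()
```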
@@ -405,14 +408,26 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
     :type binary_operators: list
     :param unary_operators: Same but for operators taking a single scalar. Default is [].
     :type unary_operators: list
-    :param procs: Number of processes (=number of populations running).
-    :type procs: int
-    :param loss: String of Julia code specifying the loss function. Can either be a loss from LossFunctions.jl, or your own loss written as a function. Examples of custom written losses include: `myloss(x, y) = abs(x-y)` for non-weighted, or `myloss(x, y, w) = w*abs(x-y)` for weighted. Among the included losses, these are as follows. Regression: `LPDistLoss{P}()`, `L1DistLoss()`, `L2DistLoss()` (mean square), `LogitDistLoss()`, `HuberLoss(d)`, `L1EpsilonInsLoss(ϵ)`, `L2EpsilonInsLoss(ϵ)`, `PeriodicLoss(c)`, `QuantileLoss(τ)`. Classification: `ZeroOneLoss()`, `PerceptronLoss()`, `L1HingeLoss()`, `SmoothedL1HingeLoss(γ)`, `ModifiedHuberLoss()`, `L2MarginLoss()`, `ExpLoss()`, `SigmoidLoss()`, `DWDMarginLoss(q)`.
-    :type loss: str
-    :param populations: Number of populations running.
-    :type populations: int
     :param niterations: Number of iterations of the algorithm to run. The best equations are printed, and migrate between populations, at the end of each.
     :type niterations: int
+    :param populations: Number of populations running.
+    :type populations: int
+    :param loss: String of Julia code specifying the loss function. Can either be a loss from LossFunctions.jl, or your own loss written as a function. Examples of custom written losses include: `myloss(x, y) = abs(x-y)` for non-weighted, or `myloss(x, y, w) = w*abs(x-y)` for weighted. Among the included losses, these are as follows. Regression: `LPDistLoss{P}()`, `L1DistLoss()`, `L2DistLoss()` (mean square), `LogitDistLoss()`, `HuberLoss(d)`, `L1EpsilonInsLoss(ϵ)`, `L2EpsilonInsLoss(ϵ)`, `PeriodicLoss(c)`, `QuantileLoss(τ)`. Classification: `ZeroOneLoss()`, `PerceptronLoss()`, `L1HingeLoss()`, `SmoothedL1HingeLoss(γ)`, `ModifiedHuberLoss()`, `L2MarginLoss()`, `ExpLoss()`, `SigmoidLoss()`, `DWDMarginLoss(q)`.
+    :type loss: str
+    :param denoise: Whether to use a Gaussian Process to denoise the data before inputting to PySR. Can help PySR fit noisy data.
+    :type denoise: bool
+    :param select_k_features: whether to run feature selection in Python using random forests, before passing to the symbolic regression code. None means no feature selection; an int means select that many features.
+    :type select_k_features: None/int
+    :param procs: Number of processes (=number of populations running).
+    :type procs: int
+    :param multithreading: Use multithreading instead of distributed backend. Default is yes. Using procs=0 will turn off both.
+    :type multithreading: bool
+    :param batching: whether to compare population members on small batches during evolution. Still uses full dataset for comparing against hall of fame.
+    :type batching: bool
+    :param batchSize: the amount of data to use if doing batching.
+    :type batchSize: int
+    :param maxsize: Max size of an equation.
+    :type maxsize: int
     :param ncyclesperiteration: Number of total mutations to run, per 10 samples of the population, per iteration.
     :type ncyclesperiteration: int
     :param alpha: Initial temperature.
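A hedged sketch of how the parameters documented above are passed to the constructor; the keyword names mirror the `:param:` entries in this docstring, but every value below is illustrative only:

```python
from pysr import PySRRegressor

model = PySRRegressor(
    niterations=5,
    populations=20,                            # number of populations running
    binary_operators=["+", "*"],
    unary_operators=["cos"],
    loss="myloss(x, y, w) = w * abs(x - y)",   # custom weighted loss, in Julia syntax
    denoise=True,                              # Gaussian-process denoising of the data
    select_k_features=3,                       # random-forest feature selection in Python
    procs=4,
    multithreading=True,                       # threads instead of the distributed backend
    batching=True,
    batchSize=50,                              # batch size used during evolution
    maxsize=20,                                # maximum equation size
)
```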
@@ -459,20 +474,12 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
     :type verbosity: int
     :param progress: Whether to use a progress bar instead of printing to stdout.
     :type progress: bool
-    :param maxsize: Max size of an equation.
-    :type maxsize: int
     :param maxdepth: Max depth of an equation. You can use both maxsize and maxdepth. maxdepth is by default set to = maxsize, which means that it is redundant.
     :type maxdepth: int
     :param fast_cycle: (experimental) - batch over population subsamples. This is a slightly different algorithm than regularized evolution, but does cycles 15% faster. May be algorithmically less efficient.
     :type fast_cycle: bool
     :param variable_names: a list of names for the variables, other than "x0", "x1", etc.
     :type variable_names: list
-    :param batching: whether to compare population members on small batches during evolution. Still uses full dataset for comparing against hall of fame.
-    :type batching: bool
-    :param batchSize: the amount of data to use if doing batching.
-    :type batchSize: int
-    :param select_k_features: whether to run feature selection in Python using random forests, before passing to the symbolic regression code. None means no feature selection; an int means select that many features.
-    :type select_k_features: None/int
     :param warmupMaxsizeBy: whether to slowly increase max size from a small number up to the maxsize (if greater than 0). If greater than 0, says the fraction of training time at which the current maxsize will reach the user-passed maxsize.
     :type warmupMaxsizeBy: float
     :param constraints: dictionary of int (unary) or 2-tuples (binary), this enforces maxsize constraints on the individual arguments of operators. E.g., `'pow': (-1, 1)` says that power laws can have any complexity left argument, but only 1 complexity exponent. Use this to force more interpretable solutions.
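For the `constraints` and `warmupMaxsizeBy` options above, a small illustrative sketch (a separate instance; the `'pow': (-1, 1)` entry is the example given in the docstring):

```python
model = PySRRegressor(
    binary_operators=["+", "*", "pow"],
    constraints={"pow": (-1, 1)},  # arbitrarily complex base, exponent limited to complexity 1
    warmupMaxsizeBy=0.5,           # reach the full maxsize halfway through training
)
```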
@@ -497,12 +504,8 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
     :type tournament_selection_n: int
     :param tournament_selection_p: Probability of selecting the best expression in each tournament. The probability will decay as p*(1-p)^n for other expressions, sorted by loss.
     :type tournament_selection_p: float
-    :param denoise: Whether to use a Gaussian Process to denoise the data before inputting to PySR. Can help PySR fit noisy data.
-    :type denoise: bool
     :param precision: What precision to use for the data. By default this is 32 (float32), but you can select 64 or 16 as well.
     :type precision: int
-    :param multithreading: Use multithreading instead of distributed backend. Default is yes. Using procs=0 will turn off both.
-    :type multithreading: bool
     :param **kwargs: Other options passed to SymbolicRegression.Options, for example, if you modify SymbolicRegression.jl to include additional arguments.
     :type **kwargs: dict
     :returns: Results dataframe, giving complexity, MSE, and equations (as strings), as well as functional forms. If list, each element corresponds to a dataframe of equations for each output.
@@ -666,6 +669,11 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         ]
 
     def __repr__(self):
+        """Prints all current equations fitted by the model.
+
+        The string `>>>>` denotes which equation is selected by the
+        `model_selection`.
+        """
         if self.equations is None:
             return "PySRRegressor.equations = None"
 
@@ -712,7 +720,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         return output
 
     def set_params(self, **params):
-        """Set parameters for pysr.pysr call or model_selection strategy."""
+        """Set parameters for equation search."""
         for key, value in params.items():
             if key in self.surface_parameters:
                 self.__setattr__(key, value)
@@ -723,6 +731,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         return self
 
     def get_params(self, deep=True):
+        """Get parameters for equation search."""
        del deep
        return {
            **self.params,
@@ -730,6 +739,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
        }
 
     def get_best(self):
+        """Get best equation using `model_selection`."""
        if self.equations is None:
            raise ValueError("No equations have been generated yet.")
        if self.model_selection == "accuracy":
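The three accessors above follow the scikit-learn pattern; a brief sketch of how they might be used (illustrative only):

```python
params = model.get_params()          # current search settings as a dict
model.set_params(niterations=10)     # update settings before the next fit()
# Only valid once equations exist; otherwise get_best() raises ValueError:
# best = model.get_best()            # equation picked by `model_selection`
```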
@@ -746,7 +756,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
        )
 
     def fit(self, X, y, weights=None, variable_names=None):
-        """Search for equations to fit the dataset.
+        """Search for equations to fit the dataset and store them in `self.equations`.
 
         :param X: 2D array. Rows are examples, columns are features. If pandas DataFrame, the columns are used for variable names (so make sure they don't contain spaces).
         :type X: np.ndarray/pandas.DataFrame
@@ -755,6 +765,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         :param weights: Optional. Same shape as y. Each element is how to weight the mean-square-error loss for that particular element of y.
         :type weights: np.ndarray
         :param variable_names: a list of names for the variables, other than "x0", "x1", etc.
+            You can also pass a pandas DataFrame for X.
         :type variable_names: list
         """
         if variable_names is None:
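A sketch of a fit call matching the signature above; the toy data and variable names are made up for illustration:

```python
import numpy as np

X = np.random.randn(100, 5)
y = 2.0 * np.cos(X[:, 3]) + X[:, 0] ** 2 - 2.0   # toy target

model.fit(X, y, variable_names=["x0", "x1", "x2", "x3", "x4"])
print(model)  # __repr__ marks the `model_selection` choice with >>>>
```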
@@ -775,6 +786,15 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         self.equations = self.get_hof()
 
     def predict(self, X):
+        """Predict y from input X using the equation chosen by `model_selection`.
+
+        You may see what equation is used by printing this object. X should have the same
+        columns as the training data.
+
+        :param X: 2D array. Rows are examples, columns are features. If pandas DataFrame, the columns are used for variable names (so make sure they don't contain spaces).
+        :type X: np.ndarray/pandas.DataFrame
+        :return: 1D array (rows are examples) or 2D array (rows are examples, columns are outputs).
+        """
         self.refresh()
         best = self.get_best()
         if self.multioutput:
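Correspondingly, a sketch of predicting with the fitted model (the columns of `X` must match training, as the docstring notes):

```python
y_pred = model.predict(X)   # 1D array for a single output, 2D for multiple outputs
```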
@@ -782,6 +802,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         return best["lambda_format"](X)
 
     def sympy(self):
+        """Return sympy representation of the equation(s) chosen by `model_selection`."""
         self.refresh()
         best = self.get_best()
         if self.multioutput:
@@ -789,6 +810,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         return best["sympy_format"]
 
     def latex(self):
+        """Return latex representation of the equation(s) chosen by `model_selection`."""
         self.refresh()
         sympy_representation = self.sympy()
         if self.multioutput:
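A short sketch of the two export helpers documented above:

```python
expr = model.sympy()   # a sympy expression (or a list of them for multiple outputs)
tex = model.latex()    # the corresponding LaTeX string(s), via sympy.latex
```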
@@ -796,6 +818,12 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         return sympy.latex(sympy_representation)
 
     def jax(self):
+        """Return jax representation of the equation(s) chosen by `model_selection`.
+
+        Each equation (multiple given if there are multiple outputs) is a dictionary
+        containing {"callable": func, "parameters": params}. To call `func`, pass
+        func(X, params). This function is differentiable using `jax.grad`.
+        """
         if self.using_pandas:
             warnings.warn(
                 "PySR's JAX modules are not set up to work with a "
@@ -810,6 +838,13 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         return best["jax_format"]
 
     def pytorch(self):
+        """Return pytorch representation of the equation(s) chosen by `model_selection`.
+
+        Each equation (multiple given if there are multiple outputs) is a PyTorch module
+        containing the parameters as trainable attributes. You can use the module like
+        any other PyTorch module: `module(X)`, where `X` is a tensor with the same
+        column ordering as trained with.
+        """
         if self.using_pandas:
             warnings.warn(
                 "PySR's PyTorch modules are not set up to work with a "
 