MilesCranmer committed b7e75e1
Parent(s): 65a4248
Update all docstrings

pysr/sr.py  CHANGED  (+55 −20)
@@ -17,6 +17,9 @@ is_julia_warning_silenced = False
 
 
 def install(julia_project=None):  # pragma: no cover
+    """Install PyCall.jl and all required dependencies for SymbolicRegression.jl.
+
+    Also updates the local Julia registry."""
     import julia
 
     julia.install()
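For context, the function documented in this hunk is the one-time setup entry point; a minimal usage sketch (assuming `pysr` itself is already installed with pip):

```python
# One-time setup of the Julia backend; `pysr.install` is the function
# whose docstring is added in the hunk above.
import pysr

pysr.install()  # installs PyCall.jl, SymbolicRegression.jl, and updates the registry
```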
@@ -405,14 +408,26 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         :type binary_operators: list
         :param unary_operators: Same but for operators taking a single scalar. Default is [].
         :type unary_operators: list
-        :param procs: Number of processes (=number of populations running).
-        :type procs: int
-        :param loss: String of Julia code specifying the loss function. Can either be a loss from LossFunctions.jl, or your own loss written as a function. Examples of custom written losses include: `myloss(x, y) = abs(x-y)` for non-weighted, or `myloss(x, y, w) = w*abs(x-y)` for weighted. Among the included losses, these are as follows. Regression: `LPDistLoss{P}()`, `L1DistLoss()`, `L2DistLoss()` (mean square), `LogitDistLoss()`, `HuberLoss(d)`, `L1EpsilonInsLoss(ϵ)`, `L2EpsilonInsLoss(ϵ)`, `PeriodicLoss(c)`, `QuantileLoss(τ)`. Classification: `ZeroOneLoss()`, `PerceptronLoss()`, `L1HingeLoss()`, `SmoothedL1HingeLoss(γ)`, `ModifiedHuberLoss()`, `L2MarginLoss()`, `ExpLoss()`, `SigmoidLoss()`, `DWDMarginLoss(q)`.
-        :type loss: str
-        :param populations: Number of populations running.
-        :type populations: int
         :param niterations: Number of iterations of the algorithm to run. The best equations are printed, and migrate between populations, at the end of each.
         :type niterations: int
+        :param populations: Number of populations running.
+        :type populations: int
+        :param loss: String of Julia code specifying the loss function. Can either be a loss from LossFunctions.jl, or your own loss written as a function. Examples of custom written losses include: `myloss(x, y) = abs(x-y)` for non-weighted, or `myloss(x, y, w) = w*abs(x-y)` for weighted. Among the included losses, these are as follows. Regression: `LPDistLoss{P}()`, `L1DistLoss()`, `L2DistLoss()` (mean square), `LogitDistLoss()`, `HuberLoss(d)`, `L1EpsilonInsLoss(ϵ)`, `L2EpsilonInsLoss(ϵ)`, `PeriodicLoss(c)`, `QuantileLoss(τ)`. Classification: `ZeroOneLoss()`, `PerceptronLoss()`, `L1HingeLoss()`, `SmoothedL1HingeLoss(γ)`, `ModifiedHuberLoss()`, `L2MarginLoss()`, `ExpLoss()`, `SigmoidLoss()`, `DWDMarginLoss(q)`.
+        :type loss: str
+        :param denoise: Whether to use a Gaussian Process to denoise the data before inputting to PySR. Can help PySR fit noisy data.
+        :type denoise: bool
+        :param select_k_features: whether to run feature selection in Python using random forests, before passing to the symbolic regression code. None means no feature selection; an int means select that many features.
+        :type select_k_features: None/int
+        :param procs: Number of processes (=number of populations running).
+        :type procs: int
+        :param multithreading: Use multithreading instead of distributed backend. Default is yes. Using procs=0 will turn off both.
+        :type multithreading: bool
+        :param batching: whether to compare population members on small batches during evolution. Still uses full dataset for comparing against hall of fame.
+        :type batching: bool
+        :param batchSize: the amount of data to use if doing batching.
+        :type batchSize: int
+        :param maxsize: Max size of an equation.
+        :type maxsize: int
         :param ncyclesperiteration: Number of total mutations to run, per 10 samples of the population, per iteration.
         :type ncyclesperiteration: int
         :param alpha: Initial temperature.
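The reordered parameters above all belong to the `PySRRegressor` constructor; a hedged sketch with illustrative values only (the numbers are not recommendations):

```python
from pysr import PySRRegressor

# Illustrative settings; parameter names follow the docstring in the hunk above.
model = PySRRegressor(
    niterations=5,
    populations=20,
    loss="L1DistLoss()",   # any LossFunctions.jl loss, or custom Julia code
    denoise=True,          # Gaussian-process denoising before the search
    select_k_features=4,   # random-forest feature selection in Python
    procs=4,               # number of processes
    multithreading=True,   # threads instead of the distributed backend
    batching=True,
    batchSize=50,
    maxsize=20,            # maximum equation size
)
```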
@@ -459,20 +474,12 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         :type verbosity: int
         :param progress: Whether to use a progress bar instead of printing to stdout.
         :type progress: bool
-        :param maxsize: Max size of an equation.
-        :type maxsize: int
         :param maxdepth: Max depth of an equation. You can use both maxsize and maxdepth. maxdepth is by default set to = maxsize, which means that it is redundant.
         :type maxdepth: int
         :param fast_cycle: (experimental) - batch over population subsamples. This is a slightly different algorithm than regularized evolution, but does cycles 15% faster. May be algorithmically less efficient.
         :type fast_cycle: bool
         :param variable_names: a list of names for the variables, other than "x0", "x1", etc.
         :type variable_names: list
-        :param batching: whether to compare population members on small batches during evolution. Still uses full dataset for comparing against hall of fame.
-        :type batching: bool
-        :param batchSize: the amount of data to use if doing batching.
-        :type batchSize: int
-        :param select_k_features: whether to run feature selection in Python using random forests, before passing to the symbolic regression code. None means no feature selection; an int means select that many features.
-        :type select_k_features: None/int
         :param warmupMaxsizeBy: whether to slowly increase max size from a small number up to the maxsize (if greater than 0). If greater than 0, says the fraction of training time at which the current maxsize will reach the user-passed maxsize.
         :type warmupMaxsizeBy: float
         :param constraints: dictionary of int (unary) or 2-tuples (binary), this enforces maxsize constraints on the individual arguments of operators. E.g., `'pow': (-1, 1)` says that power laws can have any complexity left argument, but only 1 complexity exponent. Use this to force more interpretable solutions.
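The `constraints` entry ends with its own worked example (`'pow': (-1, 1)`); spelled out as a constructor call, under the same illustrative assumptions as above:

```python
from pysr import PySRRegressor

# Force more interpretable power laws: unconstrained left argument,
# exponent limited to complexity 1 (mirrors the docstring's own example).
model = PySRRegressor(
    binary_operators=["+", "*", "pow"],
    maxdepth=10,                    # optional cap on equation depth
    constraints={"pow": (-1, 1)},
)
```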
@@ -497,12 +504,8 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         :type tournament_selection_n: int
         :param tournament_selection_p: Probability of selecting the best expression in each tournament. The probability will decay as p*(1-p)^n for other expressions, sorted by loss.
         :type tournament_selection_p: float
-        :param denoise: Whether to use a Gaussian Process to denoise the data before inputting to PySR. Can help PySR fit noisy data.
-        :type denoise: bool
         :param precision: What precision to use for the data. By default this is 32 (float32), but you can select 64 or 16 as well.
         :type precision: int
-        :param multithreading: Use multithreading instead of distributed backend. Default is yes. Using procs=0 will turn off both.
-        :type multithreading: bool
         :param **kwargs: Other options passed to SymbolicRegression.Options, for example, if you modify SymbolicRegression.jl to include additional arguments.
         :type **kwargs: dict
         :returns: Results dataframe, giving complexity, MSE, and equations (as strings), as well as functional forms. If list, each element corresponds to a dataframe of equations for each output.
@@ -666,6 +669,11 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
     ]
 
     def __repr__(self):
+        """Prints all current equations fitted by the model.
+
+        The string `>>>>` denotes which equation is selected by the
+        `model_selection`.
+        """
         if self.equations is None:
             return "PySRRegressor.equations = None"
 
@@ -712,7 +720,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         return output
 
     def set_params(self, **params):
-        """Set parameters for
+        """Set parameters for equation search."""
         for key, value in params.items():
             if key in self.surface_parameters:
                 self.__setattr__(key, value)
@@ -723,6 +731,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         return self
 
     def get_params(self, deep=True):
+        """Get parameters for equation search."""
         del deep
         return {
             **self.params,
@@ -730,6 +739,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         }
 
     def get_best(self):
+        """Get best equation using `model_selection`."""
         if self.equations is None:
             raise ValueError("No equations have been generated yet.")
         if self.model_selection == "accuracy":
@@ -746,7 +756,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         )
 
     def fit(self, X, y, weights=None, variable_names=None):
-        """Search for equations to fit the dataset.
+        """Search for equations to fit the dataset and store them in `self.equations`.
 
         :param X: 2D array. Rows are examples, columns are features. If pandas DataFrame, the columns are used for variable names (so make sure they don't contain spaces).
         :type X: np.ndarray/pandas.DataFrame
@@ -755,6 +765,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         :param weights: Optional. Same shape as y. Each element is how to weight the mean-square-error loss for that particular element of y.
         :type weights: np.ndarray
         :param variable_names: a list of names for the variables, other than "x0", "x1", etc.
+            You can also pass a pandas DataFrame for X.
         :type variable_names: list
         """
         if variable_names is None:
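Since the `fit` docstring now spells out the accepted inputs, here is a minimal end-to-end sketch with synthetic data and illustrative settings:

```python
import numpy as np
from pysr import PySRRegressor

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 2))
y = 2.5 * np.cos(X[:, 0]) + X[:, 1] ** 2   # synthetic target

# Small, illustrative search; operators and iteration count are examples only.
model = PySRRegressor(niterations=5, binary_operators=["+", "*"], unary_operators=["cos"])
model.fit(X, y, variable_names=["a", "b"])  # weights=... is also accepted
```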
@@ -775,6 +786,15 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         self.equations = self.get_hof()
 
     def predict(self, X):
+        """Predict y from input X using the equation chosen by `model_selection`.
+
+        You may see what equation is used by printing this object. X should have the same
+        columns as the training data.
+
+        :param X: 2D array. Rows are examples, columns are features. If pandas DataFrame, the columns are used for variable names (so make sure they don't contain spaces).
+        :type X: np.ndarray/pandas.DataFrame
+        :return: 1D array (rows are examples) or 2D array (rows are examples, columns are outputs).
+        """
         self.refresh()
         best = self.get_best()
         if self.multioutput:
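Continuing the fitted `model` from the sketch above, prediction and inspection follow the new docstrings:

```python
y_pred = model.predict(X)   # 1D array here; 2D if y had multiple columns
print(model)                # the `>>>>` marker shows the equation chosen by model_selection
```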
@@ -782,6 +802,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
             return best["lambda_format"](X)
 
     def sympy(self):
+        """Return sympy representation of the equation(s) chosen by `model_selection`."""
         self.refresh()
         best = self.get_best()
         if self.multioutput:
@@ -789,6 +810,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         return best["sympy_format"]
 
     def latex(self):
+        """Return latex representation of the equation(s) chosen by `model_selection`."""
         self.refresh()
         sympy_representation = self.sympy()
         if self.multioutput:
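The `sympy` and `latex` accessors documented above can then be called on the same fitted `model`:

```python
expr = model.sympy()   # sympy expression (a list of them for multiple outputs)
tex = model.latex()    # LaTeX string produced via sympy.latex
```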
@@ -796,6 +818,12 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         return sympy.latex(sympy_representation)
 
     def jax(self):
+        """Return jax representation of the equation(s) chosen by `model_selection`.
+
+        Each equation (multiple given if there are multiple outputs) is a dictionary
+        containing {"callable": func, "parameters": params}. To call `func`, pass
+        func(X, params). This function is differentiable using `jax.grad`.
+        """
         if self.using_pandas:
             warnings.warn(
                 "PySR's JAX modules are not set up to work with a "
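A usage sketch for the dictionary format described in the new `jax` docstring (assuming a single output and the fitted `model` from above):

```python
import jax
import jax.numpy as jnp

f = model.jax()                      # {"callable": func, "parameters": params}
y_pred = f["callable"](jnp.asarray(X), f["parameters"])

# Differentiable, e.g. gradient of the summed prediction w.r.t. the constants:
grad_fn = jax.grad(lambda p: f["callable"](jnp.asarray(X), p).sum())
g = grad_fn(f["parameters"])
```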
@@ -810,6 +838,13 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         return best["jax_format"]
 
     def pytorch(self):
+        """Return pytorch representation of the equation(s) chosen by `model_selection`.
+
+        Each equation (multiple given if there are multiple outputs) is a PyTorch module
+        containing the parameters as trainable attributes. You can use the module like
+        any other PyTorch module: `module(X)`, where `X` is a tensor with the same
+        column ordering as trained with.
+        """
         if self.using_pandas:
             warnings.warn(
                 "PySR's PyTorch modules are not set up to work with a "
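And the corresponding sketch for the `pytorch` docstring (again assuming a single output and the fitted `model` from above):

```python
import torch

module = model.pytorch()                     # a torch module wrapping the equation
X_t = torch.tensor(X, dtype=torch.float32)   # same column order as the training data
y_pred = module(X_t)                         # trainable constants are module parameters
```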