diff --git "a/pysr/sr.py" "b/pysr/sr.py" --- "a/pysr/sr.py" +++ "b/pysr/sr.py" @@ -72,7 +72,7 @@ sympy_mappings = { def pysr(X, y, weights=None, **kwargs): # pragma: no cover warnings.warn( "Calling `pysr` is deprecated. Please use `model = PySRRegressor(**params); model.fit(X, y)` going forward.", - DeprecationWarning, + FutureWarning, ) model = PySRRegressor(**kwargs) model.fit(X, y, weights=weights) @@ -124,17 +124,6 @@ def _create_inline_operators(binary_operators, unary_operators): op_list[i] = function_name -def _handle_feature_selection(X, select_k_features, y, variable_names): - if select_k_features is not None: - selection = run_feature_selection(X, y, select_k_features) - print(f"Using features {[variable_names[i] for i in selection]}") - X = X[:, selection] - - else: - selection = None - return X, selection - - def _check_assertions( X, binary_operators, @@ -156,29 +145,6 @@ def _check_assertions( assert len(variable_names) == X.shape[1] -def run_feature_selection(X, y, select_k_features): - """Use a gradient boosting tree regressor as a proxy for finding - the k most important features in X, returning indices for those - features as output.""" - - from sklearn.ensemble import RandomForestRegressor - from sklearn.feature_selection import SelectFromModel - - clf = RandomForestRegressor(n_estimators=100, max_depth=3, random_state=0) - clf.fit(X, y) - selector = SelectFromModel( - clf, threshold=-np.inf, max_features=select_k_features, prefit=True - ) - return selector.get_support(indices=True) - - -def _escape_filename(filename): - """Turns a file into a string representation with correctly escaped backslashes""" - str_repr = str(filename) - str_repr = str_repr.replace("\\", "\\\\") - return str_repr - - def best(*args, **kwargs): # pragma: no cover raise NotImplementedError( "`best` has been deprecated. Please use the `PySRRegressor` interface. After fitting, you can return `.sympy()` to get the sympy representation of the best equation." @@ -203,20 +169,6 @@ def best_callable(*args, **kwargs): # pragma: no cover ) -def _denoise(X, y, Xresampled=None): - """Denoise the dataset using a Gaussian process""" - from sklearn.gaussian_process import GaussianProcessRegressor - from sklearn.gaussian_process.kernels import RBF, WhiteKernel, ConstantKernel - - gp_kernel = RBF(np.ones(X.shape[1])) + WhiteKernel(1e-1) + ConstantKernel() - gpr = GaussianProcessRegressor(kernel=gp_kernel, n_restarts_optimizer=50) - gpr.fit(X, y) - if Xresampled is not None: - return Xresampled, gpr.predict(Xresampled) - - return X, gpr.predict(X) - - class CallableEquation: """Simple wrapper for numpy lambda functions built with sympy""" @@ -234,548 +186,608 @@ class CallableEquation: expected_shape = (X.shape[0],) if isinstance(X, pd.DataFrame): # Lambda function takes as argument: - return self._lambda(**{k: X[k].values for k in X.columns}) * np.ones( - expected_shape - ) - elif self._selection is not None: - return self._lambda(*X[:, self._selection].T) * np.ones(expected_shape) - return self._lambda(*X.T) * np.ones(expected_shape) + return self._lambda( + **{k: X[k].values for k in self._variable_names} + ) * np.ones(expected_shape) + else: + if self._selection is not None: + X = X[:, self._selection] + return self._lambda(*X.T) * np.ones(expected_shape) + + +class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin): + """ + Symbolic regression - scikit-learn interface for SymbolicRegression.jl. + + Parameters + ---------- + model_selection : str, default="best" + Model selection criterion. 
Can be 'accuracy' or 'best'.
+        `"accuracy"` selects the candidate model with the lowest loss
+        (highest accuracy). `"best"` selects the candidate model with
+        the lowest sum of normalized loss and complexity.
+
+    binary_operators : list[str], default=["+", "-", "*", "/"]
+        List of strings giving the binary operators in Julia's Base.
+
+    unary_operators : list[str], default=[]
+        Same as :param`binary_operators` but for operators taking a
+        single scalar.
+
+    niterations : int, default=40
+        Number of iterations of the algorithm to run. The best
+        equations are printed and migrate between populations at the
+        end of each iteration.
+
+    populations : int, default=15
+        Number of populations running.
+
+    population_size : int, default=33
+        Number of individuals in each population.
+
+    max_evals : int, default=None
+        Limits the total number of evaluations of expressions to
+        this number.
+
+    maxsize : int, default=20
+        Max size of an equation.
+
+    maxdepth : int, default=None
+        Max depth of an equation. You can use both :param`maxsize` and
+        :param`maxdepth`. :param`maxdepth` is by default set equal to
+        :param`maxsize`, in which case it has no additional effect.
+
+    warmup_maxsize_by : float, default=0.0
+        Whether to slowly increase max size from a small number up to
+        the maxsize (if greater than 0). If greater than 0, says the
+        fraction of training time at which the current maxsize will
+        reach the user-passed maxsize.
+
+    timeout_in_seconds : float, default=None
+        Make the search return early once this many seconds have passed.
+
+    constraints : dict[str, int | tuple[int,int]], default={}
+        Dictionary of int (unary) or 2-tuples (binary) that enforces
+        maxsize constraints on the individual arguments of operators.
+        E.g., `'pow': (-1, 1)` says that power laws can have any
+        complexity left argument, but only 1 complexity exponent. Use
+        this to force more interpretable solutions.
+
+    nested_constraints : dict[str, dict], default=None
+        Specifies how many times a combination of operators can be
+        nested. For example, `{"sin": {"cos": 0}, "cos": {"cos": 2}}`
+        specifies that `cos` may never appear within a `sin`, but `sin`
+        can be nested with itself an unlimited number of times. The
+        second term specifies that `cos` can be nested up to 2 times
+        within a `cos`, so that `cos(cos(cos(x)))` is allowed
+        (as well as any combination of `+` or `-` within it), but
+        `cos(cos(cos(cos(x))))` is not allowed. When an operator is not
+        specified, it is assumed that it can be nested an unlimited
+        number of times. This requires that there is no operator which
+        is used both in the unary operators and the binary operators
+        (e.g., `-` could be both subtract, and negation). For binary
+        operators, you only need to provide a single number: both
+        arguments are treated the same way, and the max of each
+        argument is constrained.
+
+    loss : str, default="L2DistLoss()"
+        String of Julia code specifying the loss function. Can either
+        be a loss from LossFunctions.jl, or your own loss written as a
+        function. Examples of custom written losses include:
+        `myloss(x, y) = abs(x-y)` for non-weighted, or
+        `myloss(x, y, w) = w*abs(x-y)` for weighted.
+
+        Among the included losses, these are as follows.
+        Regression: `LPDistLoss{P}()`, `L1DistLoss()`,
+        `L2DistLoss()` (mean square), `LogitDistLoss()`,
+        `HuberLoss(d)`, `L1EpsilonInsLoss(ϵ)`, `L2EpsilonInsLoss(ϵ)`,
+        `PeriodicLoss(c)`, `QuantileLoss(τ)`.
+ Classification: `ZeroOneLoss()`, `PerceptronLoss()`, + `L1HingeLoss()`, `SmoothedL1HingeLoss(γ)`, + `ModifiedHuberLoss()`, `L2MarginLoss()`, `ExpLoss()`, + `SigmoidLoss()`, `DWDMarginLoss(q)`. + + complexity_of_operators : dict[str, float], default=None + If you would like to use a complexity other than 1 for an + operator, specify the complexity here. For example, + `{"sin": 2, "+": 1}` would give a complexity of 2 for each use + of the `sin` operator, and a complexity of 1 for each use of + the `+` operator (which is the default). You may specify real + numbers for a complexity, and the total complexity of a tree + will be rounded to the nearest integer after computing. + + complexity_of_constants : float, default=1 + Complexity of constants. + + complexity_of_variables : float, default=1 + Complexity of variables. + + parsimony : float, default=0.0032 + Multiplicative factor for how much to punish complexity. + + use_frequency : bool, default=True + Whether to measure the frequency of complexities, and use that + instead of parsimony to explore equation space. Will naturally + find equations of all complexities. + + use_frequency_in_tournament : bool, default=True + Whether to use the frequency mentioned above in the tournament, + rather than just the simulated annealing. + + alpha : float, default=0.1 + Initial temperature for simulated annealing + (requires :param`annealing` to be `True`). + + annealing : bool, default=True + Whether to use annealing. You should (and it is default). + + early_stop_condition : float, default=None + Stop the search early if this loss is reached. + + ncyclesperiteration : int, default=550 + Number of total mutations to run, per 10 samples of the + population, per iteration. + + fraction_replaced : float, default=0.000364 + How much of population to replace with migrating equations from + other populations. + + fraction_replaced_hof : float, default=0.035 + How much of population to replace with migrating equations from + hall of fame. + + weight_add_node : float, default=0.79 + Relative likelihood for mutation to add a node. + + weight_insert_node : float, default=5.1 + Relative likelihood for mutation to insert a node. + + weight_delete_node : float, default=1.7 + Relative likelihood for mutation to delete a node. + + weight_do_nothing : float, default=0.21 + Relative likelihood for mutation to leave the individual. + + weight_mutate_constant : float, default=0.048 + Relative likelihood for mutation to change the constant slightly in a random direction. + + weight_mutate_operator : float, default=0.47 + Relative likelihood for mutation to swap an operator. + + weight_randomize : float, default=0.00023 + Relative likelihood for mutation to completely delete and then randomly generate the equation + + weight_simplify : float, default=0.0020 + Relative likelihood for mutation to simplify constant parts by evaluation + + crossover_probability : float, default=0.066 + Absolute probability of crossover-type genetic operation, instead of a mutation. + skip_mutation_failures : bool, default=True + Whether to skip mutation and crossover failures, rather than + simply re-sampling the current member. -def _get_julia_project(julia_project): - if julia_project is None: - is_shared = True - julia_project = f"pysr-{__version__}" - else: - is_shared = False - julia_project = Path(julia_project) - return julia_project, is_shared + migration : bool, default=True + Whether to migrate. + hof_migration : bool, default=True + Whether to have the hall of fame migrate. 
-def is_julia_version_greater_eq(Main, version="1.6"): - """Check if Julia version is greater than specified version.""" - return Main.eval(f'VERSION >= v"{version}"') + topn : int, default=12 + How many top individuals migrate from each population. + should_optimize_constants : bool, default=True + Whether to numerically optimize constants (Nelder-Mead/Newton) + at the end of each iteration. -def init_julia(): - """Initialize julia binary, turning off compiled modules if needed.""" - from julia.core import JuliaInfo, UnsupportedPythonError + optimizer_algorithm : str, default="BFGS" + Optimization scheme to use for optimizing constants. Can currently + be `NelderMead` or `BFGS`. - try: - info = JuliaInfo.load(julia="julia") - except FileNotFoundError: - env_path = os.environ["PATH"] - raise FileNotFoundError( - f"Julia is not installed in your PATH. Please install Julia and add it to your PATH.\n\nCurrent PATH: {env_path}", - ) + optimizer_nrestarts : int, default=2 + Number of time to restart the constants optimization process with + different initial conditions. - if not info.is_pycall_built(): - raise ImportError(import_error_string()) + optimize_probability : float, default=0.14 + Probability of optimizing the constants during a single iteration of + the evolutionary algorithm. - Main = None - try: - from julia import Main as _Main + optimizer_iterations : int, default=8 + Number of iterations that the constants optimizer can take. - Main = _Main - except UnsupportedPythonError: - # Static python binary, so we turn off pre-compiled modules. - from julia.core import Julia + perturbation_factor : float, default=0.076 + Constants are perturbed by a max factor of + (perturbation_factor*T + 1). Either multiplied by this or + divided by this. - jl = Julia(compiled_modules=False) - from julia import Main as _Main + tournament_selection_n : int, default=10 + Number of expressions to consider in each tournament. - Main = _Main + tournament_selection_p : float, default=0.86 + Probability of selecting the best expression in each + tournament. The probability will decay as p*(1-p)^n for other + expressions, sorted by loss. - return Main + procs : int, default=multiprocessing.cpu_count() + Number of processes (=number of populations running). + multithreading : bool, default=True + Use multithreading instead of distributed backend. + Using procs=0 will turn off both. -def _add_sr_to_julia_project(Main, io_arg): - Main.sr_spec = Main.PackageSpec( - name="SymbolicRegression", - url="https://github.com/MilesCranmer/SymbolicRegression.jl", - rev="v" + __symbolic_regression_jl_version__, - ) - Main.eval(f"Pkg.add(sr_spec, {io_arg})") - Main.clustermanagers_spec = Main.PackageSpec( - name="ClusterManagers", - url="https://github.com/JuliaParallel/ClusterManagers.jl", - rev="14e7302f068794099344d5d93f71979aaf4fbeb3", - ) - Main.eval(f"Pkg.add(clustermanagers_spec, {io_arg})") + cluster_manager : str, default=None + For distributed computing, this sets the job queue system. Set + to one of "slurm", "pbs", "lsf", "sge", "qrsh", "scyld", or + "htc". If set to one of these, PySR will run in distributed + mode, and use `procs` to figure out how many processes to launch. + + batching : bool, default=False + Whether to compare population members on small batches during + evolution. Still uses full dataset for comparing against hall + of fame. + + batch_size : int, default=50 + The amount of data to use if doing batching. + + fast_cycle : bool, default=False (experimental) + Batch over population subsamples. 
This is a slightly different
+        algorithm than regularized evolution, but does cycles 15%
+        faster. May be algorithmically less efficient.
+
+    precision : int, default=32
+        What precision to use for the data. By default this is 32
+        (float32), but you can select 64 or 16 as well.
+
+    verbosity : int, default=1e9
+        What verbosity level to use. 0 means minimal print statements.
+
+    update_verbosity : int, default=None
+        What verbosity level to use for package updates.
+        Will take value of :param`verbosity` if not given.
+
+    progress : bool, default=True
+        Whether to use a progress bar instead of printing to stdout.
+
+    equation_file : str, default=None
+        Where to save the files (.csv separated by |).
+
+    temp_equation_file : bool, default=False
+        Whether to put the hall of fame file in the temp directory.
+        Deletion is then controlled with the :param`delete_tempfiles`
+        parameter.
+
+    tempdir : str, default=None
+        Directory for the temporary files.
+
+    delete_tempfiles : bool, default=True
+        Whether to delete the temporary files after finishing.
+
+    julia_project : str, default=None
+        A Julia environment location containing a Project.toml
+        (and potentially the source code for SymbolicRegression.jl).
+        Default gives the Python package directory, where a
+        Project.toml file should be present from the install.
+
+    update : bool, default=True
+        Whether to automatically update Julia packages.
+
+    output_jax_format : bool, default=False
+        Whether to create a 'jax_format' column in the output,
+        containing jax-callable functions and the default parameters in
+        a jax array.
+
+    output_torch_format : bool, default=False
+        Whether to create a 'torch_format' column in the output,
+        containing a torch module with trainable parameters.
+
+    extra_sympy_mappings : dict[str, Callable], default={}
+        Provides mappings between custom :param`binary_operators` or
+        :param`unary_operators` defined in julia strings, to those same
+        operators defined in sympy.
+        E.g., if `unary_operators=["inv(x)=1/x"]`, then for the fitted
+        model to be exported to sympy, :param`extra_sympy_mappings`
+        would be `{"inv": lambda x: 1/x}`.
+
+    extra_jax_mappings : dict[Callable, str], default={}
+        Similar to :param`extra_sympy_mappings` but for model export
+        to jax. The dictionary maps sympy functions to jax functions.
+        For example: `extra_jax_mappings={sympy.sin: "jnp.sin"}` maps
+        the `sympy.sin` function to the equivalent jax expression `jnp.sin`.
+
+    extra_torch_mappings : dict[Callable, Callable], default={}
+        The same as :param`extra_jax_mappings` but for model export
+        to pytorch. Note that the dictionary keys should be callable
+        pytorch expressions.
+        For example: `extra_torch_mappings={sympy.sin: torch.sin}`.
+
+    denoise : bool, default=False
+        Whether to use a Gaussian Process to denoise the data before
+        inputting to PySR. Can help PySR fit noisy data.
+
+    select_k_features : int, default=None
+        Whether to run feature selection in Python using random forests,
+        before passing to the symbolic regression code. None means no
+        feature selection; an int means select that many features.
+
+    kwargs : dict, default=None
+        Supports deprecated keyword arguments. Other arguments will
+        result in an error.
+
+    Attributes
+    ----------
+    equations_ : pandas.DataFrame
+        DataFrame containing the results of model fitting.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+    nout_ : int
+        Number of output dimensions.
+
+    selection_mask_ : list[int] of length `select_k_features`
+        List of indices for input features that are selected when
+        :param`select_k_features` is set.
+
+    raw_julia_state_ : tuple[list[PyCall.jlwrap], PyCall.jlwrap]
+        The state for the julia SymbolicRegression.jl backend post fitting.
+
+    Notes
+    -----
+    Most default parameters have been tuned over several example equations,
+    but you should adjust `niterations`, `binary_operators`, `unary_operators`
+    to your requirements. You can view more detailed explanations of the options
+    on the [options page](https://astroautomata.com/PySR/#/options) of the
+    documentation.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from pysr import PySRRegressor
+    >>> randstate = np.random.RandomState(0)
+    >>> X = 2 * randstate.randn(100, 5)
+    >>> # y = 2.5382 * cos(x_3) + x_0^2 - 0.5
+    >>> y = 2.5382 * np.cos(X[:, 3]) + X[:, 0] ** 2 - 0.5
+    >>> model = PySRRegressor(
+    ...     niterations=40,
+    ...     binary_operators=["+", "*"],
+    ...     unary_operators=[
+    ...         "cos",
+    ...         "exp",
+    ...         "sin",
+    ...         "inv(x) = 1/x",  # Custom operator (julia syntax)
+    ...     ],
+    ...     model_selection="best",
+    ...     loss="loss(x, y) = (x - y)^2",  # Custom loss function (julia syntax)
+    ... )
+    >>> model.fit(X, y)
+    PySRRegressor.equations_ = [
+       pick      score                                           equation          loss  complexity
+    0         0.000000                                          3.8552167  3.360272e+01           1
+    1         1.189847                                          (x0 * x0)  3.110905e+00           3
+    2         0.010626                          ((x0 * x0) + -0.25573406)  3.045491e+00           5
+    3         0.896632                              (cos(x3) + (x0 * x0))  1.242382e+00           6
+    4         0.811362                ((x0 * x0) + (cos(x3) * 2.4384754))  2.451971e-01           8
+    5  >>>>  13.733371          (((cos(x3) * 2.5382) + (x0 * x0)) + -0.5)  2.889755e-13          10
+    6         0.194695  ((x0 * x0) + (((cos(x3) + -0.063180044) * 2.53...  1.957723e-13          12
+    7         0.006988  ((x0 * x0) + (((cos(x3) + -0.32505524) * 1.538...  1.944089e-13          13
+    8         0.000955  (((((x0 * x0) + cos(x3)) + -0.8251649) + (cos(...
1.940381e-13 15 + ] + >>> model.score(X, y) + 1.0 + >>> model.predict(np.array([1,2,3,4,5])) + array([-1.15907818, -1.15907818, -1.15907818, -1.15907818, -1.15907818]) + """ + + # Class validation constants + VALID_OPTIMIZER_ALGORITHMS = ["NelderMead", "BFGS"] -class PySRRegressor(BaseEstimator, RegressorMixin): def __init__( self, model_selection="best", *, - weights=None, - binary_operators=None, - unary_operators=None, - procs=cpu_count(), - loss="L2DistLoss()", - complexity_of_operators=None, - complexity_of_constants=None, - complexity_of_variables=None, - populations=15, + binary_operators=[ + "+", + "-", + "*", + "/", + ], + unary_operators=[], niterations=40, - ncyclesperiteration=550, + populations=15, + population_size=33, + max_evals=None, + maxsize=20, + maxdepth=None, + warmup_maxsize_by=0.0, timeout_in_seconds=None, + constraints={}, + nested_constraints=None, + loss="L2DistLoss()", + complexity_of_operators=None, + complexity_of_constants=1, + complexity_of_variables=1, + parsimony=0.0032, + use_frequency=True, + use_frequency_in_tournament=True, alpha=0.1, - annealing=False, + annealing=True, + early_stop_condition=None, + ncyclesperiteration=550, fraction_replaced=0.000364, fraction_replaced_hof=0.035, - population_size=33, - parsimony=0.0032, - migration=True, - hof_migration=True, - should_optimize_constants=True, - topn=12, weight_add_node=0.79, + weight_insert_node=5.1, weight_delete_node=1.7, weight_do_nothing=0.21, - weight_insert_node=5.1, weight_mutate_constant=0.048, weight_mutate_operator=0.47, weight_randomize=0.00023, weight_simplify=0.0020, crossover_probability=0.066, + skip_mutation_failures=True, + migration=True, + hof_migration=True, + topn=12, + should_optimize_constants=True, + optimizer_algorithm="BFGS", + optimizer_nrestarts=2, + optimize_probability=0.14, + optimizer_iterations=8, perturbation_factor=0.076, - extra_sympy_mappings=None, - extra_torch_mappings=None, - extra_jax_mappings=None, - equation_file=None, - verbosity=1e9, - update_verbosity=None, - progress=None, - maxsize=20, - fast_cycle=False, - maxdepth=None, - variable_names=None, + tournament_selection_n=10, + tournament_selection_p=0.86, + procs=cpu_count(), + multithreading=None, + cluster_manager=None, batching=False, batch_size=50, - select_k_features=None, - warmup_maxsize_by=0.0, - constraints=None, - nested_constraints=None, - use_frequency=True, - use_frequency_in_tournament=True, + fast_cycle=False, + precision=32, + verbosity=1e9, + update_verbosity=None, + progress=True, + equation_file=None, + temp_equation_file=False, tempdir=None, delete_tempfiles=True, julia_project=None, update=True, - temp_equation_file=False, output_jax_format=False, output_torch_format=False, - optimizer_algorithm="BFGS", - optimizer_nrestarts=2, - optimize_probability=0.14, - optimizer_iterations=8, - tournament_selection_n=10, - tournament_selection_p=0.86, + extra_sympy_mappings={}, + extra_torch_mappings={}, + extra_jax_mappings={}, denoise=False, - Xresampled=None, - precision=32, - multithreading=None, - cluster_manager=None, - skip_mutation_failures=True, - max_evals=None, - early_stop_condition=None, - # To support deprecated kwargs: + select_k_features=None, **kwargs, ): - """Initialize settings for an equation search in PySR. - - Note: most default parameters have been tuned over several example - equations, but you should adjust `niterations`, - `binary_operators`, `unary_operators` to your requirements. 
- You can view more detailed explanations of the options on the - [options page](https://astroautomata.com/PySR/#/options) of the documentation. - - :param model_selection: How to select a model. Can be 'accuracy' or 'best'. The default, 'best', will optimize a combination of complexity and accuracy. - :type model_selection: str - :param binary_operators: List of strings giving the binary operators in Julia's Base. Default is ["+", "-", "*", "/",]. - :type binary_operators: list - :param unary_operators: Same but for operators taking a single scalar. Default is []. - :type unary_operators: list - :param niterations: Number of iterations of the algorithm to run. The best equations are printed, and migrate between populations, at the end of each. - :type niterations: int - :param populations: Number of populations running. - :type populations: int - :param loss: String of Julia code specifying the loss function. Can either be a loss from LossFunctions.jl, or your own loss written as a function. Examples of custom written losses include: `myloss(x, y) = abs(x-y)` for non-weighted, or `myloss(x, y, w) = w*abs(x-y)` for weighted. Among the included losses, these are as follows. Regression: `LPDistLoss{P}()`, `L1DistLoss()`, `L2DistLoss()` (mean square), `LogitDistLoss()`, `HuberLoss(d)`, `L1EpsilonInsLoss(ϵ)`, `L2EpsilonInsLoss(ϵ)`, `PeriodicLoss(c)`, `QuantileLoss(τ)`. Classification: `ZeroOneLoss()`, `PerceptronLoss()`, `L1HingeLoss()`, `SmoothedL1HingeLoss(γ)`, `ModifiedHuberLoss()`, `L2MarginLoss()`, `ExpLoss()`, `SigmoidLoss()`, `DWDMarginLoss(q)`. - :type loss: str - :param complexity_of_operators: If you would like to use a complexity other than 1 for - an operator, specify the complexity here. For example, `{"sin": 2, "+": 1}` would give - a complexity of 2 for each use of the `sin` operator, and a complexity of 1 - for each use of the `+` operator (which is the default). You may specify - real numbers for a complexity, and the total complexity of a tree will be rounded - to the nearest integer after computing. - :type complexity_of_operators: dict - :param complexity_of_constants: Complexity of constants. Default is 1. - :type complexity_of_constants: int/float - :param complexity_of_variables: Complexity of variables. Default is 1. - :type complexity_of_variables: int/float - :param denoise: Whether to use a Gaussian Process to denoise the data before inputting to PySR. Can help PySR fit noisy data. - :type denoise: bool - :param select_k_features: whether to run feature selection in Python using random forests, before passing to the symbolic regression code. None means no feature selection; an int means select that many features. - :type select_k_features: None/int - :param procs: Number of processes (=number of populations running). - :type procs: int - :param multithreading: Use multithreading instead of distributed backend. Default is yes. Using procs=0 will turn off both. - :type multithreading: bool - :param cluster_manager: For distributed computing, this sets the job queue - system. Set to one of "slurm", "pbs", "lsf", "sge", "qrsh", "scyld", or "htc". - If set to one of these, PySR will run in distributed mode, and use `procs` to figure - out how many processes to launch. - :type cluster_manager: str - :param batching: whether to compare population members on small batches during evolution. Still uses full dataset for comparing against hall of fame. - :type batching: bool - :param batch_size: the amount of data to use if doing batching. 
- :type batch_size: int - :param maxsize: Max size of an equation. - :type maxsize: int - :param ncyclesperiteration: Number of total mutations to run, per 10 samples of the population, per iteration. - :type ncyclesperiteration: int - :param timeout_in_seconds: Make the search return early once this many seconds have passed. - :type timeout_in_seconds: float/int - :param alpha: Initial temperature. - :type alpha: float - :param annealing: Whether to use annealing. You should (and it is default). - :type annealing: bool - :param fraction_replaced: How much of population to replace with migrating equations from other populations. - :type fraction_replaced: float - :param fraction_replaced_hof: How much of population to replace with migrating equations from hall of fame. - :type fraction_replaced_hof: float - :param population_size: Number of individuals in each population - :type population_size: int - :param parsimony: Multiplicative factor for how much to punish complexity. - :type parsimony: float - :param migration: Whether to migrate. - :type migration: bool - :param hof_migration: Whether to have the hall of fame migrate. - :type hof_migration: bool - :param should_optimize_constants: Whether to numerically optimize constants (Nelder-Mead/Newton) at the end of each iteration. - :type should_optimize_constants: bool - :param topn: How many top individuals migrate from each population. - :type topn: int - :param perturbation_factor: Constants are perturbed by a max factor of (perturbation_factor*T + 1). Either multiplied by this or divided by this. - :type perturbation_factor: float - :param weight_add_node: Relative likelihood for mutation to add a node - :type weight_add_node: float - :param weight_insert_node: Relative likelihood for mutation to insert a node - :type weight_insert_node: float - :param weight_delete_node: Relative likelihood for mutation to delete a node - :type weight_delete_node: float - :param weight_do_nothing: Relative likelihood for mutation to leave the individual - :type weight_do_nothing: float - :param weight_mutate_constant: Relative likelihood for mutation to change the constant slightly in a random direction. - :type weight_mutate_constant: float - :param weight_mutate_operator: Relative likelihood for mutation to swap an operator. - :type weight_mutate_operator: float - :param weight_randomize: Relative likelihood for mutation to completely delete and then randomly generate the equation - :type weight_randomize: float - :param weight_simplify: Relative likelihood for mutation to simplify constant parts by evaluation - :type weight_simplify: float - :param crossover_probability: Absolute probability of crossover-type genetic operation, instead of a mutation. - :type crossover_probability: float - :param equation_file: Where to save the files (.csv separated by |) - :type equation_file: str - :param verbosity: What verbosity level to use. 0 means minimal print statements. - :type verbosity: int - :param update_verbosity: What verbosity level to use for package updates. Will take value of `verbosity` if not given. - :type update_verbosity: int - :param progress: Whether to use a progress bar instead of printing to stdout. - :type progress: bool - :param maxdepth: Max depth of an equation. You can use both maxsize and maxdepth. maxdepth is by default set to = maxsize, which means that it is redundant. - :type maxdepth: int - :param fast_cycle: (experimental) - batch over population subsamples. 
This is a slightly different algorithm than regularized evolution, but does cycles 15% faster. May be algorithmically less efficient. - :type fast_cycle: bool - :param variable_names: a list of names for the variables, other than "x0", "x1", etc. - :type variable_names: list - :param warmup_maxsize_by: whether to slowly increase max size from a small number up to the maxsize (if greater than 0). If greater than 0, says the fraction of training time at which the current maxsize will reach the user-passed maxsize. - :type warmup_maxsize_by: float - :param constraints: dictionary of int (unary) or 2-tuples (binary), this enforces maxsize constraints on the individual arguments of operators. E.g., `'pow': (-1, 1)` says that power laws can have any complexity left argument, but only 1 complexity exponent. Use this to force more interpretable solutions. - :type constraints: dict - :param nested_constraints: Specifies how many times a combination of operators can be nested. For example, - `{"sin": {"cos": 0}}, "cos": {"cos": 2}}` specifies that `cos` may never appear within a `sin`, - but `sin` can be nested with itself an unlimited number of times. The second term specifies that `cos` - can be nested up to 2 times within a `cos`, so that `cos(cos(cos(x)))` is allowed (as well as any combination - of `+` or `-` within it), but `cos(cos(cos(cos(x))))` is not allowed. When an operator is not specified, - it is assumed that it can be nested an unlimited number of times. This requires that there is no operator - which is used both in the unary operators and the binary operators (e.g., `-` could be both subtract, and negation). - For binary operators, you only need to provide a single number: both arguments are treated the same way, - and the max of each argument is constrained. - :type nested_constraints: dict - :param use_frequency: whether to measure the frequency of complexities, and use that instead of parsimony to explore equation space. Will naturally find equations of all complexities. - :type use_frequency: bool - :param use_frequency_in_tournament: whether to use the frequency mentioned above in the tournament, rather than just the simulated annealing. - :type use_frequency_in_tournament: bool - :param tempdir: directory for the temporary files - :type tempdir: str/None - :param delete_tempfiles: whether to delete the temporary files after finishing - :type delete_tempfiles: bool - :param julia_project: a Julia environment location containing a Project.toml (and potentially the source code for SymbolicRegression.jl). Default gives the Python package directory, where a Project.toml file should be present from the install. - :type julia_project: str/None - :param update: Whether to automatically update Julia packages. - :type update: bool - :param temp_equation_file: Whether to put the hall of fame file in the temp directory. Deletion is then controlled with the delete_tempfiles argument. - :type temp_equation_file: bool - :param output_jax_format: Whether to create a 'jax_format' column in the output, containing jax-callable functions and the default parameters in a jax array. - :type output_jax_format: bool - :param output_torch_format: Whether to create a 'torch_format' column in the output, containing a torch module with trainable parameters. - :type output_torch_format: bool - :param tournament_selection_n: Number of expressions to consider in each tournament. 
- :type tournament_selection_n: int - :param tournament_selection_p: Probability of selecting the best expression in each tournament. The probability will decay as p*(1-p)^n for other expressions, sorted by loss. - :type tournament_selection_p: float - :param precision: What precision to use for the data. By default this is 32 (float32), but you can select 64 or 16 as well. - :type precision: int - :param skip_mutation_failures: Whether to skip mutation and crossover failures, rather than simply re-sampling the current member. - :type skip_mutation_failures: bool - :param max_evals: Limits the total number of evaluations of expressions to this number. - :type max_evals: int - :param early_stop_condition: Stop the search early if this loss is reached. - :type early_stop_condition: float - :param kwargs: Supports deprecated keyword arguments. Other arguments will result - in an error - :type kwargs: dict - :returns: Initialized model. Call `.fit(X, y)` to fit your data! - :type: PySRRegressor - """ - super().__init__() - # First, check for deprecated kwargs: + + # Hyperparameters + ## Model search parameters + self.model_selection = model_selection + self.binary_operators = binary_operators + self.unary_operators = unary_operators + self.niterations = niterations + self.populations = populations + ## Model search Constraints + self.population_size = population_size + self.max_evals = max_evals + self.maxsize = maxsize + self.maxdepth = maxdepth + self.warmup_maxsize_by = warmup_maxsize_by + self.timeout_in_seconds = timeout_in_seconds + self.constraints = constraints + self.nested_constraints = nested_constraints + ## Loss parameters + self.loss = loss + self.complexity_of_operators = complexity_of_operators + self.complexity_of_constants = complexity_of_constants + self.complexity_of_variables = complexity_of_variables + self.parsimony = float(parsimony) + self.use_frequency = use_frequency + self.use_frequency_in_tournament = use_frequency_in_tournament + self.alpha = alpha + self.annealing = annealing + self.early_stop_condition = early_stop_condition + ## Evolutionary search parameters + ### Mutation parameters + self.ncyclesperiteration = ncyclesperiteration + self.fraction_replaced = fraction_replaced + self.fraction_replaced_hof = fraction_replaced_hof + self.weight_add_node = weight_add_node + self.weight_insert_node = weight_insert_node + self.weight_delete_node = weight_delete_node + self.weight_do_nothing = weight_do_nothing + self.weight_mutate_constant = weight_mutate_constant + self.weight_mutate_operator = weight_mutate_operator + self.weight_randomize = weight_randomize + self.weight_simplify = weight_simplify + self.crossover_probability = crossover_probability + self.skip_mutation_failures = skip_mutation_failures + ### Migration parameters + self.migration = migration + self.hof_migration = hof_migration + self.topn = topn + ### Constants parameters + self.should_optimize_constants = should_optimize_constants + self.optimizer_algorithm = optimizer_algorithm + self.optimizer_nrestarts = optimizer_nrestarts + self.optimize_probability = optimize_probability + self.optimizer_iterations = optimizer_iterations + self.perturbation_factor = perturbation_factor + ### Selection parameters + self.tournament_selection_n = tournament_selection_n + self.tournament_selection_p = tournament_selection_p + # Solver parameters + self.procs = procs + self.multithreading = multithreading + self.cluster_manager = cluster_manager + self.batching = batching + self.batch_size = batch_size + 
self.fast_cycle = fast_cycle + self.precision = precision + # Additional runtime parameters + ## Runtime user interface + self.verbosity = verbosity + self.update_verbosity = update_verbosity + self.progress = progress + ## Project management + self.equation_file = equation_file + self.temp_equation_file = temp_equation_file + self.tempdir = tempdir + self.delete_tempfiles = delete_tempfiles + self.julia_project = julia_project + self.update = update + self.output_jax_format = output_jax_format + self.output_torch_format = output_torch_format + self.extra_sympy_mappings = extra_sympy_mappings + self.extra_jax_mappings = extra_jax_mappings + self.extra_torch_mappings = extra_torch_mappings + # Pre-modelling transformation + self.denoise = denoise + self.select_k_features = select_k_features + + # Once all valid parameters have been assigned handle the + # deprecated kwargs if len(kwargs) > 0: # pragma: no cover deprecated_kwargs = make_deprecated_kwargs_for_pysr_regressor() for k, v in kwargs.items(): - if k == "fractionReplaced": - fraction_replaced = v - elif k == "fractionReplacedHof": - fraction_replaced_hof = v - elif k == "npop": - population_size = v - elif k == "hofMigration": - hof_migration = v - elif k == "shouldOptimizeConstants": - should_optimize_constants = v - elif k == "weightAddNode": - weight_add_node = v - elif k == "weightDeleteNode": - weight_delete_node = v - elif k == "weightDoNothing": - weight_do_nothing = v - elif k == "weightInsertNode": - weight_insert_node = v - elif k == "weightMutateConstant": - weight_mutate_constant = v - elif k == "weightMutateOperator": - weight_mutate_operator = v - elif k == "weightRandomize": - weight_randomize = v - elif k == "weightSimplify": - weight_simplify = v - elif k == "crossoverProbability": - crossover_probability = v - elif k == "perturbationFactor": - perturbation_factor = v - elif k == "batchSize": - batch_size = v - elif k == "warmupMaxsizeBy": - warmup_maxsize_by = v - elif k == "useFrequency": - use_frequency = v - elif k == "useFrequencyInTournament": - use_frequency_in_tournament = v + # Handle renamed kwargs + if k in deprecated_kwargs: + updated_kwarg_name = deprecated_kwargs[k] + setattr(self, updated_kwarg_name, v) + warnings.warn( + f"{k} has been renamed to {updated_kwarg_name} in PySRRegressor. " + f" Please use that instead.", + FutureWarning, + ) + # Handle kwargs that have been moved to the fit method + elif k in ["weights", "variable_names", "Xresampled"]: + warnings.warn( + f"{k} is a data dependant parameter so should be passed when fit is called. " + f"Ignoring parameter; please pass {k} during the call to fit instead.", + FutureWarning, + ) else: raise TypeError( f"{k} is not a valid keyword argument for PySRRegressor" ) - updated_name = deprecated_kwargs[k] - warnings.warn( - f"{k} has been renamed to {updated_name} in PySRRegressor." - f" Please use that instead.", - ) - self.model_selection = model_selection - - if binary_operators is None: - binary_operators = "+ * - /".split(" ") - if unary_operators is None: - unary_operators = [] - if extra_sympy_mappings is None: - extra_sympy_mappings = {} - if variable_names is None: - variable_names = [] - if constraints is None: - constraints = {} - if multithreading is None: - # Default is multithreading=True, unless explicitly set, - # or procs is set to 0 (serial mode). 
- multithreading = procs != 0 and cluster_manager is None - if update_verbosity is None: - update_verbosity = verbosity - - buffer_available = "buffer" in sys.stdout.__dir__() - - if progress is not None: - if progress and not buffer_available: - warnings.warn( - "Note: it looks like you are running in Jupyter. The progress bar will be turned off." - ) - progress = False - else: - progress = buffer_available - - assert optimizer_algorithm in ["NelderMead", "BFGS"] - assert tournament_selection_n < population_size - - if extra_jax_mappings is not None: - for value in extra_jax_mappings.values(): - if not isinstance(value, str): - raise NotImplementedError( - "extra_jax_mappings must have keys that are strings! e.g., {sympy.sqrt: 'jnp.sqrt'}." - ) - else: - extra_jax_mappings = {} - - if extra_torch_mappings is not None: - for value in extra_jax_mappings.values(): - if not callable(value): - raise NotImplementedError( - "extra_torch_mappings must be callable functions! e.g., {sympy.sqrt: torch.sqrt}." - ) - else: - extra_torch_mappings = {} - - if maxsize > 40: - warnings.warn( - "Note: Using a large maxsize for the equation search will be exponentially slower and use significant memory." - ) - elif maxsize < 7: - raise NotImplementedError("PySR requires a maxsize of at least 7") - - if maxdepth is None: - maxdepth = maxsize - - if isinstance(binary_operators, str): - binary_operators = [binary_operators] - if isinstance(unary_operators, str): - unary_operators = [unary_operators] - - self.params = { - **dict( - weights=weights, - binary_operators=binary_operators, - unary_operators=unary_operators, - procs=procs, - loss=loss, - complexity_of_operators=complexity_of_operators, - complexity_of_constants=complexity_of_constants, - complexity_of_variables=complexity_of_variables, - populations=populations, - niterations=niterations, - ncyclesperiteration=ncyclesperiteration, - timeout_in_seconds=timeout_in_seconds, - alpha=alpha, - annealing=annealing, - fraction_replaced=fraction_replaced, - fraction_replaced_hof=fraction_replaced_hof, - population_size=population_size, - parsimony=float(parsimony), - migration=migration, - hof_migration=hof_migration, - should_optimize_constants=should_optimize_constants, - topn=topn, - weight_add_node=weight_add_node, - weight_insert_node=weight_insert_node, - weight_delete_node=weight_delete_node, - weight_do_nothing=weight_do_nothing, - weight_mutate_constant=weight_mutate_constant, - weight_mutate_operator=weight_mutate_operator, - weight_randomize=weight_randomize, - weight_simplify=weight_simplify, - crossover_probability=crossover_probability, - perturbation_factor=perturbation_factor, - verbosity=verbosity, - update_verbosity=update_verbosity, - progress=progress, - maxsize=maxsize, - fast_cycle=fast_cycle, - maxdepth=maxdepth, - batching=batching, - batch_size=batch_size, - select_k_features=select_k_features, - warmup_maxsize_by=warmup_maxsize_by, - constraints=constraints, - nested_constraints=nested_constraints, - use_frequency=use_frequency, - use_frequency_in_tournament=use_frequency_in_tournament, - tempdir=tempdir, - delete_tempfiles=delete_tempfiles, - update=update, - temp_equation_file=temp_equation_file, - optimizer_algorithm=optimizer_algorithm, - optimizer_nrestarts=optimizer_nrestarts, - optimize_probability=optimize_probability, - optimizer_iterations=optimizer_iterations, - tournament_selection_n=tournament_selection_n, - tournament_selection_p=tournament_selection_p, - denoise=denoise, - Xresampled=Xresampled, - precision=precision, 
- multithreading=multithreading, - cluster_manager=cluster_manager, - skip_mutation_failures=skip_mutation_failures, - max_evals=max_evals, - early_stop_condition=early_stop_condition, - ), - } - - # Stored equations: - self.equations = None - self.params_hash = None - self.raw_julia_state = None - - self.multioutput = None - self.equation_file = equation_file - self.n_features = None - self.extra_sympy_mappings = extra_sympy_mappings - self.extra_torch_mappings = extra_torch_mappings - self.extra_jax_mappings = extra_jax_mappings - self.output_jax_format = output_jax_format - self.output_torch_format = output_torch_format - self.nout = 1 - self.selection = None - self.variable_names = variable_names - self.julia_project = julia_project - - self.surface_parameters = [ - "model_selection", - "multioutput", - "equation_file", - "n_features", - "extra_sympy_mappings", - "extra_torch_mappings", - "extra_jax_mappings", - "output_jax_format", - "output_torch_format", - "nout", - "selection", - "variable_names", - "julia_project", - ] - def __repr__(self): """Prints all current equations fitted by the model. The string `>>>>` denotes which equation is selected by the `model_selection`. """ - if not hasattr(self, "equations") or self.equations is None: - return "PySRRegressor.equations = None" + if not hasattr(self, "equations_") or self.equations_ is None: + return "PySRRegressor.equations_ = None" - output = "PySRRegressor.equations = [\n" + output = "PySRRegressor.equations_ = [\n" - equations = self.equations + equations = self.equations_ if not isinstance(equations, list): all_equations = [equations] else: @@ -815,345 +827,311 @@ class PySRRegressor(BaseEstimator, RegressorMixin): output += "]" return output - def set_params(self, **params): - """Set parameters for equation search.""" - for key, value in params.items(): - if key in self.surface_parameters: - self.__setattr__(key, value) - elif key in self.params: - self.params[key] = value - else: - raise ValueError(f"Parameter {key} is not in the list of parameters.") - - return self - - def get_params(self, deep=True): - """Get parameters for equation search.""" - del deep - return { - **self.params, - **{key: self.__getattribute__(key) for key in self.surface_parameters}, - } - def get_best(self, index=None): - """Get best equation using `model_selection`. - - :param index: Optional. If you wish to select a particular equation - from `self.equations`, give the row number here. This overrides - the `model_selection` parameter. - :type index: int - :returns: Dictionary representing the best expression found. - :type: pd.Series """ - if self.equations is None: + Get best equation using `model_selection`. + + Parameters + ---------- + index : int, default=None + If you wish to select a particular equation from `self.equations_`, + give the row number here. This overrides the :param`model_selection` + parameter. + + Returns + ------- + best_equation : pandas.Series + Dictionary representing the best expression found. + + Raises + ------ + NotImplementedError + Raised when an invalid model selection strategy is provided. 
+        """
+        if self.equations_ is None:
             raise ValueError("No equations have been generated yet.")
 
         if index is not None:
-            if isinstance(self.equations, list):
+            if isinstance(self.equations_, list):
                 assert isinstance(index, list)
-                return [eq.iloc[i] for eq, i in zip(self.equations, index)]
-            return self.equations.iloc[index]
+                return [eq.iloc[i] for eq, i in zip(self.equations_, index)]
+            return self.equations_.iloc[index]
 
         if self.model_selection == "accuracy":
-            if isinstance(self.equations, list):
-                return [eq.iloc[-1] for eq in self.equations]
-            return self.equations.iloc[-1]
+            if isinstance(self.equations_, list):
+                return [eq.iloc[-1] for eq in self.equations_]
+            return self.equations_.iloc[-1]
         elif self.model_selection == "best":
-            if isinstance(self.equations, list):
-                return [eq.iloc[eq["score"].idxmax()] for eq in self.equations]
-            return self.equations.iloc[self.equations["score"].idxmax()]
+            if isinstance(self.equations_, list):
+                return [eq.iloc[eq["score"].idxmax()] for eq in self.equations_]
+            return self.equations_.iloc[self.equations_["score"].idxmax()]
         else:
             raise NotImplementedError(
                 f"{self.model_selection} is not a valid model selection strategy."
             )
 
-    def fit(self, X, y, weights=None, variable_names=None):
-        """Search for equations to fit the dataset and store them in `self.equations`.
-
-        :param X: 2D array. Rows are examples, columns are features. If pandas DataFrame, the columns are used for variable names (so make sure they don't contain spaces).
-        :type X: np.ndarray/pandas.DataFrame
-        :param y: 1D array (rows are examples) or 2D array (rows are examples, columns are outputs). Putting in a 2D array will trigger a search for equations for each feature of y.
-        :type y: np.ndarray
-        :param weights: Optional. Same shape as y. Each element is how to weight the mean-square-error loss for that particular element of y.
-        :type weights: np.ndarray
-        :param variable_names: a list of names for the variables, other than "x0", "x1", etc.
-        You can also pass a pandas DataFrame for X.
-        :type variable_names: list
+    def _validate_params(self, n_samples):
         """
-        if variable_names is None:
-            variable_names = self.variable_names
-
-        self._run(
-            X=X,
-            y=y,
-            weights=weights,
-            variable_names=variable_names,
-        )
-
-        return self
-
-    def refresh(self):
-        # Updates self.equations with any new options passed,
-        # such as extra_sympy_mappings.
-        self.equations = self.get_hof()
-
-    def predict(self, X, index=None):
-        """Predict y from input X using the equation chosen by `model_selection`.
-
-        You may see what equation is used by printing this object. X should have the same
-        columns as the training data.
+        Perform validation on the parameters defined in `__init__` for
+        the dataset specified in :term:`fit`.
+
+        Parameters
+        ----------
+        n_samples : int
+            Number of samples in the dataset to be fitted.
+
+        Returns
+        -------
+        self : object
+            Reference to `self` with validated parameters.
+
+        Raises
+        ------
+        ValueError
+            Raised when one of the following occurs: `tournament_selection_n`
+            is not smaller than `population_size`; `maxsize` is
+            less than 7; invalid `extra_jax_mappings` or
+            `extra_torch_mappings`; invalid optimizer algorithms.
 
-        :param X: 2D array. Rows are examples, columns are features. If pandas DataFrame, the columns are used for variable names (so make sure they don't contain spaces).
-        :type X: np.ndarray/pandas.DataFrame
-        :param index: Optional. If you want to compute the output of
-        an expression using a particular row of
-        `self.equations`, you may specify the index here.
- :type index: int - :returns: 1D array (rows are examples) or 2D array (rows are examples, columns are outputs). - :type: np.ndarray """ - self.refresh() - best = self.get_best(index=index) - try: - if self.multioutput: - return np.stack([eq["lambda_format"](X) for eq in best], axis=1) - return best["lambda_format"](X) - except Exception as error: - # Add extra information to the error, to say that the user - # should try to adjust extra_sympy_params. - raise ValueError( - "Failed to evaluate the expression. " - "If you are using a custom operator, make sure to define it in extra_sympy_mappings, " - "e.g., `model.set_params(extra_sympy_mappings={'inv': lambda x: 1 / x})`." - ) from error - - def sympy(self, index=None): - """Return sympy representation of the equation(s) chosen by `model_selection`. - - :param index: Optional. If you wish to select a particular equation - from `self.equations`, give the index number here. This overrides - the `model_selection` parameter. - :type index: int - :returns: SymPy representation of the best expression. - """ - self.refresh() - best = self.get_best(index=index) - if self.multioutput: - return [eq["sympy_format"] for eq in best] - return best["sympy_format"] - - def latex(self, index=None): - """Return latex representation of the equation(s) chosen by `model_selection`. - - :param index: Optional. If you wish to select a particular equation - from `self.equations`, give the index number here. This overrides - the `model_selection` parameter. - :type index: int - :returns: LaTeX expression as a string - :type: str - """ - self.refresh() - sympy_representation = self.sympy(index=index) - if self.multioutput: - return [sympy.latex(s) for s in sympy_representation] - return sympy.latex(sympy_representation) - - def jax(self, index=None): - """Return jax representation of the equation(s) chosen by `model_selection`. + # Handle None values for instance parameters: + if self.multithreading is None: + # Default is multithreading=True, unless explicitly set, + # or procs is set to 0 (serial mode). + self.multithreading = self.procs != 0 and self.cluster_manager is None + if self.update_verbosity is None: + self.update_verbosity = self.verbosity + if self.maxdepth is None: + self.maxdepth = self.maxsize + + # Cast tempdir string as a Path object + self.tempdir_ = Path(tempfile.mkdtemp(dir=self.tempdir)) + if self.temp_equation_file: + self.equation_file = self.tempdir_ / "hall_of_fame.csv" + elif self.equation_file is None: + date_time = datetime.now().strftime("%Y-%m-%d_%H%M%S.%f")[:-3] + self.equation_file = "hall_of_fame_" + date_time + ".csv" - Each equation (multiple given if there are multiple outputs) is a dictionary - containing {"callable": func, "parameters": params}. To call `func`, pass - func(X, params). This function is differentiable using `jax.grad`. + # Handle type conversion for instance parameters: + if isinstance(self.binary_operators, str): + self.binary_operators = [self.binary_operators] + if isinstance(self.unary_operators, str): + self.unary_operators = [self.unary_operators] - :param index: Optional. If you wish to select a particular equation - from `self.equations`, give the index number here. This overrides - the `model_selection` parameter. - :type index: int - :returns: Dictionary of callable jax function in "callable" key, - and jax array of parameters as "parameters" key. 
- :type: dict - """ - if self.using_pandas: + # Warn if instance parameters are not sensible values: + if self.batch_size < 1: warnings.warn( - "PySR's JAX modules are not set up to work with a " - "model that was trained on pandas dataframes. " - "Train on an array instead to ensure everything works as planned." + f"Given :param`batch_size` must be greater than or equal to one." + f":param`batch_size` has been increased to equal one." ) - self.set_params(output_jax_format=True) - self.refresh() - best = self.get_best(index=index) - if self.multioutput: - return [eq["jax_format"] for eq in best] - return best["jax_format"] + self.batch_size = 1 - def pytorch(self, index=None): - """Return pytorch representation of the equation(s) chosen by `model_selection`. - - Each equation (multiple given if there are multiple outputs) is a PyTorch module - containing the parameters as trainable attributes. You can use the module like - any other PyTorch module: `module(X)`, where `X` is a tensor with the same - column ordering as trained with. + if n_samples > 10000 and not self.batching: + warnings.warn( + """ + Note: you are running with more than 10,000 datapoints. + You should consider turning on batching (https://astroautomata.com/PySR/#/options?id=batching). + You should also reconsider if you need that many datapoints. + Unless you have a large amount of noise (in which case you + should smooth your dataset first), generally < 10,000 datapoints + is enough to find a functional form with symbolic regression. + More datapoints will lower the search speed." + """, + ) + # Ensure instance parameters are allowable values: + # ValueError - Incompatible values + if not (self.tournament_selection_n < self.population_size): + raise ValueError( + f"tournament_selection_n parameter must be smaller than population_size" + ) - :param index: Optional. If you wish to select a particular equation - from `self.equations`, give the row number here. This overrides - the `model_selection` parameter. - :type index: int - :returns: PyTorch module representing the expression. - :type: torch.nn.Module - """ - if self.using_pandas: + if self.maxsize > 40: warnings.warn( - "PySR's PyTorch modules are not set up to work with a " - "model that was trained on pandas dataframes. " - "Train on an array instead to ensure everything works as planned." + "Note: Using a large maxsize for the equation search will be exponentially slower and use significant memory. You should consider turning `use_frequency` to False, and perhaps use `warmup_maxsize_by`." ) - self.set_params(output_torch_format=True) - self.refresh() - best = self.get_best(index=index) - if self.multioutput: - return [eq["torch_format"] for eq in best] - return best["torch_format"] + elif self.maxsize < 7: + raise ValueError(f"PySR requires a maxsize of at least 7") - def reset(self): - """Reset the search state.""" - self.equations = None - self.params_hash = None - self.raw_julia_state = None - self.variable_names = None - self.selection = None - - def _run(self, X, y, weights, variable_names): - global already_ran - global Main + if self.extra_jax_mappings is not None: + for value in self.extra_jax_mappings.values(): + if not isinstance(value, str): + raise ValueError( + "extra_jax_mappings must have keys that are strings! e.g., {sympy.sqrt: 'jnp.sqrt'}." 
-        for key in self.surface_parameters:
-            if key in self.params:
-                raise ValueError(
-                    f"{key} is a surface parameter, and cannot be in self.params"
-                )
+        if self.extra_torch_mappings is not None:
+            for value in self.extra_torch_mappings.values():
+                if not callable(value):
+                    raise ValueError(
+                        "extra_torch_mappings must be callable functions! e.g., {sympy.sqrt: torch.sqrt}."
+                    )
+        else:
+            self.extra_torch_mappings = {}

-        multithreading = self.params["multithreading"]
-        cluster_manager = self.params["cluster_manager"]
-        procs = self.params["procs"]
-        binary_operators = self.params["binary_operators"]
-        unary_operators = self.params["unary_operators"]
-        batching = self.params["batching"]
-        maxsize = self.params["maxsize"]
-        select_k_features = self.params["select_k_features"]
-        Xresampled = self.params["Xresampled"]
-        denoise = self.params["denoise"]
-        constraints = self.params["constraints"]
-        update = self.params["update"]
-        loss = self.params["loss"]
-        weight_mutate_constant = self.params["weight_mutate_constant"]
-        weight_mutate_operator = self.params["weight_mutate_operator"]
-        weight_add_node = self.params["weight_add_node"]
-        weight_insert_node = self.params["weight_insert_node"]
-        weight_delete_node = self.params["weight_delete_node"]
-        weight_simplify = self.params["weight_simplify"]
-        weight_randomize = self.params["weight_randomize"]
-        weight_do_nothing = self.params["weight_do_nothing"]
+        # NotImplementedError - Currently incompatible values that could be supported later
+        if self.optimizer_algorithm not in self.VALID_OPTIMIZER_ALGORITHMS:
+            raise NotImplementedError(
+                f"PySR currently only supports the following optimizer algorithms: {self.VALID_OPTIMIZER_ALGORITHMS}"
+            )

-        if Main is None:
-            if multithreading:
-                os.environ["JULIA_NUM_THREADS"] = str(procs)
+        # Handle presentation of the progress bar:
+        buffer_available = "buffer" in sys.stdout.__dir__()
+        if self.progress is not None:
+            if self.progress and not buffer_available:
+                warnings.warn(
+                    "Note: it looks like you are running in Jupyter. The progress bar will be turned off."
+                )
+                self.progress = False
+        else:
+            self.progress = buffer_available

-            Main = init_julia()
+        return self
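The `buffer_available` probe above is how a real terminal is distinguished from Jupyter; a standalone illustration, safe to run anywhere:

    import sys

    # A terminal's sys.stdout is a TextIOWrapper exposing a raw .buffer;
    # Jupyter swaps in a stream without one, which is what turns the
    # progress bar off above.
    print("buffer" in sys.stdout.__dir__())  # True in a terminal, False in Jupyter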
-        if cluster_manager is not None:
-            Main.eval(f"import ClusterManagers: addprocs_{cluster_manager}")
-            cluster_manager = Main.eval(f"addprocs_{cluster_manager}")
+    def _validate_fit_params(self, X, y, Xresampled, variable_names):
+        """
+        Validates the parameters passed to the :term`fit` method.

-        if isinstance(X, pd.DataFrame):
-            if variable_names is not None:
-                warnings.warn("Resetting variable_names from X.columns")
+        This method also sets the `nout_` attribute.

-            variable_names = list(X.columns)
-            X = np.array(X)
-            self.using_pandas = True
-        else:
-            self.using_pandas = False
+        Parameters
+        ----------
+        X : {ndarray | pandas.DataFrame} of shape (n_samples, n_features)
+            Training data.

-        if len(X.shape) == 1:
-            X = X[:, None]
+        y : {ndarray | pandas.DataFrame} of shape (n_samples,) or (n_samples, n_targets)
+            Target values. Will be cast to X's dtype if necessary.

-        if isinstance(y, pd.DataFrame) or isinstance(y, pd.Series):
-            y = np.array(y)
+        Xresampled : {ndarray | pandas.DataFrame} of shape (n_resampled, n_features), default=None
+            Resampled training data used for denoising.

-        if variable_names is None or len(variable_names) == 0:
-            variable_names = [f"x{i}" for i in range(X.shape[1])]
+        variable_names : list[str] of length n_features
+            Names of each variable in the training dataset, `X`.

-        use_custom_variable_names = len(variable_names) != 0
-        # TODO: this is always true.
+        Returns
+        -------
+        X_validated : ndarray of shape (n_samples, n_features)
+            Validated training data.

-        _check_assertions(
-            X,
-            binary_operators,
-            unary_operators,
-            use_custom_variable_names,
-            variable_names,
-            weights,
-            y,
-        )
+        y_validated : ndarray of shape (n_samples,) or (n_samples, n_targets)
+            Validated target data.

-        self.n_features = X.shape[1]
+        variable_names_validated : list[str] of length n_features
+            Validated list of variable names for each feature in `X`.

-        if len(X) > 10000 and not batching:
-            warnings.warn(
-                "Note: you are running with more than 10,000 datapoints. You should consider turning on batching (https://astroautomata.com/PySR/#/options?id=batching). You should also reconsider if you need that many datapoints. Unless you have a large amount of noise (in which case you should smooth your dataset first), generally < 10,000 datapoints is enough to find a functional form with symbolic regression. More datapoints will lower the search speed."
-            )
+        """

-        if self.n_features >= 10 and not select_k_features:
+        if isinstance(X, pd.DataFrame):
+            variable_names = None
             warnings.warn(
-                "Note: you are running with 10 features or more. "
-                "Genetic algorithms like used in PySR scale poorly with large numbers of features. "
-                "Consider using feature selection techniques to select the most important features "
-                "(you can do this automatically with the `select_k_features` parameter), "
-                "or, alternatively, doing a dimensionality reduction beforehand. "
-                "For example, `X = PCA(n_components=6).fit_transform(X)`, "
-                "using scikit-learn's `PCA` class, "
-                "will reduce the number of features to 6 in an interpretable way, "
-                "as each resultant feature "
-                "will be a linear combination of the original features. "
+                ":param`variable_names` has been reset to `None` as `X` is a DataFrame. "
+                "Will use DataFrame column names instead."
             )
-
-        X, selection = _handle_feature_selection(
-            X, select_k_features, y, variable_names
-        )
-
+
+            if X.columns.is_object() and X.columns.str.contains(" ").any():
+                X.columns = X.columns.str.replace(" ", "_")
+                warnings.warn(
+                    "Spaces in DataFrame column names are not supported. "
+                    "Spaces have been replaced with underscores. \n"
+                    "Please rename the columns to valid names."
+                )
+        elif variable_names:
+            if any(" " in name for name in variable_names):
+                variable_names = [name.replace(" ", "_") for name in variable_names]
+                warnings.warn(
+                    "Spaces in `variable_names` are not supported. "
+                    "Spaces have been replaced with underscores. \n"
+                    "Please use valid names instead."
+                )
+        # Only numpy values are needed from Xresampled; column metadata is
+        # provided by X
+        if isinstance(Xresampled, pd.DataFrame):
+            Xresampled = Xresampled.values
+
+        # Data validation and feature name fetching via sklearn
+        # This method sets the n_features_in_ attribute
+        X, y = self._validate_data(X=X, y=y, reset=True, multi_output=True)
+        self.feature_names_in_ = _check_feature_names_in(self, variable_names)
+        variable_names = self.feature_names_in_
+
+        # Handle multioutput data
         if len(y.shape) == 1 or (len(y.shape) == 2 and y.shape[1] == 1):
-            self.multioutput = False
-            self.nout = 1
             y = y.reshape(-1)
         elif len(y.shape) == 2:
-            self.multioutput = True
-            self.nout = y.shape[1]
+            self.nout_ = y.shape[1]
         else:
             raise NotImplementedError("y shape not supported!")

-        if denoise:
-            if weights is not None:
-                raise NotImplementedError(
-                    "No weights for denoising - the weights are learned."
-                )
+        return X, y, variable_names
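A small sketch of the column-name sanitization performed above (hypothetical column names):

    import pandas as pd

    # Spaces in DataFrame column names become underscores before the
    # names are handed to the backend as variable names.
    X = pd.DataFrame({"feature one": [1.0, 2.0], "feature two": [3.0, 4.0]})
    X.columns = X.columns.str.replace(" ", "_")
    print(list(X.columns))  # ['feature_one', 'feature_two']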
+
+    def _pre_transform_training_data(self, X, y, Xresampled, variable_names):
+        """
+        Transforms the training data before fitting the symbolic regressor.
+
+        This method also updates/sets the `selection_mask_` attribute.
+
+        Parameters
+        ----------
+        X : {ndarray | pandas.DataFrame} of shape (n_samples, n_features)
+            Training data.
+
+        y : {ndarray | pandas.DataFrame} of shape (n_samples,) or (n_samples, n_targets)
+            Target values. Will be cast to X's dtype if necessary.
+
+        Xresampled : {ndarray | pandas.DataFrame} of shape (n_resampled, n_features), default=None
+            Resampled training data used for denoising.
+
+        variable_names : list[str] of length n_features
+            Names of each variable in the training dataset, `X`.
+
+        Returns
+        -------
+        X_transformed : ndarray of shape (n_samples, n_features)
+            Transformed training data. n_samples will be equal to
+            :param`Xresampled.shape[0]` if :param`self.denoise` is `True`,
+            and :param`Xresampled is not None`, otherwise it will be
+            equal to :param`X.shape[0]`. n_features will be equal to
+            :param`self.select_k_features` if `self.select_k_features is not None`,
+            otherwise it will be equal to :param`X.shape[1]`.
+
+        y_transformed : ndarray of shape (n_samples,) or (n_samples, n_outputs)
+            Transformed target data. n_samples will be equal to
+            :param`Xresampled.shape[0]` if :param`self.denoise` is `True`,
+            and :param`Xresampled is not None`, otherwise it will be
+            equal to :param`X.shape[0]`.
+
+        variable_names_transformed : list[str] of length n_features
+            Names of each variable in the transformed dataset,
+            `X_transformed`.
+        """
+        # Feature selection transformation
+        if self.select_k_features:
+            self.selection_mask_ = run_feature_selection(X, y, self.select_k_features)
+            X = X[:, self.selection_mask_]
+
             if Xresampled is not None:
-                # Select among only the selected features:
-                if isinstance(Xresampled, pd.DataFrame):
-                    # Handle Xresampled is pandas dataframe
-                    if selection is not None:
-                        Xresampled = Xresampled[[variable_names[i] for i in selection]]
-                    else:
-                        Xresampled = Xresampled[variable_names]
-                    Xresampled = np.array(Xresampled)
-                else:
-                    if selection is not None:
-                        Xresampled = Xresampled[:, selection]
-            if self.multioutput:
-                y = np.stack(
+                Xresampled = Xresampled[:, self.selection_mask_]
+
+            # Reduce variable_names to selection
+            variable_names = [variable_names[i] for i in self.selection_mask_]
+
+            # Re-perform data validation and feature name updating
+            X, y_transformed = self._validate_data(
+                X=X, y=y, reset=True, multi_output=True
+            )
+            # Update feature names with selected variable names
+            self.feature_names_in_ = _check_feature_names_in(self, variable_names)
+            print(f"Using features {list(self.feature_names_in_)}")
+
+        # Denoising transformation
+        if self.denoise:
+            if self.nout_ > 1:
+                y_transformed = np.stack(
                     [
                         _denoise(X, y[:, i], Xresampled=Xresampled)[1]
-                        for i in range(self.nout)
+                        for i in range(self.nout_)
                     ],
                     axis=1,
                 )
@@ -1162,31 +1140,55 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
             else:
                 X, y = _denoise(X, y, Xresampled=Xresampled)

-        self.julia_project, is_shared = _get_julia_project(self.julia_project)
+        return X, y, variable_names
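For intuition about the denoising branch, a hedged sketch using the module-level `_denoise` helper defined at the bottom of this file (synthetic data; the target function is made up):

    import numpy as np

    rng = np.random.RandomState(0)
    X = rng.uniform(-1, 1, (50, 2))
    y = X[:, 0] ** 2 + 0.1 * rng.randn(50)

    # _denoise fits a Gaussian process to (X, y) and returns the GP's
    # smoothed predictions in place of the noisy targets.
    X_clean, y_clean = _denoise(X, y)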
-        tmpdir = Path(tempfile.mkdtemp(dir=self.params["tempdir"]))
+    def _run(self, X, y, weights):
+        """
+        Run the symbolic regression fitting process on the julia backend.

-        if self.params["temp_equation_file"]:
-            self.equation_file = tmpdir / "hall_of_fame.csv"
-        elif self.equation_file is None:
-            date_time = datetime.now().strftime("%Y-%m-%d_%H%M%S.%f")[:-3]
-            self.equation_file = "hall_of_fame_" + date_time + ".csv"
+        Parameters
+        ----------
+        X : {ndarray | pandas.DataFrame} of shape (n_samples, n_features)
+            Training data.

-        _create_inline_operators(
-            binary_operators=binary_operators, unary_operators=unary_operators
-        )
-        _handle_constraints(
-            binary_operators=binary_operators,
-            unary_operators=unary_operators,
-            constraints=constraints,
-        )
+        y : {ndarray | pandas.DataFrame} of shape (n_samples,) or (n_samples, n_targets)
+            Target values. Will be cast to X's dtype if necessary.
+
+        weights : {ndarray | pandas.DataFrame} of the same shape as y, default=None
+            Each element is how to weight the mean-square-error loss
+            for that particular element of y.
+
+        Returns
+        -------
+        self : object
+            Reference to `self` with fitted attributes.
+
+        Raises
+        ------
+        ImportError
+            Raised when the julia backend fails to import a package.
+        """
+
+        # Needs to be global, as we don't want to recreate/reinitialize Julia for every new instance of PySRRegressor
+        global already_ran
+        global Main

-        una_constraints = [constraints[op] for op in unary_operators]
-        bin_constraints = [constraints[op] for op in binary_operators]
+        # Start julia backend processes
+        if Main is None:
+            if self.multithreading:
+                os.environ["JULIA_NUM_THREADS"] = str(self.procs)
+
+            Main = init_julia()
+
+        if self.cluster_manager is not None:
+            Main.eval(f"import ClusterManagers: addprocs_{self.cluster_manager}")
+            self.cluster_manager = Main.eval(f"addprocs_{self.cluster_manager}")
+
+        self.julia_project, is_shared = _get_julia_project(self.julia_project)

         if not already_ran:
             Main.eval("using Pkg")
-            io = "devnull" if self.params["update_verbosity"] == 0 else "stderr"
+            io = "devnull" if self.update_verbosity == 0 else "stderr"
             io_arg = f"io={io}" if is_julia_version_greater_eq(Main, "1.6") else ""

             Main.eval(
@@ -1199,7 +1201,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
             _add_sr_to_julia_project(Main, io_arg)

         try:
-            if update:
+            if self.update:
                 Main.eval(f"Pkg.resolve({io_arg})")
             Main.eval(f"Pkg.instantiate({io_arg})")
         else:
@@ -1214,129 +1216,108 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         Main.pow = Main.eval("(^)")
         Main.div = Main.eval("(/)")

-        nested_constraints = self.params["nested_constraints"]
+        _create_inline_operators(
+            binary_operators=self.binary_operators, unary_operators=self.unary_operators
+        )
+        _handle_constraints(
+            binary_operators=self.binary_operators,
+            unary_operators=self.unary_operators,
+            constraints=self.constraints,
+        )
+
+        una_constraints = [self.constraints[op] for op in self.unary_operators]
+        bin_constraints = [self.constraints[op] for op in self.binary_operators]
+
+        # Parse dict into Julia Dict for nested constraints:
-        if nested_constraints is not None:
+        if self.nested_constraints is not None:
             nested_constraints_str = "Dict("
-            for outer_k, outer_v in nested_constraints.items():
+            for outer_k, outer_v in self.nested_constraints.items():
                 nested_constraints_str += f"({outer_k}) => Dict("
                 for inner_k, inner_v in outer_v.items():
                     nested_constraints_str += f"({inner_k}) => {inner_v}, "
                 nested_constraints_str += "), "
             nested_constraints_str += ")"
-            nested_constraints = Main.eval(nested_constraints_str)
+            self.nested_constraints = Main.eval(nested_constraints_str)
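The string being assembled here is plain Julia source for a `Dict`; mirroring the loop above with the example constraint set from the class docstring:

    # For nested_constraints = {"sin": {"cos": 0}, "cos": {"cos": 2}}, the
    # loop below produces a Julia Dict literal, later passed to Main.eval.
    nested_constraints = {"sin": {"cos": 0}, "cos": {"cos": 2}}
    s = "Dict("
    for outer_k, outer_v in nested_constraints.items():
        s += f"({outer_k}) => Dict("
        for inner_k, inner_v in outer_v.items():
            s += f"({inner_k}) => {inner_v}, "
        s += "), "
    s += ")"
    print(s)  # Dict((sin) => Dict((cos) => 0, ), (cos) => Dict((cos) => 2, ), )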
self.params["complexity_of_operators"] - if complexity_of_operators is not None: + if self.complexity_of_operators is not None: complexity_of_operators_str = "Dict(" - for k, v in complexity_of_operators.items(): + for k, v in self.complexity_of_operators.items(): complexity_of_operators_str += f"({k}) => {v}, " complexity_of_operators_str += ")" - complexity_of_operators = Main.eval(complexity_of_operators_str) + self.complexity_of_operators = Main.eval(complexity_of_operators_str) - Main.custom_loss = Main.eval(loss) + Main.custom_loss = Main.eval(self.loss) mutationWeights = [ - float(weight_mutate_constant), - float(weight_mutate_operator), - float(weight_add_node), - float(weight_insert_node), - float(weight_delete_node), - float(weight_simplify), - float(weight_randomize), - float(weight_do_nothing), + float(self.weight_mutate_constant), + float(self.weight_mutate_operator), + float(self.weight_add_node), + float(self.weight_insert_node), + float(self.weight_delete_node), + float(self.weight_simplify), + float(self.weight_randomize), + float(self.weight_do_nothing), ] - params_to_hash = { - **{k: self.__getattribute__(k) for k in self.surface_parameters}, - **self.params, - } - params_excluded_from_hash = [ - "niterations", - ] - # Delete these^ from params_to_hash: - params_to_hash = { - k: v - for k, v in params_to_hash.items() - if k not in params_excluded_from_hash - } - - # Sort params_to_hash by key: - params_to_hash = OrderedDict(sorted(params_to_hash.items())) - # Hash all parameters: - cur_hash = sha256(str(params_to_hash).encode()).hexdigest() - - if self.params_hash is not None: - if cur_hash != self.params_hash: - warnings.warn( - "Warning: PySR options have changed since the last run. " - "This is experimental and may not work. " - "For example, if the operators change, or even their order," - " the saved equations will be in the wrong format." - "\n\n" - "To reset the search state, run `.reset()`. " - ) - - self.params_hash = cur_hash - # Call to Julia backend. 
# See https://github.com/search?q=%22function+Options%22+repo%3AMilesCranmer%2FSymbolicRegression.jl+path%3A%2Fsrc%2F+filename%3AOptions.jl+language%3AJulia&type=Code options = Main.Options( - binary_operators=Main.eval(str(tuple(binary_operators)).replace("'", "")), - unary_operators=Main.eval(str(tuple(unary_operators)).replace("'", "")), + binary_operators=Main.eval( + str(tuple(self.binary_operators)).replace("'", "") + ), + unary_operators=Main.eval( + str(tuple(self.unary_operators)).replace("'", "") + ), bin_constraints=bin_constraints, una_constraints=una_constraints, - complexity_of_operators=complexity_of_operators, - complexity_of_constants=self.params["complexity_of_constants"], - complexity_of_variables=self.params["complexity_of_variables"], - nested_constraints=nested_constraints, + complexity_of_operators=self.complexity_of_operators, + complexity_of_constants=self.complexity_of_constants, + complexity_of_variables=self.complexity_of_variables, + nested_constraints=self.nested_constraints, loss=Main.custom_loss, - maxsize=int(maxsize), + maxsize=int(self.maxsize), hofFile=_escape_filename(self.equation_file), - npopulations=int(self.params["populations"]), - batching=batching, - batchSize=int( - min([self.params["batch_size"], len(X)]) if batching else len(X) - ), + npopulations=int(self.populations), + batching=self.batching, + batchSize=int(min([self.batch_size, len(X)]) if self.batching else len(X)), mutationWeights=mutationWeights, - probPickFirst=self.params["tournament_selection_p"], - ns=self.params["tournament_selection_n"], + probPickFirst=self.tournament_selection_p, + ns=self.tournament_selection_n, # These have the same name: - parsimony=self.params["parsimony"], - alpha=self.params["alpha"], - maxdepth=self.params["maxdepth"], - fast_cycle=self.params["fast_cycle"], - migration=self.params["migration"], - hofMigration=self.params["hof_migration"], - fractionReplacedHof=self.params["fraction_replaced_hof"], - shouldOptimizeConstants=self.params["should_optimize_constants"], - warmupMaxsizeBy=self.params["warmup_maxsize_by"], - useFrequency=self.params["use_frequency"], - useFrequencyInTournament=self.params["use_frequency_in_tournament"], - npop=self.params["population_size"], - ncyclesperiteration=self.params["ncyclesperiteration"], - fractionReplaced=self.params["fraction_replaced"], - topn=self.params["topn"], - verbosity=self.params["verbosity"], - optimizer_algorithm=self.params["optimizer_algorithm"], - optimizer_nrestarts=self.params["optimizer_nrestarts"], - optimize_probability=self.params["optimize_probability"], - optimizer_iterations=self.params["optimizer_iterations"], - perturbationFactor=self.params["perturbation_factor"], - annealing=self.params["annealing"], + parsimony=self.parsimony, + alpha=self.alpha, + maxdepth=self.maxdepth, + fast_cycle=self.fast_cycle, + migration=self.migration, + hofMigration=self.hof_migration, + fractionReplacedHof=self.fraction_replaced_hof, + shouldOptimizeConstants=self.should_optimize_constants, + warmupMaxsizeBy=self.warmup_maxsize_by, + useFrequency=self.use_frequency, + useFrequencyInTournament=self.use_frequency_in_tournament, + npop=self.population_size, + ncyclesperiteration=self.ncyclesperiteration, + fractionReplaced=self.fraction_replaced, + topn=self.topn, + verbosity=self.verbosity, + optimizer_algorithm=self.optimizer_algorithm, + optimizer_nrestarts=self.optimizer_nrestarts, + optimize_probability=self.optimize_probability, + optimizer_iterations=self.optimizer_iterations, + 
            perturbationFactor=self.perturbation_factor,
+            annealing=self.annealing,
             stateReturn=True,  # Required for state saving.
-            progress=self.params["progress"],
-            timeout_in_seconds=self.params["timeout_in_seconds"],
-            crossoverProbability=self.params["crossover_probability"],
-            skip_mutation_failures=self.params["skip_mutation_failures"],
-            max_evals=self.params["max_evals"],
-            earlyStopCondition=self.params["early_stop_condition"],
+            progress=self.progress,
+            timeout_in_seconds=self.timeout_in_seconds,
+            crossoverProbability=self.crossover_probability,
+            skip_mutation_failures=self.skip_mutation_failures,
+            max_evals=self.max_evals,
+            earlyStopCondition=self.early_stop_condition,
         )

-        np_dtype = {16: np.float16, 32: np.float32, 64: np.float64}[
-            self.params["precision"]
-        ]
+        # Convert data to desired precision
+        np_dtype = {16: np.float16, 32: np.float32, 64: np.float64}[self.precision]

         Main.X = np.array(X, dtype=np_dtype).T
         if len(y.shape) == 1:
@@ -1351,48 +1332,312 @@ class PySRRegressor(BaseEstimator, RegressorMixin):
         else:
             Main.weights = None

-        cprocs = 0 if multithreading else procs
+        cprocs = 0 if self.multithreading else self.procs

         # Call to Julia backend.
         # See https://github.com/search?q=%22function+EquationSearch%22+repo%3AMilesCranmer%2FSymbolicRegression.jl+path%3A%2Fsrc%2F+filename%3ASymbolicRegression.jl+language%3AJulia&type=Code
-        self.raw_julia_state = Main.EquationSearch(
+        self.raw_julia_state_ = Main.EquationSearch(
             Main.X,
             Main.y,
             weights=Main.weights,
-            niterations=int(self.params["niterations"]),
-            varMap=(
-                variable_names
-                if selection is None
-                else [variable_names[i] for i in selection]
-            ),
+            niterations=int(self.niterations),
+            varMap=self.feature_names_in_.tolist(),
             options=options,
             numprocs=int(cprocs),
-            multithreading=bool(multithreading),
-            saved_state=self.raw_julia_state,
-            addprocs_function=cluster_manager,
+            multithreading=bool(self.multithreading),
+            saved_state=self.raw_julia_state_,
+            addprocs_function=self.cluster_manager,
         )

-        self.variable_names = variable_names
-        self.selection = selection
+        # Set attributes
+        self.equations_ = self.get_hof()

-        # Not in params:
-        # selection, variable_names, multioutput
+        if self.delete_tempfiles:
+            shutil.rmtree(self.tempdir_)

-        self.equations = self.get_hof()
+        already_ran = True

-        if self.params["delete_tempfiles"]:
-            shutil.rmtree(tmpdir)
+        return self

-        already_ran = True
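Stepping back from the internals, a hedged sketch of the user-facing flow this method implements (assumes PySR and its Julia dependencies are installed; the target function is made up for illustration):

    import numpy as np
    from pysr import PySRRegressor

    X = 2 * np.random.randn(100, 5)
    y = 2.5382 * np.cos(X[:, 3]) + X[:, 0] ** 2 - 0.5

    model = PySRRegressor(
        niterations=5,
        binary_operators=["+", "*"],
        unary_operators=["cos"],
    )
    model.fit(X, y)       # drives EquationSearch on the Julia backend
    print(model.sympy())  # best equation per `model_selection`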
+    def fit(
+        self,
+        X,
+        y,
+        Xresampled=None,
+        weights=None,
+        variable_names=None,
+        from_equation_file=False,
+    ):
+        """
+        Search for equations to fit the dataset and store them in `self.equations_`.
+
+        Parameters
+        ----------
+        X : {ndarray | pandas.DataFrame} of shape (n_samples, n_features)
+            Training data.
+
+        y : {ndarray | pandas.DataFrame} of shape (n_samples,) or (n_samples, n_targets)
+            Target values. Will be cast to X's dtype if necessary.
+
+        Xresampled : {ndarray | pandas.DataFrame} of shape (n_resampled, n_features), default=None
+            Resampled training data used for denoising.
+
+        weights : {ndarray | pandas.DataFrame} of the same shape as y, default=None
+            Each element is how to weight the mean-square-error loss
+            for that particular element of y.
+
+        variable_names : list[str], default=None
+            A list of names for the variables, other than "x0", "x1", etc.
+            If :param`X` is a pandas dataframe, the column names will be used.
+            If :param`variable_names` is also specified, it will be reset
+            with a warning and the column names used instead.
+
+        from_equation_file : bool, default=False
+            Allows model to be initialized/fit from a previous run that has
+            been saved to a file. If true, a value of y still needs to be
+            passed such that `nout_` can be determined, however, the values of
+            y are irrelevant and can be all zeros.
+
+        Returns
+        -------
+        self : object
+            Fitted Estimator.
+        """
+
+        # Init attributes that are not specified in BaseEstimator
+        self.equations_ = None
+        self.nout_ = 1
+        self.selection_mask_ = None
+        self.raw_julia_state_ = None
+
+        # Parameter input validation (for parameters defined in __init__)
+        self._validate_params(n_samples=X.shape[0])
+        X, y, variable_names = self._validate_fit_params(
+            X, y, Xresampled, variable_names
+        )
+
+        # Pre transformations (feature selection and denoising)
+        X, y, variable_names = self._pre_transform_training_data(
+            X, y, Xresampled, variable_names
+        )
+
+        # Warn about large feature counts (still warn if feature count is large after running feature selection)
+        if self.n_features_in_ >= 10:
+            warnings.warn(
+                "Note: you are running with 10 features or more. "
+                "Genetic algorithms like those used in PySR scale poorly with large numbers of features. "
+                "Consider using feature selection techniques to select the most important features "
+                "(you can do this automatically with the `select_k_features` parameter), "
+                "or, alternatively, doing a dimensionality reduction beforehand. "
+                "For example, `X = PCA(n_components=6).fit_transform(X)`, "
+                "using scikit-learn's `PCA` class, "
+                "will reduce the number of features to 6 in an interpretable way, "
+                "as each resultant feature "
+                "will be a linear combination of the original features. "
+            )
+
+        # Assertion checks
+        use_custom_variable_names = variable_names is not None
+        # TODO: this is always true.
+
+        _check_assertions(
+            X,
+            self.binary_operators,
+            self.unary_operators,
+            use_custom_variable_names,
+            variable_names,
+            weights,
+            y,
+        )
+
+        # Fitting procedure
+        if not from_equation_file:
+            self._run(X=X, y=y, weights=weights)
+        else:
+            self.equations_ = self.get_hof()
+
+        return self
+
+    def refresh(self):
+        """
+        Updates self.equations_ with any new options passed, such as
+        :param`extra_sympy_mappings`.
+        """
+        self.equations_ = self.get_hof()
+
+    def _decision_function(self, X, best_equation):
+        """
+        Decide what value to predict based on the 'best' equation found
+        from fitting.
+
+        Parameters
+        ----------
+        X : {ndarray | pandas.DataFrame} of shape (n_samples, n_features)
+            Testing data for evaluating the model.
+
+        best_equation : pd.Series
+            Selected best equation from `self.equations_`.
+
+        Returns
+        -------
+        y_predicted : ndarray of shape (n_samples,) or (n_samples, nout_)
+            Values predicted by substituting `X` into the
+            :param`best_equation`.
+
+        Raises
+        ------
+        ValueError
+            Raised if the `best_equation` cannot be evaluated.
+        """
+        check_is_fitted(self)
+
+        if isinstance(X, pd.DataFrame):
+            X = X[self.feature_names_in_]
+        elif self.selection_mask_ is not None:
+            X = X[:, self.selection_mask_]
+
+        X = self._validate_data(X, reset=False)
+        try:
+            if self.nout_ > 1:
+                return np.stack(
+                    [eq["lambda_format"](X) for eq in best_equation], axis=1
+                )
+            return best_equation["lambda_format"](X)
+        except Exception as error:
+            raise ValueError(
+                "Failed to evaluate the expression. "
+                "If you are using a custom operator, make sure to define it in :param`extra_sympy_mappings`, "
+                "e.g., `model.set_params(extra_sympy_mappings={'inv': lambda x: 1 / x})`."
+            ) from error
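One consequence of the DataFrame branch above: columns are realigned by name before evaluation, so their order at predict time does not matter. A sketch with hypothetical feature names:

    import pandas as pd

    feature_names_in_ = ["x0", "x1"]
    X = pd.DataFrame({"x1": [1.0], "x0": [2.0]})  # order differs from training
    X = X[feature_names_in_]                      # realigned to ['x0', 'x1']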
+    def predict(self, X, index=None):
+        """Predict y from input X using the equation chosen by `model_selection`.
+
+        You may see what equation is used by printing this object.
+        X should have the same columns as the training data.
+
+        Parameters
+        ----------
+        X : {ndarray | pandas.DataFrame} of shape (n_samples, n_features)
+            Testing data.
+
+        index : int, default=None
+            If you want to compute the output of an expression using a
+            particular row of `self.equations_`, you may specify the index here.
+
+        Returns
+        -------
+        y_predicted : ndarray of shape (n_samples, nout_)
+            Values predicted by substituting `X` into the fitted symbolic
+            regression model.
+        """
+        self.refresh()
+        best_equation = self.get_best(index=index)
+        return self._decision_function(X, best_equation)
+
+    def sympy(self, index=None):
+        """Return sympy representation of the equation(s) chosen by `model_selection`.
+
+        Parameters
+        ----------
+        index : int, default=None
+            If you wish to select a particular equation from
+            `self.equations_`, give the index number here. This overrides
+            the `model_selection` parameter.
+
+        Returns
+        -------
+        best_equation : str, list[str] of length nout_
+            SymPy representation of the best equation.
+        """
+        self.refresh()
+        best = self.get_best(index=index)
+        if self.nout_ > 1:
+            return [eq["sympy_format"] for eq in best]
+        return best["sympy_format"]
+
+    def latex(self, index=None):
+        """Return latex representation of the equation(s) chosen by `model_selection`.
+
+        Parameters
+        ----------
+        index : int, default=None
+            If you wish to select a particular equation from
+            `self.equations_`, give the index number here. This overrides
+            the `model_selection` parameter.
+
+        Returns
+        -------
+        best_equation : str or list[str] of length nout_
+            LaTeX expression of the best equation.
+        """
+        self.refresh()
+        sympy_representation = self.sympy(index=index)
+        if self.nout_ > 1:
+            return [sympy.latex(s) for s in sympy_representation]
+        return sympy.latex(sympy_representation)
+
+    def jax(self, index=None):
+        """Return jax representation of the equation(s) chosen by `model_selection`.
+
+        Each equation (multiple given if there are multiple outputs) is a dictionary
+        containing {"callable": func, "parameters": params}. To call `func`, pass
+        func(X, params). This function is differentiable using `jax.grad`.
+
+        Parameters
+        ----------
+        index : int, default=None
+            If you wish to select a particular equation from
+            `self.equations_`, give the row number here. This overrides
+            the `model_selection` parameter.
+
+        Returns
+        -------
+        best_equation : dict[str, Any]
+            Dictionary of callable jax function in "callable" key,
+            and jax array of parameters as "parameters" key.
+        """
+
+        self.set_params(output_jax_format=True)
+        self.refresh()
+        best = self.get_best(index=index)
+        if self.nout_ > 1:
+            return [eq["jax_format"] for eq in best]
+        return best["jax_format"]
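Continuing the fit sketch near `fit` above, a hedged example of consuming the JAX export for a single-output model (assumes `jax` is installed and `model`, `X`, `y` are as in that sketch):

    import jax

    eq = model.jax()  # {"callable": func, "parameters": params}
    f, params = eq["callable"], eq["parameters"]
    y_pred = f(X, params)
    # Differentiable in the constants, e.g. for fine-tuning them:
    grads = jax.grad(lambda p: ((f(X, p) - y) ** 2).mean())(params)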
+ """ + self.set_params(output_torch_format=True) + self.refresh() + best = self.get_best(index=index) + if self.nout_ > 1: + return [eq["torch_format"] for eq in best] + return best["torch_format"] def get_hof(self): """Get the equations from a hall of fame file. If no arguments entered, the ones used previously from a call to PySR will be used.""" - try: - if self.multioutput: + if self.nout_ > 1: all_outputs = [] - for i in range(1, self.nout + 1): + for i in range(1, self.nout_ + 1): df = pd.read_csv( str(self.equation_file) + f".out{i}" + ".bkup", sep="|", @@ -1436,20 +1681,15 @@ class PySRRegressor(BaseEstimator, RegressorMixin): jax_format = [] if self.output_torch_format: torch_format = [] - use_custom_variable_names = len(self.variable_names) != 0 local_sympy_mappings = { **self.extra_sympy_mappings, **sympy_mappings, } - if use_custom_variable_names: - sympy_symbols = [ - sympy.Symbol(self.variable_names[i]) for i in range(self.n_features) - ] - else: - sympy_symbols = [ - sympy.Symbol("x%d" % i) for i in range(self.n_features) - ] + sympy_symbols = [ + sympy.Symbol(self.feature_names_in_[i]) + for i in range(self.n_features_in_) + ] for _, eqn_row in output.iterrows(): eqn = sympify(eqn_row["equation"], locals=local_sympy_mappings) @@ -1458,7 +1698,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin): # Numpy: lambda_format.append( CallableEquation( - sympy_symbols, eqn, self.selection, self.variable_names + sympy_symbols, eqn, self.selection_mask_, self.feature_names_in_ ) ) @@ -1469,7 +1709,6 @@ class PySRRegressor(BaseEstimator, RegressorMixin): func, params = sympy2jax( eqn, sympy_symbols, - selection=self.selection, extra_jax_mappings=self.extra_jax_mappings, ) jax_format.append({"callable": func, "parameters": params}) @@ -1481,7 +1720,6 @@ class PySRRegressor(BaseEstimator, RegressorMixin): module = sympy2torch( eqn, sympy_symbols, - selection=self.selection, extra_torch_mappings=self.extra_torch_mappings, ) torch_format.append(module) @@ -1523,11 +1761,48 @@ class PySRRegressor(BaseEstimator, RegressorMixin): ret_outputs.append(output[output_cols]) - if self.multioutput: + if self.nout_ > 1: return ret_outputs return ret_outputs[0] - def score(self, X, y): - del X - del y - raise NotImplementedError + +def _denoise(X, y, Xresampled=None): + """Denoise the dataset using a Gaussian process""" + from sklearn.gaussian_process import GaussianProcessRegressor + from sklearn.gaussian_process.kernels import RBF, WhiteKernel, ConstantKernel + + gp_kernel = RBF(np.ones(X.shape[1])) + WhiteKernel(1e-1) + ConstantKernel() + gpr = GaussianProcessRegressor(kernel=gp_kernel, n_restarts_optimizer=50) + gpr.fit(X, y) + if Xresampled is not None: + return Xresampled, gpr.predict(Xresampled) + + return X, gpr.predict(X) + + +# Function hasnot been removed only due to usage in module tests +def _handle_feature_selection(X, select_k_features, y, variable_names): + if select_k_features is not None: + selection = run_feature_selection(X, y, select_k_features) + print(f"Using features {[variable_names[i] for i in selection]}") + X = X[:, selection] + + else: + selection = None + return X, selection + + +def run_feature_selection(X, y, select_k_features): + """Use a gradient boosting tree regressor as a proxy for finding + the k most important features in X, returning indices for those + features as output.""" + + from sklearn.ensemble import RandomForestRegressor + from sklearn.feature_selection import SelectFromModel + + clf = RandomForestRegressor(n_estimators=100, max_depth=3, 
+
+
+def run_feature_selection(X, y, select_k_features):
+    """Use a random forest regressor as a proxy for finding
+    the k most important features in X, returning indices for those
+    features as output."""

+    from sklearn.ensemble import RandomForestRegressor
+    from sklearn.feature_selection import SelectFromModel
+
+    clf = RandomForestRegressor(n_estimators=100, max_depth=3, random_state=0)
+    clf.fit(X, y)
+    selector = SelectFromModel(
+        clf, threshold=-np.inf, max_features=select_k_features, prefit=True
+    )
+    return selector.get_support(indices=True)
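A usage sketch of the selector on synthetic data (exact indices depend on the data, but here the informative features should win):

    import numpy as np

    rng = np.random.RandomState(0)
    X = rng.randn(500, 5)
    y = 3 * X[:, 1] - X[:, 4]

    # Returns the indices of the k features the random forest deems
    # most important; expected to be [1, 4] for this target.
    print(run_feature_selection(X, y, select_k_features=2))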