MilesCranmer committed
Merge pull request #609 from MilesCranmer/cleanup

More extensive typing stubs and associated refactoring
- .gitignore +2 -0
- environment.yml +0 -1
- pyproject.toml +6 -0
- pysr/denoising.py +17 -4
- pysr/export_latex.py +12 -0
- pysr/export_numpy.py +10 -2
- pysr/export_sympy.py +7 -5
- pysr/feature_selection.py +19 -3
- pysr/julia_helpers.py +17 -5
- pysr/julia_import.py +5 -0
- pysr/sr.py +260 -156
- pysr/test/test.py +13 -3
- pysr/utils.py +10 -2
- requirements.txt +0 -1
.gitignore CHANGED
@@ -23,3 +23,5 @@ site
 **/*.code-workspace
 **/*.tar.gz
 venv
+requirements-dev.lock
+requirements.lock
environment.yml CHANGED
@@ -9,4 +9,3 @@ dependencies:
   - scikit-learn>=1.0.0,<2.0.0
   - pyjuliacall>=0.9.15,<0.10.0
   - click>=7.0.0,<9.0.0
-  - typing_extensions>=4.0.0,<5.0.0
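The dropped `typing_extensions` pin matches a change shown in the pysr/sr.py hunks further below, where the Python 3.7 fallback import is deleted in favor of the stdlib `Literal`. Condensed from that hunk (comment lines show the removed code):

# Removed (only needed on Python < 3.8):
# if sys.version_info >= (3, 8):
#     from typing import Literal
# else:
#     from typing_extensions import Literal

# Kept (stdlib since Python 3.8):
from typing import Literal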
pyproject.toml CHANGED
@@ -35,4 +35,10 @@ dev-dependencies = [
     "pre-commit>=3.7.0",
     "ipython>=8.23.0",
     "ipykernel>=6.29.4",
+    "mypy>=1.10.0",
+    "jax[cpu]>=0.4.26",
+    "torch>=2.3.0",
+    "pandas-stubs>=2.2.1.240316",
+    "types-pytz>=2024.1.0.20240417",
+    "types-openpyxl>=3.1.0.20240428",
 ]
pysr/denoising.py CHANGED
@@ -1,9 +1,17 @@
 """Functions for denoising data during preprocessing."""
 
+from typing import Optional, Tuple, cast
+
 import numpy as np
+from numpy import ndarray
 
 
-def denoise(X, y, Xresampled=None, random_state=None):
+def denoise(
+    X: ndarray,
+    y: ndarray,
+    Xresampled: Optional[ndarray] = None,
+    random_state: Optional[np.random.RandomState] = None,
+) -> Tuple[ndarray, ndarray]:
     """Denoise the dataset using a Gaussian process."""
     from sklearn.gaussian_process import GaussianProcessRegressor
     from sklearn.gaussian_process.kernels import RBF, ConstantKernel, WhiteKernel
@@ -15,12 +23,17 @@ def denoise(X, y, Xresampled=None, random_state=None):
     gpr.fit(X, y)
 
     if Xresampled is not None:
-        return Xresampled, gpr.predict(Xresampled)
+        return Xresampled, cast(ndarray, gpr.predict(Xresampled))
 
-    return X, gpr.predict(X)
+    return X, cast(ndarray, gpr.predict(X))
 
 
-def multi_denoise(X, y, Xresampled=None, random_state=None):
+def multi_denoise(
+    X: ndarray,
+    y: ndarray,
+    Xresampled: Optional[ndarray] = None,
+    random_state: Optional[np.random.RandomState] = None,
+):
     """Perform `denoise` along each column of `y` independently."""
     y = np.stack(
         [
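Illustrative only (not part of the diff): with the new annotations, `denoise` takes and returns plain ndarrays, so a minimal call looks like

import numpy as np
from pysr.denoising import denoise

rng = np.random.RandomState(0)
X = rng.randn(100, 2)                       # (n_samples, n_features)
y = np.sin(X[:, 0]) + 0.1 * rng.randn(100)  # noisy scalar target
X_out, y_smooth = denoise(X, y, random_state=rng)  # GP-smoothed copy of y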
pysr/export_latex.py CHANGED
@@ -153,3 +153,15 @@ def sympy2multilatextable(
     ]
 
     return "\n\n".join(latex_tables)
+
+
+def with_preamble(table_string: str) -> str:
+    preamble_string = [
+        r"\usepackage{breqn}",
+        r"\usepackage{booktabs}",
+        "",
+        "...",
+        "",
+        table_string,
+    ]
+    return "\n".join(preamble_string)
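The new `with_preamble` helper (picked up by `latex_table` in pysr/sr.py below) just prepends a fixed preamble to a rendered table. Illustrative usage, given the definition above:

from pysr.export_latex import with_preamble

table = r"\begin{tabular}{cc} ... \end{tabular}"  # hypothetical table body
print(with_preamble(table))
# \usepackage{breqn}
# \usepackage{booktabs}
#
# ...
#
# \begin{tabular}{cc} ... \end{tabular}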
pysr/export_numpy.py CHANGED
@@ -1,10 +1,12 @@
 """Code for exporting discovered expressions to numpy"""
 
 import warnings
+from typing import List, Union
 
 import numpy as np
 import pandas as pd
-from sympy import lambdify
+from numpy.typing import NDArray
+from sympy import Expr, Symbol, lambdify
 
 
 def sympy2numpy(eqn, sympy_symbols, *, selection=None):
@@ -14,6 +16,10 @@ def sympy2numpy(eqn, sympy_symbols, *, selection=None):
 class CallableEquation:
     """Simple wrapper for numpy lambda functions built with sympy"""
 
+    _sympy: Expr
+    _sympy_symbols: List[Symbol]
+    _selection: Union[NDArray[np.bool_], None]
+
     def __init__(self, eqn, sympy_symbols, selection=None):
         self._sympy = eqn
         self._sympy_symbols = sympy_symbols
@@ -29,8 +35,9 @@ class CallableEquation:
             return self._lambda(
                 **{k: X[k].values for k in map(str, self._sympy_symbols)}
             ) * np.ones(expected_shape)
+
         if self._selection is not None:
-            if X.shape[1] != len(self._selection):
+            if X.shape[1] != self._selection.sum():
                 warnings.warn(
                     "`X` should be of shape (n_samples, len(self._selection)). "
                     "Automatically filtering `X` to selection. "
@@ -38,6 +45,7 @@ class CallableEquation:
                     "this may lead to incorrect predictions and other errors."
                 )
                 X = X[:, self._selection]
+
         return self._lambda(*X.T) * np.ones(expected_shape)
 
     @property
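`_selection` is now a boolean mask (`NDArray[np.bool_]`) rather than an index array, which is why the shape check compares `X.shape[1]` against `self._selection.sum()`. A standalone illustration:

import numpy as np

selection = np.array([True, False, True, False])  # mask over 4 original features
len(selection)   # 4: length of the mask (the original feature count)
selection.sum()  # 2: number of selected features, i.e. the expected X.shape[1]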
pysr/export_sympy.py CHANGED
@@ -5,6 +5,8 @@ from typing import Callable, Dict, List, Optional
 import sympy
 from sympy import sympify
 
+from .utils import ArrayLike
+
 sympy_mappings = {
     "div": lambda x, y: x / y,
     "mult": lambda x, y: x * y,
@@ -30,8 +32,8 @@ sympy_mappings = {
     "acosh": lambda x: sympy.acosh(x),
     "acosh_abs": lambda x: sympy.acosh(abs(x) + 1),
     "asinh": sympy.asinh,
-    "atanh": lambda x: sympy.atanh(sympy.Mod(x + 1, 2) - 1),
-    "atanh_clip": lambda x: sympy.atanh(sympy.Mod(x + 1, 2) - 1),
+    "atanh": lambda x: sympy.atanh(sympy.Mod(x + 1, 2) - sympy.S(1)),
+    "atanh_clip": lambda x: sympy.atanh(sympy.Mod(x + 1, 2) - sympy.S(1)),
     "abs": abs,
     "mod": sympy.Mod,
     "erf": sympy.erf,
@@ -60,13 +62,13 @@ sympy_mappings = {
 
 
 def create_sympy_symbols_map(
-    feature_names_in: List[str],
+    feature_names_in: ArrayLike[str],
 ) -> Dict[str, sympy.Symbol]:
     return {variable: sympy.Symbol(variable) for variable in feature_names_in}
 
 
 def create_sympy_symbols(
-    feature_names_in: List[str],
+    feature_names_in: ArrayLike[str],
 ) -> List[sympy.Symbol]:
     return [sympy.Symbol(variable) for variable in feature_names_in]
 
@@ -74,7 +76,7 @@ def create_sympy_symbols(
 def pysr2sympy(
     equation: str,
     *,
-    feature_names_in: Optional[List[str]] = None,
+    feature_names_in: Optional[ArrayLike[str]] = None,
     extra_sympy_mappings: Optional[Dict[str, Callable]] = None,
 ):
     if feature_names_in is None:
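`sympy.S(1)` is the exact SymPy integer; SymPy sympifies a bare Python `1` to the same object, so the rewritten `atanh` mappings build an identical expression and the explicit `S()` mainly helps static type checkers. A quick illustrative check:

import sympy

x = sympy.Symbol("x")
assert sympy.atanh(sympy.Mod(x + 1, 2) - 1) == sympy.atanh(
    sympy.Mod(x + 1, 2) - sympy.S(1)
)  # structurally identical expressions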
pysr/feature_selection.py CHANGED
@@ -1,9 +1,20 @@
 """Functions for doing feature selection during preprocessing."""
 
+from typing import Optional, cast
+
 import numpy as np
+from numpy import ndarray
+from numpy.typing import NDArray
+
+from .utils import ArrayLike
 
 
-def run_feature_selection(X, y, select_k_features, random_state=None):
+def run_feature_selection(
+    X: ndarray,
+    y: ndarray,
+    select_k_features: int,
+    random_state: Optional[np.random.RandomState] = None,
+) -> NDArray[np.bool_]:
     """
     Find most important features.
 
@@ -21,11 +32,16 @@ def run_feature_selection(X, y, select_k_features, random_state=None):
     selector = SelectFromModel(
         clf, threshold=-np.inf, max_features=select_k_features, prefit=True
     )
-    return selector.get_support(indices=True)
+    return cast(NDArray[np.bool_], selector.get_support(indices=False))
 
 
 # Function has not been removed only due to usage in module tests
-def _handle_feature_selection(X, select_k_features, y, variable_names):
+def _handle_feature_selection(
+    X: ndarray,
+    select_k_features: Optional[int],
+    y: ndarray,
+    variable_names: ArrayLike[str],
+):
     if select_k_features is not None:
         selection = run_feature_selection(X, y, select_k_features)
         print(f"Using features {[variable_names[i] for i in selection]}")
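`get_support(indices=False)` is what makes the new `NDArray[np.bool_]` return type accurate: scikit-learn then reports a boolean mask over all input features instead of an integer index array. Illustrative only (the estimator choice here is arbitrary, not taken from the diff):

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel

rng = np.random.RandomState(0)
X, y = rng.randn(50, 4), rng.randn(50)
clf = RandomForestRegressor(n_estimators=10, random_state=0).fit(X, y)
selector = SelectFromModel(clf, threshold=-np.inf, max_features=2, prefit=True)
selector.get_support(indices=False)  # e.g. array([ True, False,  True, False])
selector.get_support(indices=True)   # e.g. array([0, 2])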
pysr/julia_helpers.py CHANGED
@@ -1,11 +1,16 @@
 """Functions for initializing the Julia environment and installing deps."""
 
+from typing import Any, Callable, Union, cast
+
 import numpy as np
 from juliacall import convert as jl_convert  # type: ignore
+from numpy.typing import NDArray
 
 from .deprecated import init_julia, install
 from .julia_import import jl
 
+jl_convert = cast(Callable[[Any, Any], Any], jl_convert)
+
 jl.seval("using Serialization: Serialization")
 jl.seval("using PythonCall: PythonCall")
 
@@ -22,24 +27,31 @@ def _escape_filename(filename):
     return str_repr
 
 
-def _load_cluster_manager(cluster_manager):
+def _load_cluster_manager(cluster_manager: str):
     jl.seval(f"using ClusterManagers: addprocs_{cluster_manager}")
     return jl.seval(f"addprocs_{cluster_manager}")
 
 
-def jl_array(x):
+def jl_array(x, dtype=None):
     if x is None:
         return None
-    return jl_convert(jl.Array, x)
+    elif dtype is None:
+        return jl_convert(jl.Array, x)
+    else:
+        return jl_convert(jl.Array[dtype], x)
+
+
+def jl_is_function(f) -> bool:
+    return cast(bool, jl.seval("op -> op isa Function")(f))
 
 
-def jl_serialize(obj):
+def jl_serialize(obj: Any) -> NDArray[np.uint8]:
     buf = jl.IOBuffer()
     Serialization.serialize(buf, obj)
     return np.array(jl.take_b(buf))
 
 
-def jl_deserialize(s):
+def jl_deserialize(s: Union[NDArray[np.uint8], None]):
     if s is None:
         return s
     buf = jl.IOBuffer()
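With the new `dtype` parameter, `jl_array` can build typed Julia vectors; this is what pysr/sr.py below uses to pass a `Vector{Function}` of operators. A hedged sketch of the three branches (assumes a working juliacall/Julia install):

from pysr.julia_helpers import jl_array
from pysr.julia_import import jl

jl_array([1.0, 2.0])                    # jl_convert(jl.Array, x), as before
jl_array([1.0, 2.0], dtype=jl.Float64)  # jl_convert(jl.Array[jl.Float64], x)
jl_array(None)                          # None still passes through unchanged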
pysr/julia_import.py CHANGED
@@ -1,6 +1,8 @@
 import os
 import sys
 import warnings
+from types import ModuleType
+from typing import cast
 
 # Check if JuliaCall is already loaded, and if so, warn the user
 # about the relevant environment variables. If not loaded,
@@ -42,6 +44,9 @@ if autoload_extensions is not None:
 
 from juliacall import Main as jl  # type: ignore
 
+jl = cast(ModuleType, jl)
+
+
 jl_version = (jl.VERSION.major, jl.VERSION.minor, jl.VERSION.patch)
 
 jl.seval("using SymbolicRegression")
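`cast(ModuleType, jl)` does nothing at runtime; it only tells the type checker to treat the dynamically created juliacall `Main` module as a regular module (whose attributes type as `Any`). The same pattern in isolation:

import importlib
from types import ModuleType
from typing import cast

mod = cast(ModuleType, importlib.import_module("math"))  # no-op at runtime
print(mod.sqrt(4.0))  # attribute access now satisfies the type checker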
pysr/sr.py
CHANGED
@@ -8,27 +8,31 @@ import shutil
|
|
8 |
import sys
|
9 |
import tempfile
|
10 |
import warnings
|
|
|
11 |
from datetime import datetime
|
12 |
from io import StringIO
|
13 |
from multiprocessing import cpu_count
|
14 |
from pathlib import Path
|
15 |
-
from typing import Callable, Dict, List, Optional, Tuple, Union
|
16 |
-
|
17 |
-
if sys.version_info >= (3, 8):
|
18 |
-
from typing import Literal
|
19 |
-
else:
|
20 |
-
from typing_extensions import Literal
|
21 |
|
22 |
import numpy as np
|
23 |
import pandas as pd
|
|
|
|
|
24 |
from sklearn.base import BaseEstimator, MultiOutputMixin, RegressorMixin
|
25 |
from sklearn.utils import check_array, check_consistent_length, check_random_state
|
26 |
-
from sklearn.utils.validation import _check_feature_names_in
|
|
|
27 |
|
28 |
from .denoising import denoise, multi_denoise
|
29 |
from .deprecated import DEPRECATED_KWARGS
|
30 |
from .export_jax import sympy2jax
|
31 |
-
from .export_latex import
|
|
|
|
|
|
|
|
|
|
|
32 |
from .export_numpy import sympy2numpy
|
33 |
from .export_sympy import assert_valid_sympy_symbol, create_sympy_symbols, pysr2sympy
|
34 |
from .export_torch import sympy2torch
|
@@ -40,17 +44,20 @@ from .julia_helpers import (
|
|
40 |
_load_cluster_manager,
|
41 |
jl_array,
|
42 |
jl_deserialize,
|
|
|
43 |
jl_serialize,
|
44 |
)
|
45 |
from .julia_import import SymbolicRegression, jl
|
46 |
from .utils import (
|
|
|
|
|
47 |
_csv_filename_to_pkl_filename,
|
48 |
_preprocess_julia_floats,
|
49 |
_safe_check_feature_names_in,
|
50 |
_subscriptify,
|
51 |
)
|
52 |
|
53 |
-
|
54 |
|
55 |
|
56 |
def _process_constraints(binary_operators, unary_operators, constraints):
|
@@ -178,6 +185,21 @@ def _check_assertions(
|
|
178 |
VALID_OPTIMIZER_ALGORITHMS = ["BFGS", "NelderMead"]
|
179 |
|
180 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
181 |
class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
182 |
"""
|
183 |
High-performance symbolic regression algorithm.
|
@@ -606,22 +628,17 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
606 |
Units of each variable in the training dataset, `y`.
|
607 |
nout_ : int
|
608 |
Number of output dimensions.
|
609 |
-
selection_mask_ :
|
610 |
-
|
611 |
-
`select_k_features` is set.
|
612 |
tempdir_ : Path
|
613 |
Path to the temporary equations directory.
|
614 |
-
equation_file_ : str
|
615 |
Output equation file name produced by the julia backend.
|
616 |
julia_state_stream_ : ndarray
|
617 |
The serialized state for the julia SymbolicRegression.jl backend (after fitting),
|
618 |
stored as an array of uint8, produced by Julia's Serialization.serialize function.
|
619 |
-
julia_state_
|
620 |
-
The deserialized state.
|
621 |
julia_options_stream_ : ndarray
|
622 |
The serialized julia options, stored as an array of uint8,
|
623 |
-
julia_options_
|
624 |
-
The deserialized julia options.
|
625 |
equation_file_contents_ : list[pandas.DataFrame]
|
626 |
Contents of the equation file output by the Julia backend.
|
627 |
show_pickle_warnings_ : bool
|
@@ -668,6 +685,21 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
668 |
```
|
669 |
"""
|
670 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
671 |
def __init__(
|
672 |
self,
|
673 |
model_selection: Literal["best", "accuracy", "score"] = "best",
|
@@ -900,14 +932,14 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
900 |
@classmethod
|
901 |
def from_file(
|
902 |
cls,
|
903 |
-
equation_file,
|
904 |
*,
|
905 |
-
binary_operators=None,
|
906 |
-
unary_operators=None,
|
907 |
-
n_features_in=None,
|
908 |
-
feature_names_in=None,
|
909 |
-
selection_mask=None,
|
910 |
-
nout=1,
|
911 |
**pysr_kwargs,
|
912 |
):
|
913 |
"""
|
@@ -915,7 +947,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
915 |
|
916 |
Parameters
|
917 |
----------
|
918 |
-
equation_file : str
|
919 |
Path to a pickle file containing a saved model, or a csv file
|
920 |
containing equations.
|
921 |
binary_operators : list[str]
|
@@ -930,8 +962,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
930 |
feature_names_in : list[str]
|
931 |
Names of the features passed to the model.
|
932 |
Not needed if loading from a pickle file.
|
933 |
-
selection_mask :
|
934 |
-
If using select_k_features
|
935 |
Not needed if loading from a pickle file.
|
936 |
nout : int
|
937 |
Number of outputs of the model.
|
@@ -982,7 +1014,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
982 |
|
983 |
# TODO: copy .bkup file if exists.
|
984 |
model = cls(
|
985 |
-
equation_file=equation_file,
|
986 |
binary_operators=binary_operators,
|
987 |
unary_operators=unary_operators,
|
988 |
**pysr_kwargs,
|
@@ -1002,7 +1034,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1002 |
model.display_feature_names_in_ = feature_names_in
|
1003 |
|
1004 |
if selection_mask is None:
|
1005 |
-
model.selection_mask_ = np.ones(n_features_in, dtype=
|
1006 |
else:
|
1007 |
model.selection_mask_ = selection_mask
|
1008 |
|
@@ -1029,7 +1061,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1029 |
all_equations = equations
|
1030 |
|
1031 |
for i, equations in enumerate(all_equations):
|
1032 |
-
selected = [""
|
1033 |
chosen_row = idx_model_selection(equations, self.model_selection)
|
1034 |
selected[chosen_row] = ">>>>"
|
1035 |
repr_equations = pd.DataFrame(
|
@@ -1129,10 +1161,12 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1129 |
|
1130 |
@property
|
1131 |
def julia_options_(self):
|
|
|
1132 |
return jl_deserialize(self.julia_options_stream_)
|
1133 |
|
1134 |
@property
|
1135 |
def julia_state_(self):
|
|
|
1136 |
return jl_deserialize(self.julia_state_stream_)
|
1137 |
|
1138 |
@property
|
@@ -1145,7 +1179,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1145 |
)
|
1146 |
return self.julia_state_
|
1147 |
|
1148 |
-
def get_best(self, index=None):
|
1149 |
"""
|
1150 |
Get best equation using `model_selection`.
|
1151 |
|
@@ -1168,8 +1202,6 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1168 |
Raised when an invalid model selection strategy is provided.
|
1169 |
"""
|
1170 |
check_is_fitted(self, attributes=["equations_"])
|
1171 |
-
if self.equations_ is None:
|
1172 |
-
raise ValueError("No equations have been generated yet.")
|
1173 |
|
1174 |
if index is not None:
|
1175 |
if isinstance(self.equations_, list):
|
@@ -1177,16 +1209,21 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1177 |
index, list
|
1178 |
), "With multiple output features, index must be a list."
|
1179 |
return [eq.iloc[i] for eq, i in zip(self.equations_, index)]
|
1180 |
-
|
|
|
|
|
1181 |
|
1182 |
if isinstance(self.equations_, list):
|
1183 |
return [
|
1184 |
-
eq.
|
1185 |
for eq in self.equations_
|
1186 |
]
|
1187 |
-
|
1188 |
-
|
1189 |
-
|
|
|
|
|
|
|
1190 |
|
1191 |
def _setup_equation_file(self):
|
1192 |
"""
|
@@ -1211,7 +1248,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1211 |
self.equation_file_ = self.equation_file
|
1212 |
self.equation_file_contents_ = None
|
1213 |
|
1214 |
-
def
|
1215 |
"""
|
1216 |
Ensure parameters passed at initialization are valid.
|
1217 |
|
@@ -1269,59 +1306,48 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1269 |
f"PySR currently only supports the following optimizer algorithms: {VALID_OPTIMIZER_ALGORITHMS}"
|
1270 |
)
|
1271 |
|
1272 |
-
|
1273 |
-
|
1274 |
-
|
1275 |
-
|
1276 |
-
|
1277 |
-
|
1278 |
-
|
1279 |
-
|
1280 |
-
|
1281 |
-
|
1282 |
-
|
1283 |
-
|
1284 |
-
|
1285 |
-
|
1286 |
-
|
1287 |
-
|
1288 |
-
|
1289 |
-
parameter_value = default_value
|
1290 |
else:
|
1291 |
-
#
|
1292 |
-
|
1293 |
-
|
1294 |
-
)
|
1295 |
-
|
1296 |
-
elif parameter == "batch_size" and parameter_value < 1:
|
1297 |
-
warnings.warn(
|
1298 |
-
"Given `batch_size` must be greater than or equal to one. "
|
1299 |
-
"`batch_size` has been increased to equal one."
|
1300 |
-
)
|
1301 |
-
parameter_value = 1
|
1302 |
-
elif (
|
1303 |
-
parameter == "progress"
|
1304 |
-
and parameter_value
|
1305 |
-
and "buffer" not in sys.stdout.__dir__()
|
1306 |
-
):
|
1307 |
-
warnings.warn(
|
1308 |
-
"Note: it looks like you are running in Jupyter. "
|
1309 |
-
"The progress bar will be turned off."
|
1310 |
-
)
|
1311 |
-
parameter_value = False
|
1312 |
-
packed_modified_params[parameter] = parameter_value
|
1313 |
|
1314 |
assert (
|
1315 |
-
len(
|
1316 |
-
|
1317 |
-
|
1318 |
-
)
|
1319 |
|
1320 |
-
return
|
1321 |
|
1322 |
def _validate_and_set_fit_params(
|
1323 |
self, X, y, Xresampled, weights, variable_names, X_units, y_units
|
1324 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1325 |
"""
|
1326 |
Validate the parameters passed to the :term`fit` method.
|
1327 |
|
@@ -1341,7 +1367,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1341 |
Weight array of the same shape as `y`.
|
1342 |
Each element is how to weight the mean-square-error loss
|
1343 |
for that particular element of y.
|
1344 |
-
variable_names :
|
1345 |
Names of each variable in the training dataset, `X`.
|
1346 |
X_units : list[str] of length n_features
|
1347 |
Units of each variable in the training dataset, `X`.
|
@@ -1397,7 +1423,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1397 |
if weights is not None:
|
1398 |
weights = check_array(weights, ensure_2d=False)
|
1399 |
check_consistent_length(weights, y)
|
1400 |
-
X, y = self.
|
1401 |
self.feature_names_in_ = _safe_check_feature_names_in(
|
1402 |
self, variable_names, generate_names=False
|
1403 |
)
|
@@ -1407,10 +1433,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1407 |
self.display_feature_names_in_ = np.array(
|
1408 |
[f"x{_subscriptify(i)}" for i in range(X.shape[1])]
|
1409 |
)
|
|
|
1410 |
else:
|
1411 |
self.display_feature_names_in_ = self.feature_names_in_
|
1412 |
-
|
1413 |
-
variable_names = self.feature_names_in_
|
1414 |
|
1415 |
# Handle multioutput data
|
1416 |
if len(y.shape) == 1 or (len(y.shape) == 2 and y.shape[1] == 1):
|
@@ -1425,8 +1451,23 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1425 |
|
1426 |
return X, y, Xresampled, weights, variable_names, X_units, y_units
|
1427 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1428 |
def _pre_transform_training_data(
|
1429 |
-
self,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1430 |
):
|
1431 |
"""
|
1432 |
Transform the training data before fitting the symbolic regressor.
|
@@ -1435,12 +1476,12 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1435 |
|
1436 |
Parameters
|
1437 |
----------
|
1438 |
-
X : ndarray
|
1439 |
Training data of shape (n_samples, n_features).
|
1440 |
-
y : ndarray
|
1441 |
Target values of shape (n_samples,) or (n_samples, n_targets).
|
1442 |
Will be cast to X's dtype if necessary.
|
1443 |
-
Xresampled : ndarray |
|
1444 |
Resampled training data, of shape `(n_resampled, n_features)`,
|
1445 |
used for denoising.
|
1446 |
variable_names : list[str]
|
@@ -1478,24 +1519,35 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1478 |
"""
|
1479 |
# Feature selection transformation
|
1480 |
if self.select_k_features:
|
1481 |
-
|
1482 |
X, y, self.select_k_features, random_state=random_state
|
1483 |
)
|
1484 |
-
X = X[:,
|
1485 |
|
1486 |
if Xresampled is not None:
|
1487 |
-
Xresampled = Xresampled[:,
|
1488 |
|
1489 |
# Reduce variable_names to selection
|
1490 |
-
variable_names =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1491 |
|
1492 |
if X_units is not None:
|
1493 |
-
X_units =
|
|
|
|
|
|
|
1494 |
self.X_units_ = copy.deepcopy(X_units)
|
1495 |
|
1496 |
# Re-perform data validation and feature name updating
|
1497 |
-
X, y = self.
|
1498 |
# Update feature names with selected variable names
|
|
|
1499 |
self.feature_names_in_ = _check_feature_names_in(self, variable_names)
|
1500 |
self.display_feature_names_in_ = self.feature_names_in_
|
1501 |
print(f"Using features {self.feature_names_in_}")
|
@@ -1511,20 +1563,27 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1511 |
|
1512 |
return X, y, variable_names, X_units, y_units
|
1513 |
|
1514 |
-
def _run(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1515 |
"""
|
1516 |
Run the symbolic regression fitting process on the julia backend.
|
1517 |
|
1518 |
Parameters
|
1519 |
----------
|
1520 |
-
X : ndarray
|
1521 |
Training data of shape `(n_samples, n_features)`.
|
1522 |
-
y : ndarray
|
1523 |
Target values of shape `(n_samples,)` or `(n_samples, n_targets)`.
|
1524 |
Will be cast to `X`'s dtype if necessary.
|
1525 |
-
|
1526 |
-
|
1527 |
-
weights : ndarray |
|
1528 |
Weight array of the same shape as `y`.
|
1529 |
Each element is how to weight the mean-square-error loss
|
1530 |
for that particular element of y.
|
@@ -1543,24 +1602,26 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1543 |
"""
|
1544 |
# Need to be global as we don't want to recreate/reinstate julia for
|
1545 |
# every new instance of PySRRegressor
|
1546 |
-
global
|
1547 |
|
1548 |
# These are the parameters which may be modified from the ones
|
1549 |
# specified in init, so we define them here locally:
|
1550 |
-
binary_operators =
|
1551 |
-
unary_operators =
|
1552 |
-
maxdepth =
|
1553 |
-
constraints =
|
|
|
|
|
|
|
|
|
|
|
|
|
1554 |
nested_constraints = self.nested_constraints
|
1555 |
complexity_of_operators = self.complexity_of_operators
|
1556 |
-
multithreading = mutated_params["multithreading"]
|
1557 |
cluster_manager = self.cluster_manager
|
1558 |
-
batch_size = mutated_params["batch_size"]
|
1559 |
-
update_verbosity = mutated_params["update_verbosity"]
|
1560 |
-
progress = mutated_params["progress"]
|
1561 |
|
1562 |
# Start julia backend processes
|
1563 |
-
if not
|
1564 |
print("Compiling Julia backend...")
|
1565 |
|
1566 |
if cluster_manager is not None:
|
@@ -1599,6 +1660,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1599 |
complexity_of_operators_str += f"({k}) => {v}, "
|
1600 |
complexity_of_operators_str += ")"
|
1601 |
complexity_of_operators = jl.seval(complexity_of_operators_str)
|
|
|
1602 |
|
1603 |
custom_loss = jl.seval(
|
1604 |
str(self.elementwise_loss)
|
@@ -1635,11 +1697,25 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1635 |
optimize=self.weight_optimize,
|
1636 |
)
|
1637 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1638 |
# Call to Julia backend.
|
1639 |
# See https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/OptionsStruct.jl
|
1640 |
options = SymbolicRegression.Options(
|
1641 |
-
binary_operators=
|
1642 |
-
unary_operators=
|
1643 |
bin_constraints=jl_array(bin_constraints),
|
1644 |
una_constraints=jl_array(una_constraints),
|
1645 |
complexity_of_operators=complexity_of_operators,
|
@@ -1671,9 +1747,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1671 |
fraction_replaced_hof=self.fraction_replaced_hof,
|
1672 |
should_simplify=self.should_simplify,
|
1673 |
should_optimize_constants=self.should_optimize_constants,
|
1674 |
-
warmup_maxsize_by=
|
1675 |
-
0.0 if self.warmup_maxsize_by is None else self.warmup_maxsize_by
|
1676 |
-
),
|
1677 |
use_frequency=self.use_frequency,
|
1678 |
use_frequency_in_tournament=self.use_frequency_in_tournament,
|
1679 |
adaptive_parsimony_scaling=self.adaptive_parsimony_scaling,
|
@@ -1780,7 +1854,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1780 |
if self.delete_tempfiles:
|
1781 |
shutil.rmtree(self.tempdir_)
|
1782 |
|
1783 |
-
|
1784 |
|
1785 |
return self
|
1786 |
|
@@ -1790,9 +1864,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1790 |
y,
|
1791 |
Xresampled=None,
|
1792 |
weights=None,
|
1793 |
-
variable_names: Optional[
|
1794 |
-
X_units: Optional[
|
1795 |
-
y_units: Optional[
|
1796 |
) -> "PySRRegressor":
|
1797 |
"""
|
1798 |
Search for equations to fit the dataset and store them in `self.equations_`.
|
@@ -1854,12 +1928,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1854 |
self.X_units_ = None
|
1855 |
self.y_units_ = None
|
1856 |
|
1857 |
-
random_state = check_random_state(self.random_state) # For np random
|
1858 |
-
seed = random_state.get_state()[1][0] # For julia random
|
1859 |
-
|
1860 |
self._setup_equation_file()
|
1861 |
|
1862 |
-
|
1863 |
|
1864 |
(
|
1865 |
X,
|
@@ -1884,6 +1955,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1884 |
"More datapoints will lower the search speed."
|
1885 |
)
|
1886 |
|
|
|
|
|
|
|
1887 |
# Pre transformations (feature selection and denoising)
|
1888 |
X, y, variable_names, X_units, y_units = self._pre_transform_training_data(
|
1889 |
X, y, Xresampled, variable_names, X_units, y_units, random_state
|
@@ -1925,7 +1999,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1925 |
self._checkpoint()
|
1926 |
|
1927 |
# Perform the search:
|
1928 |
-
self._run(X, y,
|
1929 |
|
1930 |
# Then, after fit, we save again, so the pickle file contains
|
1931 |
# the equations:
|
@@ -1934,7 +2008,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1934 |
|
1935 |
return self
|
1936 |
|
1937 |
-
def refresh(self, checkpoint_file=None):
|
1938 |
"""
|
1939 |
Update self.equations_ with any new options passed.
|
1940 |
|
@@ -1943,11 +2017,11 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1943 |
|
1944 |
Parameters
|
1945 |
----------
|
1946 |
-
checkpoint_file : str
|
1947 |
Path to checkpoint hall of fame file to be loaded.
|
1948 |
The default will use the set `equation_file_`.
|
1949 |
"""
|
1950 |
-
if checkpoint_file:
|
1951 |
self.equation_file_ = checkpoint_file
|
1952 |
self.equation_file_contents_ = None
|
1953 |
check_is_fitted(self, attributes=["equation_file_"])
|
@@ -1999,7 +2073,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1999 |
if self.selection_mask_ is not None:
|
2000 |
# RangeIndex enforces column order allowing columns to
|
2001 |
# be correctly filtered with self.selection_mask_
|
2002 |
-
X = X.
|
2003 |
X.columns = self.feature_names_in_
|
2004 |
# Without feature information, CallableEquation/lambda_format equations
|
2005 |
# require that the column order of X matches that of the X used during
|
@@ -2009,14 +2083,16 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
2009 |
# reordered/reindexed to match those of the transformed (denoised and
|
2010 |
# feature selected) X in fit.
|
2011 |
X = X.reindex(columns=self.feature_names_in_)
|
2012 |
-
X = self.
|
2013 |
|
2014 |
try:
|
2015 |
-
if
|
|
|
2016 |
return np.stack(
|
2017 |
[eq["lambda_format"](X) for eq in best_equation], axis=1
|
2018 |
)
|
2019 |
-
|
|
|
2020 |
except Exception as error:
|
2021 |
raise ValueError(
|
2022 |
"Failed to evaluate the expression. "
|
@@ -2046,9 +2122,11 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
2046 |
"""
|
2047 |
self.refresh()
|
2048 |
best_equation = self.get_best(index=index)
|
2049 |
-
if
|
|
|
2050 |
return [eq["sympy_format"] for eq in best_equation]
|
2051 |
-
|
|
|
2052 |
|
2053 |
def latex(self, index=None, precision=3):
|
2054 |
"""
|
@@ -2108,9 +2186,11 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
2108 |
self.set_params(output_jax_format=True)
|
2109 |
self.refresh()
|
2110 |
best_equation = self.get_best(index=index)
|
2111 |
-
if
|
|
|
2112 |
return [eq["jax_format"] for eq in best_equation]
|
2113 |
-
|
|
|
2114 |
|
2115 |
def pytorch(self, index=None):
|
2116 |
"""
|
@@ -2138,9 +2218,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
2138 |
self.set_params(output_torch_format=True)
|
2139 |
self.refresh()
|
2140 |
best_equation = self.get_best(index=index)
|
2141 |
-
if
|
2142 |
return [eq["torch_format"] for eq in best_equation]
|
2143 |
-
|
|
|
2144 |
|
2145 |
def _read_equation_file(self):
|
2146 |
"""Read the hall of fame file created by `SymbolicRegression.jl`."""
|
@@ -2239,10 +2320,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
2239 |
lastComplexity = 0
|
2240 |
sympy_format = []
|
2241 |
lambda_format = []
|
2242 |
-
|
2243 |
-
|
2244 |
-
if self.output_torch_format:
|
2245 |
-
torch_format = []
|
2246 |
|
2247 |
for _, eqn_row in output.iterrows():
|
2248 |
eqn = pysr2sympy(
|
@@ -2354,7 +2433,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
2354 |
"""
|
2355 |
self.refresh()
|
2356 |
|
2357 |
-
if self.
|
2358 |
if indices is not None:
|
2359 |
assert isinstance(indices, list)
|
2360 |
assert isinstance(indices[0], list)
|
@@ -2363,7 +2442,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
2363 |
table_string = sympy2multilatextable(
|
2364 |
self.equations_, indices=indices, precision=precision, columns=columns
|
2365 |
)
|
2366 |
-
|
2367 |
if indices is not None:
|
2368 |
assert isinstance(indices, list)
|
2369 |
assert isinstance(indices[0], int)
|
@@ -2371,15 +2450,13 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
2371 |
table_string = sympy2latextable(
|
2372 |
self.equations_, indices=indices, precision=precision, columns=columns
|
2373 |
)
|
|
|
|
|
|
|
|
|
|
|
2374 |
|
2375 |
-
|
2376 |
-
r"\usepackage{breqn}",
|
2377 |
-
r"\usepackage{booktabs}",
|
2378 |
-
"",
|
2379 |
-
"...",
|
2380 |
-
"",
|
2381 |
-
]
|
2382 |
-
return "\n".join(preamble_string + [table_string])
|
2383 |
|
2384 |
|
2385 |
def idx_model_selection(equations: pd.DataFrame, model_selection: str):
|
@@ -2397,3 +2474,30 @@ def idx_model_selection(equations: pd.DataFrame, model_selection: str):
|
|
2397 |
f"{model_selection} is not a valid model selection strategy."
|
2398 |
)
|
2399 |
return chosen_idx
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
import sys
|
9 |
import tempfile
|
10 |
import warnings
|
11 |
+
from dataclasses import dataclass, fields
|
12 |
from datetime import datetime
|
13 |
from io import StringIO
|
14 |
from multiprocessing import cpu_count
|
15 |
from pathlib import Path
|
16 |
+
from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union, cast
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
import numpy as np
|
19 |
import pandas as pd
|
20 |
+
from numpy import ndarray
|
21 |
+
from numpy.typing import NDArray
|
22 |
from sklearn.base import BaseEstimator, MultiOutputMixin, RegressorMixin
|
23 |
from sklearn.utils import check_array, check_consistent_length, check_random_state
|
24 |
+
from sklearn.utils.validation import _check_feature_names_in # type: ignore
|
25 |
+
from sklearn.utils.validation import check_is_fitted
|
26 |
|
27 |
from .denoising import denoise, multi_denoise
|
28 |
from .deprecated import DEPRECATED_KWARGS
|
29 |
from .export_jax import sympy2jax
|
30 |
+
from .export_latex import (
|
31 |
+
sympy2latex,
|
32 |
+
sympy2latextable,
|
33 |
+
sympy2multilatextable,
|
34 |
+
with_preamble,
|
35 |
+
)
|
36 |
from .export_numpy import sympy2numpy
|
37 |
from .export_sympy import assert_valid_sympy_symbol, create_sympy_symbols, pysr2sympy
|
38 |
from .export_torch import sympy2torch
|
|
|
44 |
_load_cluster_manager,
|
45 |
jl_array,
|
46 |
jl_deserialize,
|
47 |
+
jl_is_function,
|
48 |
jl_serialize,
|
49 |
)
|
50 |
from .julia_import import SymbolicRegression, jl
|
51 |
from .utils import (
|
52 |
+
ArrayLike,
|
53 |
+
PathLike,
|
54 |
_csv_filename_to_pkl_filename,
|
55 |
_preprocess_julia_floats,
|
56 |
_safe_check_feature_names_in,
|
57 |
_subscriptify,
|
58 |
)
|
59 |
|
60 |
+
ALREADY_RAN = False
|
61 |
|
62 |
|
63 |
def _process_constraints(binary_operators, unary_operators, constraints):
|
|
|
185 |
VALID_OPTIMIZER_ALGORITHMS = ["BFGS", "NelderMead"]
|
186 |
|
187 |
|
188 |
+
@dataclass
|
189 |
+
class _DynamicallySetParams:
|
190 |
+
"""Defines some parameters that are set at runtime."""
|
191 |
+
|
192 |
+
binary_operators: List[str]
|
193 |
+
unary_operators: List[str]
|
194 |
+
maxdepth: int
|
195 |
+
constraints: Dict[str, str]
|
196 |
+
multithreading: bool
|
197 |
+
batch_size: int
|
198 |
+
update_verbosity: int
|
199 |
+
progress: bool
|
200 |
+
warmup_maxsize_by: float
|
201 |
+
|
202 |
+
|
203 |
class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
204 |
"""
|
205 |
High-performance symbolic regression algorithm.
|
|
|
628 |
Units of each variable in the training dataset, `y`.
|
629 |
nout_ : int
|
630 |
Number of output dimensions.
|
631 |
+
selection_mask_ : ndarray of shape (`n_features_in_`,)
|
632 |
+
Mask of which features of `X` to use when `select_k_features` is set.
|
|
|
633 |
tempdir_ : Path
|
634 |
Path to the temporary equations directory.
|
635 |
+
equation_file_ : Union[str, Path]
|
636 |
Output equation file name produced by the julia backend.
|
637 |
julia_state_stream_ : ndarray
|
638 |
The serialized state for the julia SymbolicRegression.jl backend (after fitting),
|
639 |
stored as an array of uint8, produced by Julia's Serialization.serialize function.
|
|
|
|
|
640 |
julia_options_stream_ : ndarray
|
641 |
The serialized julia options, stored as an array of uint8,
|
|
|
|
|
642 |
equation_file_contents_ : list[pandas.DataFrame]
|
643 |
Contents of the equation file output by the Julia backend.
|
644 |
show_pickle_warnings_ : bool
|
|
|
685 |
```
|
686 |
"""
|
687 |
|
688 |
+
equations_: Union[pd.DataFrame, List[pd.DataFrame], None]
|
689 |
+
n_features_in_: int
|
690 |
+
feature_names_in_: ArrayLike[str]
|
691 |
+
display_feature_names_in_: ArrayLike[str]
|
692 |
+
X_units_: Union[ArrayLike[str], None]
|
693 |
+
y_units_: Union[str, ArrayLike[str], None]
|
694 |
+
nout_: int
|
695 |
+
selection_mask_: Union[NDArray[np.bool_], None]
|
696 |
+
tempdir_: Path
|
697 |
+
equation_file_: PathLike
|
698 |
+
julia_state_stream_: Union[NDArray[np.uint8], None]
|
699 |
+
julia_options_stream_: Union[NDArray[np.uint8], None]
|
700 |
+
equation_file_contents_: Union[List[pd.DataFrame], None]
|
701 |
+
show_pickle_warnings_: bool
|
702 |
+
|
703 |
def __init__(
|
704 |
self,
|
705 |
model_selection: Literal["best", "accuracy", "score"] = "best",
|
|
|
932 |
@classmethod
|
933 |
def from_file(
|
934 |
cls,
|
935 |
+
equation_file: PathLike,
|
936 |
*,
|
937 |
+
binary_operators: Optional[List[str]] = None,
|
938 |
+
unary_operators: Optional[List[str]] = None,
|
939 |
+
n_features_in: Optional[int] = None,
|
940 |
+
feature_names_in: Optional[ArrayLike[str]] = None,
|
941 |
+
selection_mask: Optional[NDArray[np.bool_]] = None,
|
942 |
+
nout: int = 1,
|
943 |
**pysr_kwargs,
|
944 |
):
|
945 |
"""
|
|
|
947 |
|
948 |
Parameters
|
949 |
----------
|
950 |
+
equation_file : str or Path
|
951 |
Path to a pickle file containing a saved model, or a csv file
|
952 |
containing equations.
|
953 |
binary_operators : list[str]
|
|
|
962 |
feature_names_in : list[str]
|
963 |
Names of the features passed to the model.
|
964 |
Not needed if loading from a pickle file.
|
965 |
+
selection_mask : NDArray[np.bool_]
|
966 |
+
If using `select_k_features`, you must pass `model.selection_mask_` here.
|
967 |
Not needed if loading from a pickle file.
|
968 |
nout : int
|
969 |
Number of outputs of the model.
|
|
|
1014 |
|
1015 |
# TODO: copy .bkup file if exists.
|
1016 |
model = cls(
|
1017 |
+
equation_file=str(equation_file),
|
1018 |
binary_operators=binary_operators,
|
1019 |
unary_operators=unary_operators,
|
1020 |
**pysr_kwargs,
|
|
|
1034 |
model.display_feature_names_in_ = feature_names_in
|
1035 |
|
1036 |
if selection_mask is None:
|
1037 |
+
model.selection_mask_ = np.ones(n_features_in, dtype=np.bool_)
|
1038 |
else:
|
1039 |
model.selection_mask_ = selection_mask
|
1040 |
|
|
|
1061 |
all_equations = equations
|
1062 |
|
1063 |
for i, equations in enumerate(all_equations):
|
1064 |
+
selected = pd.Series([""] * len(equations), index=equations.index)
|
1065 |
chosen_row = idx_model_selection(equations, self.model_selection)
|
1066 |
selected[chosen_row] = ">>>>"
|
1067 |
repr_equations = pd.DataFrame(
|
|
|
1161 |
|
1162 |
@property
|
1163 |
def julia_options_(self):
|
1164 |
+
"""The deserialized julia options."""
|
1165 |
return jl_deserialize(self.julia_options_stream_)
|
1166 |
|
1167 |
@property
|
1168 |
def julia_state_(self):
|
1169 |
+
"""The deserialized state."""
|
1170 |
return jl_deserialize(self.julia_state_stream_)
|
1171 |
|
1172 |
@property
|
|
|
1179 |
)
|
1180 |
return self.julia_state_
|
1181 |
|
1182 |
+
def get_best(self, index=None) -> Union[pd.Series, List[pd.Series]]:
|
1183 |
"""
|
1184 |
Get best equation using `model_selection`.
|
1185 |
|
|
|
1202 |
Raised when an invalid model selection strategy is provided.
|
1203 |
"""
|
1204 |
check_is_fitted(self, attributes=["equations_"])
|
|
|
|
|
1205 |
|
1206 |
if index is not None:
|
1207 |
if isinstance(self.equations_, list):
|
|
|
1209 |
index, list
|
1210 |
), "With multiple output features, index must be a list."
|
1211 |
return [eq.iloc[i] for eq, i in zip(self.equations_, index)]
|
1212 |
+
else:
|
1213 |
+
equations_ = cast(pd.DataFrame, self.equations_)
|
1214 |
+
return cast(pd.Series, equations_.iloc[index])
|
1215 |
|
1216 |
if isinstance(self.equations_, list):
|
1217 |
return [
|
1218 |
+
cast(pd.Series, eq.loc[idx_model_selection(eq, self.model_selection)])
|
1219 |
for eq in self.equations_
|
1220 |
]
|
1221 |
+
else:
|
1222 |
+
equations_ = cast(pd.DataFrame, self.equations_)
|
1223 |
+
return cast(
|
1224 |
+
pd.Series,
|
1225 |
+
equations_.loc[idx_model_selection(equations_, self.model_selection)],
|
1226 |
+
)
|
1227 |
|
1228 |
def _setup_equation_file(self):
|
1229 |
"""
|
|
|
1248 |
self.equation_file_ = self.equation_file
|
1249 |
self.equation_file_contents_ = None
|
1250 |
|
1251 |
+
def _validate_and_modify_params(self) -> _DynamicallySetParams:
|
1252 |
"""
|
1253 |
Ensure parameters passed at initialization are valid.
|
1254 |
|
|
|
1306 |
f"PySR currently only supports the following optimizer algorithms: {VALID_OPTIMIZER_ALGORITHMS}"
|
1307 |
)
|
1308 |
|
1309 |
+
param_container = _DynamicallySetParams(
|
1310 |
+
binary_operators=["+", "*", "-", "/"],
|
1311 |
+
unary_operators=[],
|
1312 |
+
maxdepth=self.maxsize,
|
1313 |
+
constraints={},
|
1314 |
+
multithreading=self.procs != 0 and self.cluster_manager is None,
|
1315 |
+
batch_size=1,
|
1316 |
+
update_verbosity=int(self.verbosity),
|
1317 |
+
progress=self.progress,
|
1318 |
+
warmup_maxsize_by=0.0,
|
1319 |
+
)
|
1320 |
+
|
1321 |
+
for param_name in map(lambda x: x.name, fields(_DynamicallySetParams)):
|
1322 |
+
user_param_value = getattr(self, param_name)
|
1323 |
+
if user_param_value is None:
|
1324 |
+
# Leave as the default in DynamicallySetParams
|
1325 |
+
...
|
|
|
1326 |
else:
|
1327 |
+
# If user has specified it, we will override the default.
|
1328 |
+
# However, there are some special cases to mutate it:
|
1329 |
+
new_param_value = _mutate_parameter(param_name, user_param_value)
|
1330 |
+
setattr(param_container, param_name, new_param_value)
|
1331 |
+
# TODO: This should just be part of the __init__ of _DynamicallySetParams
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1332 |
|
1333 |
assert (
|
1334 |
+
len(param_container.binary_operators) > 0
|
1335 |
+
or len(param_container.unary_operators) > 0
|
1336 |
+
), "At least one operator must be provided."
|
|
|
1337 |
|
1338 |
+
return param_container
|
1339 |
|
1340 |
def _validate_and_set_fit_params(
|
1341 |
self, X, y, Xresampled, weights, variable_names, X_units, y_units
|
1342 |
+
) -> Tuple[
|
1343 |
+
ndarray,
|
1344 |
+
ndarray,
|
1345 |
+
Optional[ndarray],
|
1346 |
+
Optional[ndarray],
|
1347 |
+
ArrayLike[str],
|
1348 |
+
Optional[ArrayLike[str]],
|
1349 |
+
Optional[Union[str, ArrayLike[str]]],
|
1350 |
+
]:
|
1351 |
"""
|
1352 |
Validate the parameters passed to the :term`fit` method.
|
1353 |
|
|
|
1367 |
Weight array of the same shape as `y`.
|
1368 |
Each element is how to weight the mean-square-error loss
|
1369 |
for that particular element of y.
|
1370 |
+
variable_names : ndarray of length n_features
|
1371 |
Names of each variable in the training dataset, `X`.
|
1372 |
X_units : list[str] of length n_features
|
1373 |
Units of each variable in the training dataset, `X`.
|
|
|
1423 |
if weights is not None:
|
1424 |
weights = check_array(weights, ensure_2d=False)
|
1425 |
check_consistent_length(weights, y)
|
1426 |
+
X, y = self._validate_data_X_y(X, y)
|
1427 |
self.feature_names_in_ = _safe_check_feature_names_in(
|
1428 |
self, variable_names, generate_names=False
|
1429 |
)
|
|
|
1433 |
self.display_feature_names_in_ = np.array(
|
1434 |
[f"x{_subscriptify(i)}" for i in range(X.shape[1])]
|
1435 |
)
|
1436 |
+
variable_names = self.feature_names_in_
|
1437 |
else:
|
1438 |
self.display_feature_names_in_ = self.feature_names_in_
|
1439 |
+
variable_names = self.feature_names_in_
|
|
|
1440 |
|
1441 |
# Handle multioutput data
|
1442 |
if len(y.shape) == 1 or (len(y.shape) == 2 and y.shape[1] == 1):
|
|
|
1451 |
|
1452 |
return X, y, Xresampled, weights, variable_names, X_units, y_units
|
1453 |
|
1454 |
+
def _validate_data_X_y(self, X, y) -> Tuple[ndarray, ndarray]:
|
1455 |
+
raw_out = self._validate_data(X=X, y=y, reset=True, multi_output=True) # type: ignore
|
1456 |
+
return cast(Tuple[ndarray, ndarray], raw_out)
|
1457 |
+
|
1458 |
+
def _validate_data_X(self, X) -> Tuple[ndarray]:
|
1459 |
+
raw_out = self._validate_data(X=X, reset=False) # type: ignore
|
1460 |
+
return cast(Tuple[ndarray], raw_out)
|
1461 |
+
|
1462 |
def _pre_transform_training_data(
|
1463 |
+
self,
|
1464 |
+
X: ndarray,
|
1465 |
+
y: ndarray,
|
1466 |
+
Xresampled: Union[ndarray, None],
|
1467 |
+
variable_names: ArrayLike[str],
|
1468 |
+
X_units: Union[ArrayLike[str], None],
|
1469 |
+
y_units: Union[ArrayLike[str], str, None],
|
1470 |
+
random_state: np.random.RandomState,
|
1471 |
):
|
1472 |
"""
|
1473 |
Transform the training data before fitting the symbolic regressor.
|
|
|
1476 |
|
1477 |
Parameters
|
1478 |
----------
|
1479 |
+
X : ndarray
|
1480 |
Training data of shape (n_samples, n_features).
|
1481 |
+
y : ndarray
|
1482 |
Target values of shape (n_samples,) or (n_samples, n_targets).
|
1483 |
Will be cast to X's dtype if necessary.
|
1484 |
+
Xresampled : ndarray | None
|
1485 |
Resampled training data, of shape `(n_resampled, n_features)`,
|
1486 |
used for denoising.
|
1487 |
variable_names : list[str]
|
|
|
1519 |
"""
|
1520 |
# Feature selection transformation
|
1521 |
if self.select_k_features:
|
1522 |
+
selection_mask = run_feature_selection(
|
1523 |
X, y, self.select_k_features, random_state=random_state
|
1524 |
)
|
1525 |
+
X = X[:, selection_mask]
|
1526 |
|
1527 |
if Xresampled is not None:
|
1528 |
+
Xresampled = Xresampled[:, selection_mask]
|
1529 |
|
1530 |
# Reduce variable_names to selection
|
1531 |
+
variable_names = cast(
|
1532 |
+
ArrayLike[str],
|
1533 |
+
[
|
1534 |
+
variable_names[i]
|
1535 |
+
for i in range(len(variable_names))
|
1536 |
+
if selection_mask[i]
|
1537 |
+
],
|
1538 |
+
)
|
1539 |
|
1540 |
if X_units is not None:
|
1541 |
+
X_units = cast(
|
1542 |
+
ArrayLike[str],
|
1543 |
+
[X_units[i] for i in range(len(X_units)) if selection_mask[i]],
|
1544 |
+
)
|
1545 |
self.X_units_ = copy.deepcopy(X_units)
|
1546 |
|
1547 |
# Re-perform data validation and feature name updating
|
1548 |
+
X, y = self._validate_data_X_y(X, y)
|
1549 |
# Update feature names with selected variable names
|
1550 |
+
self.selection_mask_ = selection_mask
|
1551 |
self.feature_names_in_ = _check_feature_names_in(self, variable_names)
|
1552 |
self.display_feature_names_in_ = self.feature_names_in_
|
1553 |
print(f"Using features {self.feature_names_in_}")
|
|
|
1563 |
|
1564 |
return X, y, variable_names, X_units, y_units
|
1565 |
|
1566 |
+
def _run(
|
1567 |
+
self,
|
1568 |
+
X: ndarray,
|
1569 |
+
y: ndarray,
|
1570 |
+
runtime_params: _DynamicallySetParams,
|
1571 |
+
weights: Optional[ndarray],
|
1572 |
+
seed: int,
|
1573 |
+
):
|
1574 |
"""
|
1575 |
Run the symbolic regression fitting process on the julia backend.
|
1576 |
|
1577 |
Parameters
|
1578 |
----------
|
1579 |
+
X : ndarray
|
1580 |
Training data of shape `(n_samples, n_features)`.
|
1581 |
+
y : ndarray
|
1582 |
Target values of shape `(n_samples,)` or `(n_samples, n_targets)`.
|
1583 |
Will be cast to `X`'s dtype if necessary.
|
1584 |
+
runtime_params : DynamicallySetParams
|
1585 |
+
Dynamically set versions of some parameters passed in __init__.
|
1586 |
+
weights : ndarray | None
|
1587 |
Weight array of the same shape as `y`.
|
1588 |
Each element is how to weight the mean-square-error loss
|
1589 |
for that particular element of y.
|
|
|
1602 |
"""
|
1603 |
# Need to be global as we don't want to recreate/reinstate julia for
|
1604 |
# every new instance of PySRRegressor
|
1605 |
+
global ALREADY_RAN
|
1606 |
|
1607 |
# These are the parameters which may be modified from the ones
|
1608 |
# specified in init, so we define them here locally:
|
1609 |
+
binary_operators = runtime_params.binary_operators
|
1610 |
+
unary_operators = runtime_params.unary_operators
|
1611 |
+
maxdepth = runtime_params.maxdepth
|
1612 |
+
constraints = runtime_params.constraints
|
1613 |
+
multithreading = runtime_params.multithreading
|
1614 |
+
batch_size = runtime_params.batch_size
|
1615 |
+
update_verbosity = runtime_params.update_verbosity
|
1616 |
+
progress = runtime_params.progress
|
1617 |
+
warmup_maxsize_by = runtime_params.warmup_maxsize_by
|
1618 |
+
|
1619 |
nested_constraints = self.nested_constraints
|
1620 |
complexity_of_operators = self.complexity_of_operators
|
|
|
1621 |
cluster_manager = self.cluster_manager
|
|
|
|
|
|
|
1622 |
|
1623 |
# Start julia backend processes
|
1624 |
+
if not ALREADY_RAN and update_verbosity != 0:
|
1625 |
print("Compiling Julia backend...")
|
1626 |
|
1627 |
if cluster_manager is not None:
|
|
|
1660 |
complexity_of_operators_str += f"({k}) => {v}, "
|
1661 |
complexity_of_operators_str += ")"
|
1662 |
complexity_of_operators = jl.seval(complexity_of_operators_str)
|
1663 |
+
# TODO: Refactor this into helper function
|
1664 |
|
1665 |
custom_loss = jl.seval(
|
1666 |
str(self.elementwise_loss)
|
|
|
1697 |
optimize=self.weight_optimize,
|
1698 |
)
|
1699 |
|
1700 |
+
jl_binary_operators: List[Any] = []
|
1701 |
+
jl_unary_operators: List[Any] = []
|
1702 |
+
for input_list, output_list, name in [
|
1703 |
+
(binary_operators, jl_binary_operators, "binary"),
|
1704 |
+
(unary_operators, jl_unary_operators, "unary"),
|
1705 |
+
]:
|
1706 |
+
for op in input_list:
|
1707 |
+
jl_op = jl.seval(op)
|
1708 |
+
if not jl_is_function(jl_op):
|
1709 |
+
raise ValueError(
|
1710 |
+
f"When building `{name}_operators`, `'{op}'` did not return a Julia function"
|
1711 |
+
)
|
1712 |
+
output_list.append(jl_op)
|
1713 |
+
|
1714 |
# Call to Julia backend.
|
1715 |
# See https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/OptionsStruct.jl
|
1716 |
options = SymbolicRegression.Options(
|
1717 |
+
binary_operators=jl_array(jl_binary_operators, dtype=jl.Function),
|
1718 |
+
unary_operators=jl_array(jl_unary_operators, dtype=jl.Function),
|
1719 |
bin_constraints=jl_array(bin_constraints),
|
1720 |
una_constraints=jl_array(una_constraints),
|
1721 |
complexity_of_operators=complexity_of_operators,
|
|
|
1747 |
fraction_replaced_hof=self.fraction_replaced_hof,
|
1748 |
should_simplify=self.should_simplify,
|
1749 |
should_optimize_constants=self.should_optimize_constants,
|
1750 |
+
warmup_maxsize_by=warmup_maxsize_by,
|
|
|
|
|
1751 |
use_frequency=self.use_frequency,
|
1752 |
use_frequency_in_tournament=self.use_frequency_in_tournament,
|
1753 |
adaptive_parsimony_scaling=self.adaptive_parsimony_scaling,
|
|
|
1854 |
if self.delete_tempfiles:
|
1855 |
shutil.rmtree(self.tempdir_)
|
1856 |
|
1857 |
+
ALREADY_RAN = True
|
1858 |
|
1859 |
return self
|
1860 |
|
|
|
1864 |
y,
|
1865 |
Xresampled=None,
|
1866 |
weights=None,
|
1867 |
+
variable_names: Optional[ArrayLike[str]] = None,
|
1868 |
+
X_units: Optional[ArrayLike[str]] = None,
|
1869 |
+
y_units: Optional[Union[str, ArrayLike[str]]] = None,
|
1870 |
) -> "PySRRegressor":
|
1871 |
"""
|
1872 |
Search for equations to fit the dataset and store them in `self.equations_`.
|
|
|
1928 |
self.X_units_ = None
|
1929 |
self.y_units_ = None
|
1930 |
|
|
|
|
|
|
|
1931 |
self._setup_equation_file()
|
1932 |
|
1933 |
+
runtime_params = self._validate_and_modify_params()
|
1934 |
|
1935 |
(
|
1936 |
X,
|
|
|
1955 |
"More datapoints will lower the search speed."
|
1956 |
)
|
1957 |
|
1958 |
+
random_state = check_random_state(self.random_state) # For np random
|
1959 |
+
seed = cast(int, random_state.randint(0, 2**31 - 1)) # For julia random
|
1960 |
+
|
1961 |
# Pre transformations (feature selection and denoising)
|
1962 |
X, y, variable_names, X_units, y_units = self._pre_transform_training_data(
|
1963 |
X, y, Xresampled, variable_names, X_units, y_units, random_state
|
|
|
1999 |
self._checkpoint()
|
2000 |
|
2001 |
# Perform the search:
|
2002 |
+
self._run(X, y, runtime_params, weights=weights, seed=seed)
|
2003 |
|
2004 |
# Then, after fit, we save again, so the pickle file contains
|
2005 |
# the equations:
|
|
|
2008 |
|
2009 |
return self
|
2010 |
|
2011 | +    def refresh(self, checkpoint_file: Optional[PathLike] = None) -> None:
2012 |          """
2013 |          Update self.equations_ with any new options passed.
2014 | 
...
2017 | 
2018 |          Parameters
2019 |          ----------
2020 | +        checkpoint_file : str or Path
2021 |              Path to checkpoint hall of fame file to be loaded.
2022 |              The default will use the set `equation_file_`.
2023 |          """
2024 | +        if checkpoint_file is not None:
2025 |              self.equation_file_ = checkpoint_file
2026 |              self.equation_file_contents_ = None
2027 |          check_is_fitted(self, attributes=["equation_file_"])
...
2073 |          if self.selection_mask_ is not None:
2074 |              # RangeIndex enforces column order allowing columns to
2075 |              # be correctly filtered with self.selection_mask_
2076 | +            X = X[X.columns[self.selection_mask_]]
2077 |              X.columns = self.feature_names_in_
2078 |          # Without feature information, CallableEquation/lambda_format equations
2079 |          # require that the column order of X matches that of the X used during
...
2083 |          # reordered/reindexed to match those of the transformed (denoised and
2084 |          # feature selected) X in fit.
2085 |          X = X.reindex(columns=self.feature_names_in_)
2086 | +        X = self._validate_data_X(X)
2087 | 
2088 |          try:
2089 | +            if isinstance(best_equation, list):
2090 | +                assert self.nout_ > 1
2091 |                  return np.stack(
2092 |                      [eq["lambda_format"](X) for eq in best_equation], axis=1
2093 |                  )
2094 | +            else:
2095 | +                return best_equation["lambda_format"](X)
2096 |          except Exception as error:
2097 |              raise ValueError(
2098 |                  "Failed to evaluate the expression. "
...
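Since `refresh` now takes a `PathLike`, both `str` and `pathlib.Path` checkpoints work; e.g. (the path and filename here are hypothetical):

    from pathlib import Path

    # Given a fitted PySRRegressor instance `model`:
    model.refresh(checkpoint_file=Path("outputs") / "hall_of_fame.csv")
    print(model.equations_)  # re-parsed from the given checkpoint file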
2122 |          """
2123 |          self.refresh()
2124 |          best_equation = self.get_best(index=index)
2125 | +        if isinstance(best_equation, list):
2126 | +            assert self.nout_ > 1
2127 |              return [eq["sympy_format"] for eq in best_equation]
2128 | +        else:
2129 | +            return best_equation["sympy_format"]
2130 | 
2131 |      def latex(self, index=None, precision=3):
2132 |          """
...
2186 |          self.set_params(output_jax_format=True)
2187 |          self.refresh()
2188 |          best_equation = self.get_best(index=index)
2189 | +        if isinstance(best_equation, list):
2190 | +            assert self.nout_ > 1
2191 |              return [eq["jax_format"] for eq in best_equation]
2192 | +        else:
2193 | +            return best_equation["jax_format"]
2194 | 
2195 |      def pytorch(self, index=None):
2196 |          """
...
2218 |          self.set_params(output_torch_format=True)
2219 |          self.refresh()
2220 |          best_equation = self.get_best(index=index)
2221 | +        if isinstance(best_equation, list):
2222 |              return [eq["torch_format"] for eq in best_equation]
2223 | +        else:
2224 | +            return best_equation["torch_format"]
2225 | 
2226 |      def _read_equation_file(self):
2227 |          """Read the hall of fame file created by `SymbolicRegression.jl`."""
...
2320 |              lastComplexity = 0
2321 |              sympy_format = []
2322 |              lambda_format = []
2323 | +            jax_format = []
2324 | +            torch_format = []
2325 | 
2326 |              for _, eqn_row in output.iterrows():
2327 |                  eqn = pysr2sympy(
...
2433 |          """
2434 |          self.refresh()
2435 | 
2436 | +        if isinstance(self.equations_, list):
2437 |              if indices is not None:
2438 |                  assert isinstance(indices, list)
2439 |                  assert isinstance(indices[0], list)
...
2442 |              table_string = sympy2multilatextable(
2443 |                  self.equations_, indices=indices, precision=precision, columns=columns
2444 |              )
2445 | +        elif isinstance(self.equations_, pd.DataFrame):
2446 |              if indices is not None:
2447 |                  assert isinstance(indices, list)
2448 |                  assert isinstance(indices[0], int)
...
2450 |              table_string = sympy2latextable(
2451 |                  self.equations_, indices=indices, precision=precision, columns=columns
2452 |              )
2453 | +        else:
2454 | +            raise ValueError(
2455 | +                "Invalid type for equations_ to pass to `latex_table`. "
2456 | +                "Expected a DataFrame or a list of DataFrames."
2457 | +            )
2458 | 
2459 | +        return with_preamble(table_string)
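A note on the repeated `isinstance(best_equation, list)` branches above: `get_best` returns one row per output for multi-output fits, which is what the `assert self.nout_ > 1` guards encode. Roughly, for a fitted `model`:

    # y.shape == (n,)   -> model.nout_ == 1 -> model.sympy() is a single sympy expression
    # y.shape == (n, 2) -> model.nout_ == 2 -> model.sympy() is a list of two expressions
    expr_or_list = model.sympy()
    if isinstance(expr_or_list, list):
        for i, expr in enumerate(expr_or_list):
            print(f"output {i}: {expr}")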
2460 | 
2461 | 
2462 | def idx_model_selection(equations: pd.DataFrame, model_selection: str):
...
2474 |              f"{model_selection} is not a valid model selection strategy."
2475 |          )
2476 |      return chosen_idx
2477 | +
2478 | +
2479 | +def _mutate_parameter(param_name: str, param_value):
2480 | +    if param_name in ["binary_operators", "unary_operators"] and isinstance(
2481 | +        param_value, str
2482 | +    ):
2483 | +        return [param_value]
2484 | +
2485 | +    if param_name == "batch_size" and param_value < 1:
2486 | +        warnings.warn(
2487 | +            "Given `batch_size` must be greater than or equal to one. "
2488 | +            "`batch_size` has been increased to equal one."
2489 | +        )
2490 | +        return 1
2491 | +
2492 | +    if (
2493 | +        param_name == "progress"
2494 | +        and param_value == True
2495 | +        and "buffer" not in sys.stdout.__dir__()
2496 | +    ):
2497 | +        warnings.warn(
2498 | +            "Note: it looks like you are running in Jupyter. "
2499 | +            "The progress bar will be turned off."
2500 | +        )
2501 | +        return False
2502 | +
2503 | +    return param_value
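The new `_mutate_parameter` helper collects the small parameter fix-ups applied at fit time; its behavior, roughly (direct calls shown purely for illustration of the private helper):

    _mutate_parameter("binary_operators", "+")  # -> ["+"]: a bare string is wrapped in a list
    _mutate_parameter("batch_size", 0)          # -> 1, with a warning
    _mutate_parameter("progress", True)         # -> False (plus a warning) when sys.stdout has
                                                #    no "buffer" attribute, as in Jupyter
    _mutate_parameter("maxsize", 20)            # -> 20: anything else passes through unchanged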
pysr/test/test.py
CHANGED
@@ -431,6 +431,16 @@ class TestPipeline(unittest.TestCase):
         )
         np.testing.assert_allclose(model.predict(self.X), model3.predict(self.X))
 
+    def test_jl_function_error(self):
+        # TODO: Move this to better class
+        with self.assertRaises(ValueError) as cm:
+            PySRRegressor(unary_operators=["1"]).fit([[1]], [1])
+
+        self.assertIn(
+            "When building `unary_operators`, `'1'` did not return a Julia function",
+            str(cm.exception),
+        )
+
 
 def manually_create_model(equations, feature_names=None):
     if feature_names is None:
@@ -526,7 +536,7 @@ class TestFeatureSelection(unittest.TestCase):
         X = self.rstate.randn(20000, 5)
         y = X[:, 2] ** 2 + X[:, 3] ** 2
         selected = run_feature_selection(X, y, select_k_features=2)
-
+        np.testing.assert_array_equal(selected, [False, False, True, True, False])
 
     def test_feature_selection_handler(self):
         X = self.rstate.randn(20000, 5)
@@ -538,8 +548,8 @@ class TestFeatureSelection(unittest.TestCase):
             variable_names=var_names,
             y=y,
         )
-
-        selected_var_names = [var_names[i] for i in selection]
+        np.testing.assert_array_equal(selection, [False, False, True, True, False])
+        selected_var_names = [var_names[i] for i in range(5) if selection[i]]
         self.assertEqual(set(selected_var_names), set("x2 x3".split(" ")))
         np.testing.assert_array_equal(
             np.sort(selected_X, axis=1), np.sort(X[:, [2, 3]], axis=1)
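The updated assertions pin down the new return convention: `run_feature_selection` now returns a boolean column mask rather than integer indices. A sketch of how a caller adapts:

    mask = run_feature_selection(X, y, select_k_features=2)
    # e.g. array([False, False, True, True, False])

    X_selected = X[:, mask]  # a boolean mask still indexes columns directly
    kept_names = [name for name, keep in zip(var_names, mask) if keep]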
pysr/utils.py
CHANGED
@@ -1,10 +1,18 @@
 import os
 import re
+from pathlib import Path
+from typing import Any, List, TypeVar, Union
 
-from sklearn.utils.validation import _check_feature_names_in
+from numpy import ndarray
+from sklearn.utils.validation import _check_feature_names_in  # type: ignore
 
+T = TypeVar("T", bound=Any)
 
-def _csv_filename_to_pkl_filename(csv_filename) -> str:
+ArrayLike = Union[ndarray, List[T]]
+PathLike = Union[str, Path]
+
+
+def _csv_filename_to_pkl_filename(csv_filename: PathLike) -> PathLike:
     if os.path.splitext(csv_filename)[1] == ".pkl":
         return csv_filename
 
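The `ArrayLike` and `PathLike` aliases introduced here are what the retyped signatures in `pysr/sr.py` refer to; a small usage sketch (the function itself is hypothetical):

    from pysr.utils import ArrayLike, PathLike

    def save_names(names: ArrayLike[str], path: PathLike) -> None:
        # Accepts a list or ndarray of names, and a str or pathlib.Path destination.
        with open(path, "w") as f:
            f.write("\n".join(str(name) for name in names))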
requirements.txt
CHANGED
@@ -5,4 +5,3 @@ scikit_learn>=1.0.0,<2.0.0
 juliacall==0.9.20
 click>=7.0.0,<9.0.0
 setuptools>=50.0.0
-typing_extensions>=4.0.0,<5.0.0; python_version < "3.8"