MilesCranmer committed
Commit: d65676d
2 parents: b9f583e a9212fa

Merge pull request #88 from MilesCranmer/sklearn

Files changed (16):
  1. .gitignore +2 -1
  2. Dockerfile +1 -2
  3. README.md +69 -45
  4. TODO.md +3 -0
  5. docs/examples.md +11 -11
  6. docs/operators.md +2 -4
  7. docs/options.md +69 -48
  8. docs/start.md +83 -32
  9. example.py +13 -14
  10. pydoc-markdown.yml +15 -1
  11. pysr/__init__.py +1 -1
  12. pysr/sr.py +990 -763
  13. setup.py +6 -3
  14. test/test.py +63 -60
  15. test/test_jax.py +10 -8
  16. test/test_torch.py +19 -10
.gitignore CHANGED
@@ -1,6 +1,7 @@
 .dataset*.jl
 .hyperparams*.jl
 *.csv
+*.csv.out*
 *.bkup
 performance*txt
 *.out
@@ -14,4 +15,4 @@ dist
 pysr/.vs/
 pysr.egg-info
 Manifest.toml
-workflow
+docs/
Dockerfile CHANGED
@@ -13,7 +13,7 @@ RUN apt-get update && apt-get upgrade -y && apt-get install -y \
 make build-essential libssl-dev zlib1g-dev \
 libbz2-dev libreadline-dev libsqlite3-dev wget curl llvm \
 libncursesw5-dev xz-utils tk-dev libxml2-dev libxmlsec1-dev libffi-dev liblzma-dev \
-vim git \
+vim git tmux \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*

@@ -37,7 +37,6 @@ RUN pip3 install -r /pysr/requirements.txt
 # Install PySR:
 # We do a minimal copy so it doesn't need to rerun at every file change:
 ADD ./setup.py /pysr/setup.py
-ADD ./README.md /pysr/README.md
 ADD ./pysr/ /pysr/pysr/
 RUN pip3 install .
README.md CHANGED
@@ -74,71 +74,95 @@ Most common issues at this stage are solved
 by [tweaking the Julia package server](https://github.com/MilesCranmer/PySR/issues/27)
 to use up-to-date packages.
 
-## Docker
-
-You can also test out PySR in Docker, without
-installing it locally, by running the following command in
-the root directory of this repo:
-```bash
-docker build --pull --rm -f "Dockerfile" -t pysr "."
-```
-This builds an image called `pysr`. You can then run this with:
-```bash
-docker run -it --rm -v "$PWD:/data" pysr ipython
-```
-which will link the current directory to the container's `/data` directory
-and then launch ipython.
-
 # Quickstart
 
-Here is some demo code (also found in `example.py`)
+Let's create a PySR example. First, let's import
+numpy to generate some test data:
 ```python
 import numpy as np
-from pysr import pysr, best
 
-# Dataset
 X = 2 * np.random.randn(100, 5)
-y = 2 * np.cos(X[:, 3]) + X[:, 0] ** 2 - 2
+y = 2.5382 * np.cos(X[:, 3]) + X[:, 0] ** 2 - 0.5
+```
+We have created a dataset with 100 datapoints, with 5 features each.
+The relation we wish to model is $2.5382 \cos(x_3) + x_0^2 - 0.5$.
 
-# Learn equations
-equations = pysr(
-    X,
-    y,
+Now, let's create a PySR model and train it.
+PySR's main interface is in the style of scikit-learn:
+```python
+from pysr import PySRRegressor
+model = PySRRegressor(
     niterations=5,
+    populations=8,
     binary_operators=["+", "*"],
     unary_operators=[
         "cos",
         "exp",
-        "sin",  # Pre-defined library of operators (see docs)
-        "inv(x) = 1/x",  # Define your own operator! (Julia syntax)
+        "sin",
+        "inv(x) = 1/x",  # Custom operator (julia syntax)
     ],
+    model_selection="best",
+    loss="loss(x, y) = (x - y)^2",  # Custom loss function (julia syntax)
 )
-
-...# (you can use ctl-c to exit early)
-
-print(best(equations))
 ```
+This will set up the model for 5 iterations of the search code, which contains hundreds of thousands of mutations and equation evaluations.
 
-which gives:
-
+Let's train this model on our dataset:
 ```python
-x0**2 + 2.000016*cos(x3) - 1.9999845
+model.fit(X, y)
 ```
+Internally, this launches a Julia process which will do a multithreaded search for equations to fit the dataset.
 
-The second and additional calls of `pysr` will be significantly
-faster in startup time, since the first call to Julia will compile
-and cache functions from the symbolic regression backend.
+Equations will be printed during training, and once you are satisfied, you may
+quit early by hitting 'q' and then \<enter\>.
 
-One can also use `best_tex` to get the LaTeX form,
-or `best_callable` to get a function you can call.
-This uses a score which balances complexity and error;
-however, one can see the full list of equations with:
+After the model has been fit, you can run `model.predict(X)`
+to see the predictions on a given dataset.
+
+You may run:
 ```python
-print(equations)
+print(model)
 ```
-This is a pandas table, with additional columns:
+to print the learned equations:
+```python
+PySRRegressor.equations = [
+       pick      score                                           Equation           MSE  Complexity
+    0         0.000000                                          3.5082064  2.710828e+01           1
+    1         0.964260                                          (x0 * x0)  3.940544e+00           3
+    2         0.030096                          (-0.47978288 + (x0 * x0))  3.710349e+00           5
+    3         0.840770                              ((x0 * x0) + cos(x3))  1.600564e+00           6
+    4         0.928380                ((x0 * x0) + (2.5313091 * cos(x3)))  2.499724e-01           8
+    5  >>>> 13.956461  ((-0.49999997 + (x0 * x0)) + (2.5382001 * cos(...  1.885665e-13          10
+]
+```
+The arrow in the `pick` column indicates which equation is currently selected by your
+`model_selection` strategy for prediction.
+(You may change `model_selection` after `.fit(X, y)` as well.)
 
-- `MSE` - the mean square error of the formula
-- `score` - a metric akin to Occam's razor; you should use this to help select the "true" equation.
-- `sympy_format` - sympy equation.
-- `lambda_format` - a lambda function for that equation, that you can pass values through.
+`model.equations` is a pandas DataFrame containing all equations, including callable format
+(`lambda_format`),
+SymPy format (`sympy_format`), and even JAX and PyTorch format
+(both of which are differentiable).
+
+There are several other useful features, such as denoising (e.g., `denoise=True`)
+and feature selection (e.g., `select_k_features=3`).
+For a summary of features and options, see [this docs page](https://pysr.readthedocs.io/en/latest/docs/options/).
+You can see the full API at [this page](https://pysr.readthedocs.io/en/latest/docs/api-documentation/).
+
+# Docker
+
+You can also test out PySR in Docker, without
+installing it locally, by running the following command in
+the root directory of this repo:
+```bash
+docker build --pull --rm -f "Dockerfile" -t pysr "."
+```
+This builds an image called `pysr`. If you have issues building (for example, on Apple Silicon),
+you can emulate an architecture that works by including: `--platform linux/amd64`.
+You can then run this with:
+```bash
+docker run -it --rm -v "$PWD:/data" pysr ipython
+```
+which will link the current directory to the container's `/data` directory
+and then launch ipython.
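
The quickstart above ends at `print(model)`. As a worked follow-on, here is a minimal sketch of the prediction step it describes, using only the `PySRRegressor` methods named in this commit (`fit`, `predict`, `sympy`); the held-out data is illustrative:
```python
import numpy as np
from pysr import PySRRegressor

X = 2 * np.random.randn(100, 5)
y = 2.5382 * np.cos(X[:, 3]) + X[:, 0] ** 2 - 0.5

model = PySRRegressor(niterations=5, binary_operators=["+", "*"], unary_operators=["cos"])
model.fit(X, y)

X_new = 2 * np.random.randn(50, 5)  # illustrative held-out inputs
y_new = model.predict(X_new)        # evaluates the equation chosen by model_selection
print(model.sympy())                # SymPy form of the same equation
```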
TODO.md CHANGED
@@ -65,6 +65,9 @@
 
 - [ ] Automatically convert log, log10, log2, pow to the correct operators.
 - [ ] I think the simplification isn't working correctly (post-merging SymbolicUtils.)
+- [ ] Show demo of PySRRegressor. Fit equations, then show how to view equations.
+- [ ] Add "selected" column string to regular equations dict.
+- [ ] List "Loss" instead of "MSE"
 
 ## Feature ideas
 
docs/examples.md CHANGED
@@ -23,8 +23,9 @@ find the expression `2 cos(x3) + x0^2 - 2`.
 ```python
 X = 2 * np.random.randn(100, 5)
 y = 2 * np.cos(X[:, 3]) + X[:, 0] ** 2 - 2
-expressions = pysr(X, y, binary_operators=["+", "-", "*", "/"], **kwargs)
-print(best(expressions))
+model = PySRRegressor(binary_operators=["+", "-", "*", "/"], **kwargs)
+model.fit(X, y)
+print(model)
 ```
 
 ## 2. Custom operator
@@ -34,14 +35,13 @@ Here, we define a custom operator and use it to find an expression:
 ```python
 X = 2 * np.random.randn(100, 5)
 y = 1 / X[:, 0]
-expressions = pysr(
-    X,
-    y,
+model = PySRRegressor(
     binary_operators=["plus", "mult"],
     unary_operators=["inv(x) = 1/x"],
     **kwargs
 )
-print(best(expressions))
+model.fit(X, y)
+print(model)
 ```
 
 ## 3. Multiple outputs
@@ -51,23 +51,23 @@ each requiring a different feature.
 ```python
 X = 2 * np.random.randn(100, 5)
 y = 1 / X[:, [0, 1, 2]]
-expressions = pysr(
-    X,
-    y,
+model = PySRRegressor(
     binary_operators=["plus", "mult"],
     unary_operators=["inv(x) = 1/x"],
     **kwargs
 )
+model.fit(X, y)
 ```
 
 ## 4. Plotting an expression
 
 Here, let's use the same equations, but get a format we can actually
-use and test. We can add this option after a search via the `get_hof`
+use and test. We can add this option after a search via the `set_params`
 function:
 
 ```python
-expressions = get_hof(extra_sympy_mappings={"inv": lambda x: 1/x})
+model.set_params(extra_sympy_mappings={"inv": lambda x: 1/x})
+model.sympy()
 ```
 If you look at the lists of expressions before and after, you will
 see that the sympy format now has replaced `inv` with `1/`.
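
Example 4 stops at the SymPy conversion; a sketch of the actual plot, assuming matplotlib (not mentioned in the docs above) and the fitted `model` from example 2:
```python
import matplotlib.pyplot as plt

y_pred = model.predict(X)     # numpy evaluation of the selected equation
plt.scatter(y, y_pred, s=10)  # truth vs. prediction
plt.xlabel("truth")
plt.ylabel("prediction")
plt.show()
```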
docs/operators.md CHANGED
@@ -49,7 +49,7 @@ Instead of passing a predefined operator as a string,
 you can define one by passing it to the `pysr` function, with, e.g.,
 
 ```python
-pysr(
+PySRRegressor(
     ...,
     unary_operators=["myfunction(x) = x^2"],
     binary_operators=["myotherfunction(x, y) = x^2*y"]
@@ -57,9 +57,7 @@ you can define one by passing it to the `pysr` function, with, e.g.,
 ```
 
 
-You can also define your own in `julia/operators.jl`,
-and pass the function name as a string. This is suitable
-for more complex functions. Make sure that it works with
+Make sure that it works with
 `Float32` as a datatype. That means you need to write `1.5f3`
 instead of `1.5e3`, if you write any constant numbers.
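As a concrete illustration of the `Float32` rule above, a hedged sketch (the operator name and constant are made up): the Julia definition uses `1.5f3`, while the matching SymPy mapping uses the ordinary Python `1.5e3`:
```python
from pysr import PySRRegressor

# `scaled_sq` is a hypothetical operator; note 1.5f3 (a Float32 literal) on the Julia side.
model = PySRRegressor(
    unary_operators=["scaled_sq(x) = 1.5f3 * x^2"],
    extra_sympy_mappings={"scaled_sq": lambda x: 1.5e3 * x**2},
)
```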
docs/options.md CHANGED
@@ -1,10 +1,8 @@
 # Features and Options
 
-You likely don't need to tune the hyperparameters yourself,
-but if you would like, you can use `hyperparamopt.py` as an example.
-
 Some configurable features and options in `PySR` which you
 may find useful include:
+- `model_selection`
 - `binary_operators`, `unary_operators`
 - `niterations`
 - `ncyclesperiteration`
@@ -21,18 +19,31 @@ may find useful include:
 
 These are described below
 
-The program will output a pandas DataFrame containing the equations,
-mean square error, and complexity. It will also dump to a csv
+The program will output a pandas DataFrame containing the equations
+to `PySRRegressor.equations`, with the loss value
+and complexity.
+
+It will also dump to a csv
 at the end of every iteration,
-which is `hall_of_fame_{date_time}.csv` by default. It also prints the
-equations to stdout.
+which is `hall_of_fame_{date_time}.csv` by default.
+It also prints the equations to stdout.
+
+## Model selection
+
+By default, `PySRRegressor` uses `model_selection='best'`,
+which selects an equation from `PySRRegressor.equations` using
+a combination of accuracy and complexity.
+You can also select `model_selection='accuracy'`.
+
+By printing a model (i.e., `print(model)`), you can see
+the equation selection with the arrow shown in the `pick` column.
 
 ## Operators
 
 A list of operators can be found on the operators page.
 One can define custom operators in Julia by passing a string:
 ```python
-equations = pysr.pysr(X, y, niterations=100,
+PySRRegressor(niterations=100,
     binary_operators=["mult", "plus", "special(x, y) = x^2 + y"],
     extra_sympy_mappings={'special': lambda x, y: x**2 + y},
     unary_operators=["cos"])
@@ -51,8 +62,6 @@ so that the SymPy code can understand the output equation from Julia,
 when constructing a useable function. This step is optional, but
 is necessary for the `lambda_format` to work.
 
-One can also edit `operators.jl`.
-
 ## Iterations
 
 This is the total number of generations that `pysr` will run for.
@@ -78,15 +87,15 @@ each population stay closer to the best current equations.
 
 One can adjust the number of workers used by Julia with the
 `procs` option. You should set this equal to the number of cores
-you want `pysr` to use. This will also run `procs` number of
-populations simultaneously by default.
+you want `pysr` to use.
 
 ## Populations
 
-By default, `populations=procs`, but you can set a different
-number of populations with this option. More populations may increase
+By default, `populations=20`, but you can set a different
+number of populations with this option.
+More populations may increase
 the diversity of equations discovered, though will take longer to train.
-However, it may be more efficient to have `populations>procs`,
+However, it is usually more efficient to have `populations>procs`,
 as there are multiple populations running
 on each core.
@@ -100,7 +109,8 @@ instead of the usual 4, which creates more populations
 sigma = ...
 weights = 1/sigma**2
 
-equations = pysr.pysr(X, y, weights=weights, procs=10)
+model = PySRRegressor(procs=10)
+model.fit(X, y, weights=weights)
 ```
 
 ## Max size
@@ -147,55 +157,63 @@ expressions of complexity 5 (e.g., 5.0 + x2 exp(x3)).
 
 ## LaTeX, SymPy
 
-The `pysr` command will return a pandas dataframe. The `sympy_format`
-column gives sympy equations, and the `lambda_format` gives callable
-functions. These use the variable names you have provided.
+After running `model.fit(...)`, you can look at
+`model.equations`, which is a pandas dataframe.
+The `sympy_format` column gives sympy equations,
+and the `lambda_format` gives callable functions.
+You can optionally pass a pandas dataframe to the callable function,
+if you called `.fit` on a pandas dataframe as well.
 
 There are also some helper functions for doing this quickly.
-You can call `get_hof()` (or pass an equation file explicitly to this)
-to get this pandas dataframe.
-
-You can call the functions `best()` to get the sympy format
-for the best equation, using the `score` column to sort equations.
-`best_latex()` returns the LaTeX form of this, and `best_callable()`
-returns a callable function.
+- `model.latex()` will generate a TeX formatted output of your equation.
+- `model.sympy()` will return the SymPy representation.
+- `model.jax()` will return a callable JAX function combined with parameters (see below).
+- `model.pytorch()` will return a PyTorch model (see below).
 
 
 ## Callable exports: numpy, pytorch, jax
 
 By default, the dataframe of equations will contain columns
-with the identifier `lambda_format`. These are simple functions
-which correspond to the equation, but executed
-with numpy functions. You can pass your `X` matrix to these functions
-just as you did to the `pysr` call. Thus, this allows
+with the identifier `lambda_format`.
+These are simple functions which correspond to the equation, but executed
+with numpy functions.
+You can pass your `X` matrix to these functions
+just as you did to the `model.fit` call. Thus, this allows
 you to numerically evaluate the equations over different output.
 
+Calling `model.predict` will execute the `lambda_format` of
+the best equation, and return the result. If you selected
+`model_selection="best"`, this will use an equation that combines
+accuracy with simplicity. For `model_selection="accuracy"`, this will just
+look at accuracy.
 
 One can do the same thing for PyTorch, which uses code
 from [sympytorch](https://github.com/patrick-kidger/sympytorch),
 and for JAX, which uses code from
 [sympy2jax](https://github.com/MilesCranmer/sympy2jax).
 
-For torch, set the argument `output_torch_format=True`, which
-will generate a column `torch_format`. Each element of this column
-is a PyTorch module which runs the equation, using PyTorch functions,
+Calling `model.pytorch()` will return
+a PyTorch module which runs the equation, using PyTorch functions,
 over `X` (as a PyTorch tensor). This is differentiable, and the
 parameters of this PyTorch module correspond to the learned parameters
 in the equation, and are trainable.
+```python
+torch_model = model.pytorch()
+torch_model(X)
+```
+**Warning: If you are using custom operators, you must define `extra_torch_mappings` or `extra_jax_mappings` (both are `dict` of callables) to provide an equivalent definition of the functions.** (At any time you can set these parameters or any others with `model.set_params`.)
 
-For jax, set the argument `output_jax_format=True`, which
-will generate a column `jax_format`. Each element of this column
-is a dictionary containing a `'callable'` (a JAX function),
+For JAX, you can equivalently call `model.jax()`.
+This will return a dictionary containing a `'callable'` (a JAX function)
 and `'parameters'` (a list of parameters in the equation).
-One can execute this function with: `element['callable'](X, element['parameters'])`.
+You can execute this function with:
+```python
+jax_model = model.jax()
+jax_model['callable'](X, jax_model['parameters'])
+```
 Since the parameter list is a jax array, this therefore lets you also
 train the parameters within JAX (and is differentiable).
 
-If you forget to turn these on when calling the function initially,
-you can re-run `get_hof(output_jax_format=True)`, and it will re-use
-the equations and other state properties, assuming you haven't
-re-run `pysr` in the meantime!
-
 ## `loss`
 
 The default loss is mean-square error, and weighted mean-square error.
@@ -209,26 +227,29 @@ Here are some additional examples:
 
 abs(x-y) loss
 ```python
-pysr(..., loss="f(x, y) = abs(x - y)^1.5")
+PySRRegressor(..., loss="f(x, y) = abs(x - y)^1.5")
 ```
 Note that the function name doesn't matter:
 ```python
-pysr(..., loss="loss(x, y) = abs(x * y)")
+PySRRegressor(..., loss="loss(x, y) = abs(x * y)")
 ```
 With weights:
 ```python
-pysr(..., weights=weights, loss="myloss(x, y, w) = w * abs(x - y)")
+model = PySRRegressor(..., loss="myloss(x, y, w) = w * abs(x - y)")
+model.fit(..., weights=weights)
 ```
 Weights can be used in arbitrary ways:
 ```python
-pysr(..., weights=weights, loss="myloss(x, y, w) = abs(x - y)^2/w^2")
+model = PySRRegressor(..., loss="myloss(x, y, w) = abs(x - y)^2/w^2")
+model.fit(..., weights=weights)
 ```
 Built-in loss (faster) (see [losses](https://astroautomata.com/SymbolicRegression.jl/dev/losses/)).
 This one computes the L3 norm:
 ```python
-pysr(..., loss="LPDistLoss{3}()")
+PySRRegressor(..., loss="LPDistLoss{3}()")
 ```
 Can also use these losses for weighted (weighted-average):
 ```python
-pysr(..., weights=weights, loss="LPDistLoss{3}()")
+model = PySRRegressor(..., loss="LPDistLoss{3}()")
+model.fit(..., weights=weights)
 ```
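
Tying the "Model selection" section above to these examples: the strategy can be changed even after fitting (the README in this commit notes this), e.g. via the sklearn-style `set_params`. A sketch, assuming `X` and `y` are already defined:
```python
model = PySRRegressor(model_selection="best")
model.fit(X, y)

model.set_params(model_selection="accuracy")  # switch strategy post-fit
y_pred = model.predict(X)                     # now uses the most accurate equation
```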
docs/start.md CHANGED
@@ -1,6 +1,4 @@
-# Getting Started
-
-## Installation
+# Installation
 PySR uses both Julia and Python, so you need to have both installed.
 
 Install Julia - see [downloads](https://julialang.org/downloads/), and
@@ -16,47 +14,100 @@ python3 -c 'import pysr; pysr.install()'
 The second line will install and update the required Julia packages, including
 `PyCall.jl`.
 
-## Quickstart
+Most common issues at this stage are solved
+by [tweaking the Julia package server](https://github.com/MilesCranmer/PySR/issues/27)
+to use up-to-date packages.
 
+# Quickstart
+
+Let's create a PySR example. First, let's import
+numpy to generate some test data:
 ```python
 import numpy as np
-from pysr import pysr, best, get_hof
 
-# Dataset
-X = 2*np.random.randn(100, 5)
-y = 2*np.cos(X[:, 3]) + X[:, 0]**2 - 2
+X = 2 * np.random.randn(100, 5)
+y = 2.5382 * np.cos(X[:, 3]) + X[:, 0] ** 2 - 0.5
+```
+We have created a dataset with 100 datapoints, with 5 features each.
+The relation we wish to model is $2.5382 \cos(x_3) + x_0^2 - 0.5$.
 
-# Learn equations
-equations = pysr(X, y, niterations=5,
-    binary_operators=["plus", "mult"],
-    unary_operators=["cos", "exp", "sin"])
-
-...# (you can use ctl-c to exit early)
-
-print(best())
+Now, let's create a PySR model and train it.
+PySR's main interface is in the style of scikit-learn:
+```python
+from pysr import PySRRegressor
+model = PySRRegressor(
+    niterations=5,
+    populations=8,
+    binary_operators=["+", "*"],
+    unary_operators=[
+        "cos",
+        "exp",
+        "sin",
+        "inv(x)=1/x",  # Custom operator (julia syntax)
+    ],
+    model_selection="best",
+    loss="loss(x, y) = (x - y)^2",  # Custom loss function (julia syntax)
+)
 ```
+This will set up the model for 5 iterations of the search code, which contains hundreds of thousands of mutations and equation evaluations.
 
-which gives:
-
+Let's train this model on our dataset:
 ```python
-x0**2 + 2.000016*cos(x3) - 1.9999845
+model.fit(X, y)
 ```
+Internally, this launches a Julia process which will do a multithreaded search for equations to fit the dataset.
 
-The second and additional calls of `pysr` will be significantly
-faster in startup time, since the first call to Julia will compile
-and cache functions from the symbolic regression backend.
+Equations will be printed during training, and once you are satisfied, you may
+quit early by hitting 'q' and then \<enter\>.
 
-One can also use `best_tex` to get the LaTeX form,
-or `best_callable` to get a function you can call.
-This uses a score which balances complexity and error;
-however, one can see the full list of equations with:
+After the model has been fit, you can run `model.predict(X)`
+to see the predictions on a given dataset.
+
+You may run:
 ```python
-print(get_hof())
+print(model)
 ```
-This is a pandas table, with additional columns:
+to print the learned equations:
+```python
+PySRRegressor.equations = [
+       pick      score                                           equation          loss  complexity
+    0         0.000000                                          3.0282464  2.816982e+01           1
+    1         1.008026                                          (x0 * x0)  3.751666e+00           3
+    2         0.015337                          (-0.33649465 + (x0 * x0))  3.638336e+00           5
+    3         0.888050                              ((x0 * x0) + cos(x3))  1.497019e+00           6
+    4         0.898539                ((x0 * x0) + (2.4816332 * cos(x3)))  2.481797e-01           8
+    5  >>>> 10.604434  ((-0.49998775 + (x0 * x0)) + (2.5382009 * cos(...  1.527115e-10          10
+]
+```
+The arrow in the `pick` column indicates which equation is currently selected by your
+`model_selection` strategy for prediction.
+(You may change `model_selection` after `.fit(X, y)` as well.)
 
-- `MSE` - the mean square error of the formula
-- `score` - a metric akin to Occam's razor; you should use this to help select the "true" equation.
-- `sympy_format` - sympy equation.
-- `lambda_format` - a lambda function for that equation, that you can pass values through.
+`model.equations` is a pandas DataFrame containing all equations, including callable format
+(`lambda_format`),
+SymPy format (`sympy_format`), and even JAX and PyTorch format
+(both of which are differentiable).
+
+There are several other useful features, such as denoising (e.g., `denoise=True`),
+feature selection (e.g., `select_k_features=3`), and many others.
+For a summary of features and options, see [this docs page](https://pysr.readthedocs.io/en/latest/docs/options/).
+You can see the full API at [this page](https://pysr.readthedocs.io/en/latest/docs/api-documentation/).
+
+# Docker
+
+You can also test out PySR in Docker, without
+installing it locally, by running the following command in
+the root directory of this repo:
+```bash
+docker build --pull --rm -f "Dockerfile" -t pysr "."
+```
+This builds an image called `pysr`. If you have issues building (for example, on Apple Silicon),
+you can emulate an architecture that works by including: `--platform linux/amd64`.
+You can then run this with:
+```bash
+docker run -it --rm -v "$PWD:/data" pysr ipython
+```
+which will link the current directory to the container's `/data` directory
+and then launch ipython.
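
Beyond `print(model)`, the selected row can also be pulled out programmatically; `get_best` appears in this commit's API listing (the exact return shape is an assumption here):
```python
best_row = model.get_best()  # the row of model.equations chosen by model_selection
print(best_row)
```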
example.py CHANGED
@@ -1,25 +1,24 @@
 import numpy as np
-from pysr import pysr, best
 
-# Dataset
 X = 2 * np.random.randn(100, 5)
-y = 2 * np.cos(X[:, 3]) + X[:, 0] ** 2 - 2
+y = 2.5382 * np.cos(X[:, 3]) + X[:, 0] ** 2 - 0.5
 
-# Learn equations
-equations = pysr(
-    X,
-    y,
+from pysr import PySRRegressor
+
+model = PySRRegressor(
     niterations=5,
-    binary_operators=["plus", "mult"],
+    populations=8,
+    binary_operators=["+", "*"],
     unary_operators=[
         "cos",
         "exp",
-        "sin",  # Pre-defined library of operators (see https://pysr.readthedocs.io/en/latest/docs/operators/)
-        "inv(x) = 1/x",
+        "sin",
+        "inv(x) = 1/x",  # Custom operator (julia syntax)
     ],
-    loss="loss(x, y) = abs(x - y)",  # Custom loss function
-)  # Define your own operator! (Julia syntax)
+    model_selection="best",
+    loss="loss(x, y) = (x - y)^2",  # Custom loss function (julia syntax)
+)
 
-... # (you can use ctl-c to exit early)
+model.fit(X, y)
 
-print(best(equations))
+print(model)
pydoc-markdown.yml CHANGED
@@ -54,5 +54,19 @@ renderer:
       preamble: {weight: 4}
   - title: API Documentation
     contents:
-      - pysr.sr.*
+      - pysr.sr.PySRRegressor.__init__
+      - pysr.sr.PySRRegressor.fit
+      - pysr.sr.PySRRegressor.predict
+      - pysr.sr.PySRRegressor.__repr__
+      - pysr.sr.PySRRegressor.set_params
+      - pysr.sr.PySRRegressor.get_params
+      - pysr.sr.PySRRegressor.get_best
+      - pysr.sr.PySRRegressor.sympy
+      - pysr.sr.PySRRegressor.latex
+      - pysr.sr.PySRRegressor.jax
+      - pysr.sr.PySRRegressor.pytorch
+      - pysr.sr.PySRRegressor.refresh
+      - pysr.sr.__repr__
+      - pysr.sr.install
+      - pysr.sr.silence_julia_warning
     preamble: {weight: 5}
pysr/__init__.py CHANGED
@@ -1,6 +1,6 @@
 from .sr import (
     pysr,
-    get_hof,
+    PySRRegressor,
     best,
     best_tex,
     best_callable,
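
Per the diff above, the legacy functional helpers remain exported alongside the new class, so both import styles work during the transition:
```python
from pysr import pysr, best     # legacy functional interface
from pysr import PySRRegressor  # new sklearn-style interface
```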
pysr/sr.py CHANGED
@@ -11,11 +11,15 @@ from pathlib import Path
11
  from datetime import datetime
12
  import warnings
13
  from multiprocessing import cpu_count
 
14
 
15
  is_julia_warning_silenced = False
16
 
17
 
18
  def install(julia_project=None): # pragma: no cover
 
 
 
19
  import julia
20
 
21
  julia.install()
@@ -36,20 +40,6 @@ def install(julia_project=None): # pragma: no cover
36
 
37
 
38
  Main = None
39
- global_state = dict(
40
- equation_file="hall_of_fame.csv",
41
- n_features=None,
42
- variable_names=[],
43
- extra_sympy_mappings={},
44
- extra_torch_mappings={},
45
- extra_jax_mappings={},
46
- output_jax_format=False,
47
- output_torch_format=False,
48
- multioutput=False,
49
- nout=1,
50
- selection=None,
51
- raw_julia_output=None,
52
- )
53
 
54
  already_ran = False
55
 
@@ -93,533 +83,14 @@ sympy_mappings = {
93
  }
94
 
95
 
96
- def pysr(
97
- X,
98
- y,
99
- weights=None,
100
- binary_operators=None,
101
- unary_operators=None,
102
- procs=cpu_count(),
103
- loss="L2DistLoss()",
104
- populations=20,
105
- niterations=100,
106
- ncyclesperiteration=300,
107
- alpha=0.1,
108
- annealing=False,
109
- fractionReplaced=0.10,
110
- fractionReplacedHof=0.10,
111
- npop=1000,
112
- parsimony=1e-4,
113
- migration=True,
114
- hofMigration=True,
115
- shouldOptimizeConstants=True,
116
- topn=10,
117
- weightAddNode=1,
118
- weightInsertNode=3,
119
- weightDeleteNode=3,
120
- weightDoNothing=1,
121
- weightMutateConstant=10,
122
- weightMutateOperator=1,
123
- weightRandomize=1,
124
- weightSimplify=0.002,
125
- perturbationFactor=1.0,
126
- extra_sympy_mappings=None,
127
- extra_torch_mappings=None,
128
- extra_jax_mappings=None,
129
- equation_file=None,
130
- verbosity=1e9,
131
- progress=None,
132
- maxsize=20,
133
- fast_cycle=False,
134
- maxdepth=None,
135
- variable_names=None,
136
- batching=False,
137
- batchSize=50,
138
- select_k_features=None,
139
- warmupMaxsizeBy=0.0,
140
- constraints=None,
141
- useFrequency=True,
142
- tempdir=None,
143
- delete_tempfiles=True,
144
- julia_project=None,
145
- update=True,
146
- temp_equation_file=False,
147
- output_jax_format=False,
148
- output_torch_format=False,
149
- optimizer_algorithm="BFGS",
150
- optimizer_nrestarts=3,
151
- optimize_probability=1.0,
152
- optimizer_iterations=10,
153
- tournament_selection_n=10,
154
- tournament_selection_p=1.0,
155
- denoise=False,
156
- Xresampled=None,
157
- precision=32,
158
- multithreading=None,
159
- **kwargs,
160
- ):
161
- """Run symbolic regression to fit f(X[i, :]) ~ y[i] for all i.
162
- Note: most default parameters have been tuned over several example
163
- equations, but you should adjust `niterations`,
164
- `binary_operators`, `unary_operators` to your requirements.
165
- You can view more detailed explanations of the options on the
166
- [options page](https://pysr.readthedocs.io/en/latest/docs/options/) of the documentation.
167
-
168
- :param X: 2D array. Rows are examples, columns are features. If pandas DataFrame, the columns are used for variable names (so make sure they don't contain spaces).
169
- :type X: np.ndarray/pandas.DataFrame
170
- :param y: 1D array (rows are examples) or 2D array (rows are examples, columns are outputs). Putting in a 2D array will trigger a search for equations for each feature of y.
171
- :type y: np.ndarray
172
- :param weights: same shape as y. Each element is how to weight the mean-square-error loss for that particular element of y.
173
- :type weights: np.ndarray
174
- :param binary_operators: List of strings giving the binary operators in Julia's Base. Default is ["+", "-", "*", "/",].
175
- :type binary_operators: list
176
- :param unary_operators: Same but for operators taking a single scalar. Default is [].
177
- :type unary_operators: list
178
- :param procs: Number of processes (=number of populations running).
179
- :type procs: int
180
- :param loss: String of Julia code specifying the loss function. Can either be a loss from LossFunctions.jl, or your own loss written as a function. Examples of custom written losses include: `myloss(x, y) = abs(x-y)` for non-weighted, or `myloss(x, y, w) = w*abs(x-y)` for weighted. Among the included losses, these are as follows. Regression: `LPDistLoss{P}()`, `L1DistLoss()`, `L2DistLoss()` (mean square), `LogitDistLoss()`, `HuberLoss(d)`, `L1EpsilonInsLoss(ϵ)`, `L2EpsilonInsLoss(ϵ)`, `PeriodicLoss(c)`, `QuantileLoss(τ)`. Classification: `ZeroOneLoss()`, `PerceptronLoss()`, `L1HingeLoss()`, `SmoothedL1HingeLoss(γ)`, `ModifiedHuberLoss()`, `L2MarginLoss()`, `ExpLoss()`, `SigmoidLoss()`, `DWDMarginLoss(q)`.
181
- :type loss: str
182
- :param populations: Number of populations running.
183
- :type populations: int
184
- :param niterations: Number of iterations of the algorithm to run. The best equations are printed, and migrate between populations, at the end of each.
185
- :type niterations: int
186
- :param ncyclesperiteration: Number of total mutations to run, per 10 samples of the population, per iteration.
187
- :type ncyclesperiteration: int
188
- :param alpha: Initial temperature.
189
- :type alpha: float
190
- :param annealing: Whether to use annealing. You should (and it is default).
191
- :type annealing: bool
192
- :param fractionReplaced: How much of population to replace with migrating equations from other populations.
193
- :type fractionReplaced: float
194
- :param fractionReplacedHof: How much of population to replace with migrating equations from hall of fame.
195
- :type fractionReplacedHof: float
196
- :param npop: Number of individuals in each population
197
- :type npop: int
198
- :param parsimony: Multiplicative factor for how much to punish complexity.
199
- :type parsimony: float
200
- :param migration: Whether to migrate.
201
- :type migration: bool
202
- :param hofMigration: Whether to have the hall of fame migrate.
203
- :type hofMigration: bool
204
- :param shouldOptimizeConstants: Whether to numerically optimize constants (Nelder-Mead/Newton) at the end of each iteration.
205
- :type shouldOptimizeConstants: bool
206
- :param topn: How many top individuals migrate from each population.
207
- :type topn: int
208
- :param perturbationFactor: Constants are perturbed by a max factor of (perturbationFactor*T + 1). Either multiplied by this or divided by this.
209
- :type perturbationFactor: float
210
- :param weightAddNode: Relative likelihood for mutation to add a node
211
- :type weightAddNode: float
212
- :param weightInsertNode: Relative likelihood for mutation to insert a node
213
- :type weightInsertNode: float
214
- :param weightDeleteNode: Relative likelihood for mutation to delete a node
215
- :type weightDeleteNode: float
216
- :param weightDoNothing: Relative likelihood for mutation to leave the individual
217
- :type weightDoNothing: float
218
- :param weightMutateConstant: Relative likelihood for mutation to change the constant slightly in a random direction.
219
- :type weightMutateConstant: float
220
- :param weightMutateOperator: Relative likelihood for mutation to swap an operator.
221
- :type weightMutateOperator: float
222
- :param weightRandomize: Relative likelihood for mutation to completely delete and then randomly generate the equation
223
- :type weightRandomize: float
224
- :param weightSimplify: Relative likelihood for mutation to simplify constant parts by evaluation
225
- :type weightSimplify: float
226
- :param equation_file: Where to save the files (.csv separated by |)
227
- :type equation_file: str
228
- :param verbosity: What verbosity level to use. 0 means minimal print statements.
229
- :type verbosity: int
230
- :param progress: Whether to use a progress bar instead of printing to stdout.
231
- :type progress: bool
232
- :param maxsize: Max size of an equation.
233
- :type maxsize: int
234
- :param maxdepth: Max depth of an equation. You can use both maxsize and maxdepth. maxdepth is by default set to = maxsize, which means that it is redundant.
235
- :type maxdepth: int
236
- :param fast_cycle: (experimental) - batch over population subsamples. This is a slightly different algorithm than regularized evolution, but does cycles 15% faster. May be algorithmically less efficient.
237
- :type fast_cycle: bool
238
- :param variable_names: a list of names for the variables, other than "x0", "x1", etc.
239
- :type variable_names: list
240
- :param batching: whether to compare population members on small batches during evolution. Still uses full dataset for comparing against hall of fame.
241
- :type batching: bool
242
- :param batchSize: the amount of data to use if doing batching.
243
- :type batchSize: int
244
- :param select_k_features: whether to run feature selection in Python using random forests, before passing to the symbolic regression code. None means no feature selection; an int means select that many features.
245
- :type select_k_features: None/int
246
- :param warmupMaxsizeBy: whether to slowly increase max size from a small number up to the maxsize (if greater than 0). If greater than 0, says the fraction of training time at which the current maxsize will reach the user-passed maxsize.
247
- :type warmupMaxsizeBy: float
248
- :param constraints: dictionary of int (unary) or 2-tuples (binary), this enforces maxsize constraints on the individual arguments of operators. E.g., `'pow': (-1, 1)` says that power laws can have any complexity left argument, but only 1 complexity exponent. Use this to force more interpretable solutions.
249
- :type constraints: dict
250
- :param useFrequency: whether to measure the frequency of complexities, and use that instead of parsimony to explore equation space. Will naturally find equations of all complexities.
251
- :type useFrequency: bool
252
- :param tempdir: directory for the temporary files
253
- :type tempdir: str/None
254
- :param delete_tempfiles: whether to delete the temporary files after finishing
255
- :type delete_tempfiles: bool
256
- :param julia_project: a Julia environment location containing a Project.toml (and potentially the source code for SymbolicRegression.jl). Default gives the Python package directory, where a Project.toml file should be present from the install.
257
- :type julia_project: str/None
258
- :param update: Whether to automatically update Julia packages.
259
- :type update: bool
260
- :param temp_equation_file: Whether to put the hall of fame file in the temp directory. Deletion is then controlled with the delete_tempfiles argument.
261
- :type temp_equation_file: bool
262
- :param output_jax_format: Whether to create a 'jax_format' column in the output, containing jax-callable functions and the default parameters in a jax array.
263
- :type output_jax_format: bool
264
- :param output_torch_format: Whether to create a 'torch_format' column in the output, containing a torch module with trainable parameters.
265
- :type output_torch_format: bool
266
- :param tournament_selection_n: Number of expressions to consider in each tournament.
267
- :type tournament_selection_n: int
268
- :param tournament_selection_p: Probability of selecting the best expression in each tournament. The probability will decay as p*(1-p)^n for other expressions, sorted by loss.
269
- :type tournament_selection_p: float
270
- :param denoise: Whether to use a Gaussian Process to denoise the data before inputting to PySR. Can help PySR fit noisy data.
271
- :type denoise: bool
272
- :param precision: What precision to use for the data. By default this is 32 (float32), but you can select 64 or 16 as well.
273
- :type precision: int
274
- :param multithreading: Use multithreading instead of distributed backend. Default is yes. Using procs=0 will turn off both.
275
- :type multithreading: bool
276
- :param **kwargs: Other options passed to SymbolicRegression.Options, for example, if you modify SymbolicRegression.jl to include additional arguments.
277
- :type **kwargs: dict
278
- :returns: Results dataframe, giving complexity, MSE, and equations (as strings), as well as functional forms. If list, each element corresponds to a dataframe of equations for each output.
279
- :type: pd.DataFrame/list
280
- """
281
- global already_ran
282
-
283
- if binary_operators is None:
284
- binary_operators = "+ * - /".split(" ")
285
- if unary_operators is None:
286
- unary_operators = []
287
- if extra_sympy_mappings is None:
288
- extra_sympy_mappings = {}
289
- if variable_names is None:
290
- variable_names = []
291
- if constraints is None:
292
- constraints = {}
293
- if multithreading is None:
294
- # Default is multithreading=True, unless explicitly set,
295
- # or procs is set to 0 (serial mode).
296
- multithreading = procs != 0
297
-
298
- global Main
299
- if Main is None:
300
- if multithreading:
301
- os.environ["JULIA_NUM_THREADS"] = str(procs)
302
-
303
- Main = init_julia()
304
-
305
- buffer_available = "buffer" in sys.stdout.__dir__()
306
-
307
- if progress is not None:
308
- if progress and not buffer_available:
309
- warnings.warn(
310
- "Note: it looks like you are running in Jupyter. The progress bar will be turned off."
311
- )
312
- progress = False
313
- else:
314
- progress = buffer_available
315
-
316
- assert optimizer_algorithm in ["NelderMead", "BFGS"]
317
- assert tournament_selection_n < npop
318
-
319
- if isinstance(X, pd.DataFrame):
320
- variable_names = list(X.columns)
321
- X = np.array(X)
322
-
323
- if len(X.shape) == 1:
324
- X = X[:, None]
325
-
326
- assert not isinstance(y, pd.DataFrame)
327
-
328
- if len(variable_names) == 0:
329
- variable_names = [f"x{i}" for i in range(X.shape[1])]
330
-
331
- if extra_jax_mappings is not None:
332
- for value in extra_jax_mappings.values():
333
- if not isinstance(value, str):
334
- raise NotImplementedError(
335
- "extra_jax_mappings must have keys that are strings! e.g., {sympy.sqrt: 'jnp.sqrt'}."
336
- )
337
-
338
- if extra_torch_mappings is not None:
339
- for value in extra_jax_mappings.values():
340
- if not callable(value):
341
- raise NotImplementedError(
342
- "extra_torch_mappings must be callable functions! e.g., {sympy.sqrt: torch.sqrt}."
343
- )
344
-
345
- use_custom_variable_names = len(variable_names) != 0
346
- # TODO: this is always true.
347
-
348
- _check_assertions(
349
- X,
350
- binary_operators,
351
- unary_operators,
352
- use_custom_variable_names,
353
- variable_names,
354
- weights,
355
- y,
356
- )
357
-
358
- if len(X) > 10000 and not batching:
359
- warnings.warn(
360
- "Note: you are running with more than 10,000 datapoints. You should consider turning on batching (https://pysr.readthedocs.io/en/latest/docs/options/#batching). You should also reconsider if you need that many datapoints. Unless you have a large amount of noise (in which case you should smooth your dataset first), generally < 10,000 datapoints is enough to find a functional form with symbolic regression. More datapoints will lower the search speed."
361
- )
362
-
363
- if maxsize > 40:
364
- warnings.warn(
365
- "Note: Using a large maxsize for the equation search will be exponentially slower and use significant memory. You should consider turning `useFrequency` to False, and perhaps use `warmupMaxsizeBy`."
366
- )
367
- if maxsize < 7:
368
- raise NotImplementedError("PySR requires a maxsize of at least 7")
369
-
370
- X, selection = _handle_feature_selection(X, select_k_features, y, variable_names)
371
-
372
- if maxdepth is None:
373
- maxdepth = maxsize
374
- if isinstance(binary_operators, str):
375
- binary_operators = [binary_operators]
376
- if isinstance(unary_operators, str):
377
- unary_operators = [unary_operators]
378
-
379
- if len(y.shape) == 1 or (len(y.shape) == 2 and y.shape[1] == 1):
380
- multioutput = False
381
- nout = 1
382
- y = y.reshape(-1)
383
- elif len(y.shape) == 2:
384
- multioutput = True
385
- nout = y.shape[1]
386
- else:
387
- raise NotImplementedError("y shape not supported!")
388
-
389
- if denoise:
390
- if weights is not None:
391
- raise NotImplementedError(
392
- "No weights for denoising - the weights are learned."
393
- )
394
- if Xresampled is not None:
395
- # Select among only the selected features:
396
- if isinstance(Xresampled, pd.DataFrame):
397
- # Handle Xresampled is pandas dataframe
398
- if selection is not None:
399
- Xresampled = Xresampled[[variable_names[i] for i in selection]]
400
- else:
401
- Xresampled = Xresampled[variable_names]
402
- Xresampled = np.array(Xresampled)
403
- else:
404
- if selection is not None:
405
- Xresampled = Xresampled[:, selection]
406
- if multioutput:
407
- y = np.stack(
408
- [_denoise(X, y[:, i], Xresampled=Xresampled)[1] for i in range(nout)],
409
- axis=1,
410
- )
411
- if Xresampled is not None:
412
- X = Xresampled
413
- else:
414
- X, y = _denoise(X, y, Xresampled=Xresampled)
415
-
416
- julia_project = _get_julia_project(julia_project)
417
-
418
- tmpdir = Path(tempfile.mkdtemp(dir=tempdir))
419
-
420
- if temp_equation_file:
421
- equation_file = tmpdir / "hall_of_fame.csv"
422
- elif equation_file is None:
423
- date_time = datetime.now().strftime("%Y-%m-%d_%H%M%S.%f")[:-3]
424
- equation_file = "hall_of_fame_" + date_time + ".csv"
425
-
426
- _create_inline_operators(
427
- binary_operators=binary_operators, unary_operators=unary_operators
428
- )
429
- _handle_constraints(
430
- binary_operators=binary_operators,
431
- unary_operators=unary_operators,
432
- constraints=constraints,
433
- )
434
-
435
- una_constraints = [constraints[op] for op in unary_operators]
436
- bin_constraints = [constraints[op] for op in binary_operators]
437
-
438
- try:
439
- # TODO: is this needed since Julia now prints directly to stdout?
440
- term_width = shutil.get_terminal_size().columns
441
- except:
442
- _, term_width = subprocess.check_output(["stty", "size"]).split()
443
-
444
- if not already_ran:
445
- from julia import Pkg
446
-
447
- Pkg.activate(f"{_escape_filename(julia_project)}")
448
- try:
449
- if update:
450
- Pkg.resolve()
451
- Pkg.instantiate()
452
- else:
453
- Pkg.instantiate()
454
- except RuntimeError as e:
455
- raise ImportError(
456
- f"""
457
- Required dependencies are not installed or built. Run the following code in the Python REPL:
458
-
459
- >>> import pysr
460
- >>> pysr.install()
461
-
462
- Tried to activate project {julia_project} but failed."""
463
- ) from e
464
- Main.eval("using SymbolicRegression")
465
-
466
- Main.plus = Main.eval("(+)")
467
- Main.sub = Main.eval("(-)")
468
- Main.mult = Main.eval("(*)")
469
- Main.pow = Main.eval("(^)")
470
- Main.div = Main.eval("(/)")
471
-
472
- Main.custom_loss = Main.eval(loss)
473
-
474
- mutationWeights = [
475
- float(weightMutateConstant),
476
- float(weightMutateOperator),
477
- float(weightAddNode),
478
- float(weightInsertNode),
479
- float(weightDeleteNode),
480
- float(weightSimplify),
481
- float(weightRandomize),
482
- float(weightDoNothing),
483
- ]
484
-
485
- options = Main.Options(
486
- binary_operators=Main.eval(str(tuple(binary_operators)).replace("'", "")),
487
- unary_operators=Main.eval(str(tuple(unary_operators)).replace("'", "")),
488
- bin_constraints=bin_constraints,
489
- una_constraints=una_constraints,
490
- parsimony=float(parsimony),
491
- loss=Main.custom_loss,
492
- alpha=float(alpha),
493
- maxsize=int(maxsize),
494
- maxdepth=int(maxdepth),
495
- fast_cycle=fast_cycle,
496
- migration=migration,
497
- hofMigration=hofMigration,
498
- fractionReplacedHof=float(fractionReplacedHof),
499
- shouldOptimizeConstants=shouldOptimizeConstants,
- hofFile=_escape_filename(equation_file),
- npopulations=int(populations),
- optimizer_algorithm=optimizer_algorithm,
- optimizer_nrestarts=int(optimizer_nrestarts),
- optimize_probability=float(optimize_probability),
- optimizer_iterations=int(optimizer_iterations),
- perturbationFactor=float(perturbationFactor),
- annealing=annealing,
- batching=batching,
- batchSize=int(min([batchSize, len(X)]) if batching else len(X)),
- mutationWeights=mutationWeights,
- warmupMaxsizeBy=float(warmupMaxsizeBy),
- useFrequency=useFrequency,
- npop=int(npop),
- ns=int(tournament_selection_n),
- probPickFirst=float(tournament_selection_p),
- ncyclesperiteration=int(ncyclesperiteration),
- fractionReplaced=float(fractionReplaced),
- topn=int(topn),
- verbosity=int(verbosity),
- progress=progress,
- terminal_width=int(term_width),
- **kwargs,
- )
-
- np_dtype = {16: np.float16, 32: np.float32, 64: np.float64}[precision]
-
- Main.X = np.array(X, dtype=np_dtype).T
- if len(y.shape) == 1:
- Main.y = np.array(y, dtype=np_dtype)
- else:
- Main.y = np.array(y, dtype=np_dtype).T
- if weights is not None:
- if len(weights.shape) == 1:
- Main.weights = np.array(weights, dtype=np_dtype)
- else:
- Main.weights = np.array(weights, dtype=np_dtype).T
- else:
- Main.weights = None
-
- cprocs = 0 if multithreading else procs
-
- raw_julia_output = Main.EquationSearch(
- Main.X,
- Main.y,
- weights=Main.weights,
- niterations=int(niterations),
- varMap=(
- variable_names
- if selection is None
- else [variable_names[i] for i in selection]
- ),
- options=options,
- numprocs=int(cprocs),
- multithreading=bool(multithreading),
- )
-
- _set_globals(
- X=X,
- equation_file=equation_file,
- variable_names=variable_names,
- extra_sympy_mappings=extra_sympy_mappings,
- extra_torch_mappings=extra_torch_mappings,
- extra_jax_mappings=extra_jax_mappings,
- output_jax_format=output_jax_format,
- output_torch_format=output_torch_format,
- multioutput=multioutput,
- nout=nout,
- selection=selection,
- raw_julia_output=raw_julia_output,
- )
-
- equations = get_hof(
- equation_file=equation_file,
- n_features=X.shape[1],
- variable_names=variable_names,
- output_jax_format=output_jax_format,
- output_torch_format=output_torch_format,
- selection=selection,
- extra_sympy_mappings=extra_sympy_mappings,
- extra_jax_mappings=extra_jax_mappings,
- extra_torch_mappings=extra_torch_mappings,
- multioutput=multioutput,
- nout=nout,
  )
-
- if delete_tempfiles:
- shutil.rmtree(tmpdir)
-
- already_ran = True
-
- return equations
-
-
- def _set_globals(
- *,
- X,
- equation_file,
- variable_names,
- extra_sympy_mappings,
- extra_torch_mappings,
- extra_jax_mappings,
- output_jax_format,
- output_torch_format,
- multioutput,
- nout,
- selection,
- raw_julia_output,
- ):
- global global_state
-
- global_state["n_features"] = X.shape[1]
- global_state["equation_file"] = equation_file
- global_state["variable_names"] = variable_names
- global_state["extra_sympy_mappings"] = extra_sympy_mappings
- global_state["extra_torch_mappings"] = extra_torch_mappings
- global_state["extra_jax_mappings"] = extra_jax_mappings
- global_state["output_jax_format"] = output_jax_format
- global_state["output_torch_format"] = output_torch_format
- global_state["multioutput"] = multioutput
- global_state["nout"] = nout
- global_state["selection"] = selection
- global_state["raw_julia_output"] = raw_julia_output


  def _handle_constraints(binary_operators, unary_operators, constraints):
@@ -646,6 +117,7 @@ def _handle_constraints(binary_operators, unary_operators, constraints):


 def _create_inline_operators(binary_operators, unary_operators):
 for op_list in [binary_operators, unary_operators]:
 for i, op in enumerate(op_list):
 is_user_defined_operator = "(" in op
@@ -710,234 +182,35 @@ def run_feature_selection(X, y, select_k_features):
 return selector.get_support(indices=True)


- def get_hof(
- equation_file=None,
- n_features=None,
- variable_names=None,
- output_jax_format=None,
- output_torch_format=None,
- selection=None,
- extra_sympy_mappings=None,
- extra_jax_mappings=None,
- extra_torch_mappings=None,
- multioutput=None,
- nout=None,
- **kwargs,
- ):
- """Get the equations from a hall of fame file. If no arguments
- entered, the ones used previously from a call to PySR will be used."""
-
- global global_state
-
- if equation_file is None:
- equation_file = global_state["equation_file"]
- if n_features is None:
- n_features = global_state["n_features"]
- if variable_names is None:
- variable_names = global_state["variable_names"]
- if extra_sympy_mappings is None:
- extra_sympy_mappings = global_state["extra_sympy_mappings"]
- if extra_jax_mappings is None:
- extra_jax_mappings = global_state["extra_jax_mappings"]
- if extra_torch_mappings is None:
- extra_torch_mappings = global_state["extra_torch_mappings"]
- if output_torch_format is None:
- output_torch_format = global_state["output_torch_format"]
- if output_jax_format is None:
- output_jax_format = global_state["output_jax_format"]
- if multioutput is None:
- multioutput = global_state["multioutput"]
- if nout is None:
- nout = global_state["nout"]
- if selection is None:
- selection = global_state["selection"]
-
- global_state["selection"] = selection
- global_state["equation_file"] = equation_file
- global_state["n_features"] = n_features
- global_state["variable_names"] = variable_names
- global_state["extra_sympy_mappings"] = extra_sympy_mappings
- global_state["extra_jax_mappings"] = extra_jax_mappings
- global_state["extra_torch_mappings"] = extra_torch_mappings
- global_state["output_torch_format"] = output_torch_format
- global_state["output_jax_format"] = output_jax_format
- global_state["multioutput"] = multioutput
- global_state["nout"] = nout
- global_state["selection"] = selection
-
- try:
- if multioutput:
- all_outputs = [
- pd.read_csv(str(equation_file) + f".out{i}" + ".bkup", sep="|")
- for i in range(1, nout + 1)
- ]
- else:
- all_outputs = [pd.read_csv(str(equation_file) + ".bkup", sep="|")]
- except FileNotFoundError:
- raise RuntimeError(
- "Couldn't find equation file! The equation search likely exited before a single iteration completed."
- )
-
- ret_outputs = []
-
- for output in all_outputs:
-
- scores = []
- lastMSE = None
- lastComplexity = 0
- sympy_format = []
- lambda_format = []
- if output_jax_format:
- jax_format = []
- if output_torch_format:
- torch_format = []
- use_custom_variable_names = len(variable_names) != 0
- local_sympy_mappings = {**extra_sympy_mappings, **sympy_mappings}
-
- if use_custom_variable_names:
- sympy_symbols = [sympy.Symbol(variable_names[i]) for i in range(n_features)]
- else:
- sympy_symbols = [sympy.Symbol("x%d" % i) for i in range(n_features)]
-
- for _, eqn_row in output.iterrows():
- eqn = sympify(eqn_row["Equation"], locals=local_sympy_mappings)
- sympy_format.append(eqn)
-
- # Numpy:
- lambda_format.append(
- CallableEquation(sympy_symbols, eqn, selection, variable_names)
- )
-
- # JAX:
- if output_jax_format:
- from .export_jax import sympy2jax
-
- func, params = sympy2jax(
- eqn,
- sympy_symbols,
- selection=selection,
- extra_jax_mappings=extra_jax_mappings,
- )
- jax_format.append({"callable": func, "parameters": params})
-
- # Torch:
- if output_torch_format:
- from .export_torch import sympy2torch
-
- module = sympy2torch(
- eqn,
- sympy_symbols,
- selection=selection,
- extra_torch_mappings=extra_torch_mappings,
- )
- torch_format.append(module)
-
- curMSE = eqn_row["MSE"]
- curComplexity = eqn_row["Complexity"]
-
- if lastMSE is None:
- cur_score = 0.0
- else:
- if curMSE > 0.0:
- cur_score = -np.log(curMSE / lastMSE) / (
- curComplexity - lastComplexity
- )
- else:
- cur_score = np.inf
-
- scores.append(cur_score)
- lastMSE = curMSE
- lastComplexity = curComplexity
-
- output["score"] = np.array(scores)
- output["sympy_format"] = sympy_format
- output["lambda_format"] = lambda_format
- output_cols = [
- "Complexity",
- "MSE",
- "score",
- "Equation",
- "sympy_format",
- "lambda_format",
- ]
- if output_jax_format:
- output_cols += ["jax_format"]
- output["jax_format"] = jax_format
- if output_torch_format:
- output_cols += ["torch_format"]
- output["torch_format"] = torch_format
-
- ret_outputs.append(output[output_cols])
-
- if multioutput:
- return ret_outputs
- return ret_outputs[0]
-
-
- def best_row(equations=None):
- """Return the best row of a hall of fame file using the score column.
- By default this uses the last equation file.
- """
- if equations is None:
- equations = get_hof()
- if isinstance(equations, list):
- return [eq.iloc[np.argmax(eq["score"])] for eq in equations]
- return equations.iloc[np.argmax(equations["score"])]
-
-
- def best_tex(equations=None):
- """Return the equation with the best score, in latex format
- By default this uses the last equation file.
- """
- if equations is None:
- equations = get_hof()
- if isinstance(equations, list):
- return [
- sympy.latex(best_row(eq)["sympy_format"].simplify()) for eq in equations
- ]
- return sympy.latex(best_row(equations)["sympy_format"].simplify())


- def best(equations=None):
- """Return the equation with the best score, in sympy format.
- By default this uses the last equation file.
- """
- if equations is None:
- equations = get_hof()
- if isinstance(equations, list):
- return [best_row(eq)["sympy_format"].simplify() for eq in equations]
- return best_row(equations)["sympy_format"].simplify()


- def best_callable(equations=None):
- """Return the equation with the best score, in callable format.
- By default this uses the last equation file.
- """
- if equations is None:
- equations = get_hof()
- if isinstance(equations, list):
- return [best_row(eq)["lambda_format"] for eq in equations]
- return best_row(equations)["lambda_format"]


- def _escape_filename(filename):
- """Turns a file into a string representation with correctly escaped backslashes"""
- str_repr = str(filename)
- str_repr = str_repr.replace("\\", "\\\\")
- return str_repr


- # https://gist.github.com/garrettdreyfus/8153571
- def _yesno(question):
- """Simple Yes/No Function."""
- prompt = f"{question} (y/n): "
- ans = input(prompt).strip().lower()
- if ans not in ["y", "n"]:
- print(f"{ans} is invalid, please try again...")
- return _yesno(question)
- if ans == "y":
- return True
- return False


 def _denoise(X, y, Xresampled=None):
@@ -969,9 +242,9 @@ class CallableEquation:

 def __call__(self, X):
 if isinstance(X, pd.DataFrame):
- X = np.array(X[self._variable_names])
-
- if self._selection is not None:
 return self._lambda(*X[:, self._selection].T)
 return self._lambda(*X.T)

@@ -1053,3 +326,957 @@ julia = "1.5"

 project_toml_path = tmp_dir / "Project.toml"
 project_toml_path.write_text(project_toml)
 from datetime import datetime
 import warnings
 from multiprocessing import cpu_count
+ from sklearn.base import BaseEstimator, RegressorMixin

 is_julia_warning_silenced = False


 def install(julia_project=None): # pragma: no cover
+ """Install PyCall.jl and all required dependencies for SymbolicRegression.jl.
+
+ Also updates the local Julia registry."""
 import julia

 julia.install()

 Main = None

 already_ran = False

 }


+ def pysr(X, y, weights=None, **kwargs):
+ warnings.warn(
+ "Calling `pysr` is deprecated. Please use `model = PySRRegressor(**params); model.fit(X, y)` going forward.",
+ DeprecationWarning,
+ )
+ model = PySRRegressor(**kwargs)
+ model.fit(X, y, weights=weights)
+ return model.equations
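
The shim above forwards old-style calls to the new class, so existing scripts keep working while emitting a `DeprecationWarning`. A minimal sketch of the migration, with an illustrative dataset and operator list (not taken from this commit):

```python
import numpy as np
from pysr import PySRRegressor

X = np.random.randn(100, 5)
y = X[:, 0] ** 2 - X[:, 1]

# Before: equations = pysr(X, y, binary_operators=["+", "*"])
model = PySRRegressor(binary_operators=["+", "*"])
model.fit(X, y)              # runs the search
equations = model.equations  # the same hall-of-fame DataFrame the shim returns
```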


 def _handle_constraints(binary_operators, unary_operators, constraints):


 def _create_inline_operators(binary_operators, unary_operators):
+ global Main
 for op_list in [binary_operators, unary_operators]:
 for i, op in enumerate(op_list):
 is_user_defined_operator = "(" in op
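
For context on the `global Main` fix above: operator strings containing `(` are treated as inline Julia definitions and evaluated in the `Main` module. A hedged sketch of declaring one, mirroring the `sq` operator used in the test changes further down:

```python
from pysr import PySRRegressor

model = PySRRegressor(
    # Julia-side definition; parsed as user-defined because it contains "(":
    unary_operators=["sq(x) = x^2"],
    binary_operators=["plus"],
    # Python-side equivalent, needed for sympy/export support:
    extra_sympy_mappings={"sq": lambda x: x ** 2},
)
```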
 return selector.get_support(indices=True)


+ def _escape_filename(filename):
+ """Turns a file into a string representation with correctly escaped backslashes"""
+ str_repr = str(filename)
+ str_repr = str_repr.replace("\\", "\\\\")
+ return str_repr
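
The doubling above matters because the escaped string is later spliced into Julia source (for example the `Pkg.activate(f"{_escape_filename(self.julia_project)}")` call in `_run`), where a lone backslash in a Windows path would start an escape sequence. A quick illustration with a hypothetical path:

```python
path = "C:\\Users\\me\\hall_of_fame.csv"  # i.e. C:\Users\me\hall_of_fame.csv
escaped = path.replace("\\", "\\\\")      # what _escape_filename does internally
print(escaped)  # C:\\Users\\me\\hall_of_fame.csv -- read as literal backslashes by Julia
```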


+ def best(*args, **kwargs):
+ raise NotImplementedError(
+ "`best` has been deprecated. Please use the `PySRRegressor` interface. After fitting, you can return `.sympy()` to get the sympy representation of the best equation."
+ )


+ def best_row(*args, **kwargs):
+ raise NotImplementedError(
+ "`best_row` has been deprecated. Please use the `PySRRegressor` interface. After fitting, you can run `print(model)` to view the best equation."
+ )


+ def best_tex(*args, **kwargs):
+ raise NotImplementedError(
+ "`best_tex` has been deprecated. Please use the `PySRRegressor` interface. After fitting, you can return `.latex()` to get the sympy representation of the best equation."
+ )


+ def best_callable(*args, **kwargs):
+ raise NotImplementedError(
+ "`best_callable` has been deprecated. Please use the `PySRRegressor` interface. After fitting, you can use `.predict(X)` to use the best callable."
+ )
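
Taken together, the four stubs spell out where each old helper went. A short sketch of the correspondence, assuming `model` is an already-fitted `PySRRegressor` and `X` is held-out data:

```python
expr = model.sympy()       # was: best(equations)
tex = model.latex()        # was: best_tex(equations)
y_pred = model.predict(X)  # was: best_callable(equations)(X)
print(model)               # was: best_row(equations); ">>>>" marks the selected row
```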


 def _denoise(X, y, Xresampled=None):

 def __call__(self, X):
 if isinstance(X, pd.DataFrame):
+ # The lambda takes each DataFrame column as a keyword argument:
+ return self._lambda(**{k: X[k].values for k in X.columns})
+ elif self._selection is not None:
 return self._lambda(*X[:, self._selection].T)
 return self._lambda(*X.T)
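
The new DataFrame branch above switches from positional to by-name dispatch: each column is passed as a keyword argument to the lambdified equation, so column order no longer matters, but the names must match the training variables. A small sketch (column names illustrative, `eq` assumed to come from a fitted model):

```python
import numpy as np
import pandas as pd

X = pd.DataFrame({"x0": np.random.randn(10), "x1": np.random.randn(10)})
# eq = model.get_best()["lambda_format"]  # a CallableEquation
# y = eq(X)  # expands to eq._lambda(x0=X["x0"].values, x1=X["x1"].values)
```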


 project_toml_path = tmp_dir / "Project.toml"
 project_toml_path.write_text(project_toml)
+
+
+ class PySRRegressor(BaseEstimator, RegressorMixin):
+ def __init__(
+ self,
+ model_selection="best",
+ weights=None,
+ binary_operators=None,
+ unary_operators=None,
+ procs=cpu_count(),
+ loss="L2DistLoss()",
+ populations=20,
+ niterations=100,
+ ncyclesperiteration=300,
+ alpha=0.1,
+ annealing=False,
+ fractionReplaced=0.10,
+ fractionReplacedHof=0.10,
+ npop=1000,
+ parsimony=1e-4,
+ migration=True,
+ hofMigration=True,
+ shouldOptimizeConstants=True,
+ topn=10,
+ weightAddNode=1,
+ weightInsertNode=3,
+ weightDeleteNode=3,
+ weightDoNothing=1,
+ weightMutateConstant=10,
+ weightMutateOperator=1,
+ weightRandomize=1,
+ weightSimplify=0.002,
+ perturbationFactor=1.0,
+ extra_sympy_mappings=None,
+ extra_torch_mappings=None,
+ extra_jax_mappings=None,
+ equation_file=None,
+ verbosity=1e9,
+ progress=None,
+ maxsize=20,
+ fast_cycle=False,
+ maxdepth=None,
+ variable_names=None,
+ batching=False,
+ batchSize=50,
+ select_k_features=None,
+ warmupMaxsizeBy=0.0,
+ constraints=None,
+ useFrequency=True,
+ tempdir=None,
+ delete_tempfiles=True,
+ julia_project=None,
+ update=True,
+ temp_equation_file=False,
+ output_jax_format=False,
+ output_torch_format=False,
+ optimizer_algorithm="BFGS",
+ optimizer_nrestarts=3,
+ optimize_probability=1.0,
+ optimizer_iterations=10,
+ tournament_selection_n=10,
+ tournament_selection_p=1.0,
+ denoise=False,
+ Xresampled=None,
+ precision=32,
+ multithreading=None,
+ **kwargs,
+ ):
+ """Initialize settings for an equation search in PySR.
+
+ Note: most default parameters have been tuned over several example
+ equations, but you should adjust `niterations`,
+ `binary_operators`, `unary_operators` to your requirements.
+ You can view more detailed explanations of the options on the
+ [options page](https://pysr.readthedocs.io/en/latest/docs/options/) of the documentation.
+
+ :param model_selection: How to select a model. Can be 'accuracy' or 'best'. The default, 'best', will optimize a combination of complexity and accuracy.
+ :type model_selection: str
+ :param binary_operators: List of strings giving the binary operators in Julia's Base. Default is ["+", "-", "*", "/",].
+ :type binary_operators: list
+ :param unary_operators: Same but for operators taking a single scalar. Default is [].
+ :type unary_operators: list
+ :param niterations: Number of iterations of the algorithm to run. The best equations are printed, and migrate between populations, at the end of each.
+ :type niterations: int
+ :param populations: Number of populations running.
+ :type populations: int
+ :param loss: String of Julia code specifying the loss function. Can either be a loss from LossFunctions.jl, or your own loss written as a function. Examples of custom written losses include: `myloss(x, y) = abs(x-y)` for non-weighted, or `myloss(x, y, w) = w*abs(x-y)` for weighted. Among the included losses, these are as follows. Regression: `LPDistLoss{P}()`, `L1DistLoss()`, `L2DistLoss()` (mean square), `LogitDistLoss()`, `HuberLoss(d)`, `L1EpsilonInsLoss(ϵ)`, `L2EpsilonInsLoss(ϵ)`, `PeriodicLoss(c)`, `QuantileLoss(τ)`. Classification: `ZeroOneLoss()`, `PerceptronLoss()`, `L1HingeLoss()`, `SmoothedL1HingeLoss(γ)`, `ModifiedHuberLoss()`, `L2MarginLoss()`, `ExpLoss()`, `SigmoidLoss()`, `DWDMarginLoss(q)`.
+ :type loss: str
+ :param denoise: Whether to use a Gaussian Process to denoise the data before inputting to PySR. Can help PySR fit noisy data.
+ :type denoise: bool
+ :param select_k_features: whether to run feature selection in Python using random forests, before passing to the symbolic regression code. None means no feature selection; an int means select that many features.
+ :type select_k_features: None/int
+ :param procs: Number of processes (=number of populations running).
+ :type procs: int
+ :param multithreading: Use multithreading instead of distributed backend. Default is yes. Using procs=0 will turn off both.
+ :type multithreading: bool
+ :param batching: whether to compare population members on small batches during evolution. Still uses full dataset for comparing against hall of fame.
+ :type batching: bool
+ :param batchSize: the amount of data to use if doing batching.
+ :type batchSize: int
+ :param maxsize: Max size of an equation.
+ :type maxsize: int
+ :param ncyclesperiteration: Number of total mutations to run, per 10 samples of the population, per iteration.
+ :type ncyclesperiteration: int
+ :param alpha: Initial temperature.
+ :type alpha: float
+ :param annealing: Whether to use annealing.
+ :type annealing: bool
+ :param fractionReplaced: How much of population to replace with migrating equations from other populations.
+ :type fractionReplaced: float
+ :param fractionReplacedHof: How much of population to replace with migrating equations from hall of fame.
+ :type fractionReplacedHof: float
+ :param npop: Number of individuals in each population.
+ :type npop: int
+ :param parsimony: Multiplicative factor for how much to punish complexity.
+ :type parsimony: float
+ :param migration: Whether to migrate.
+ :type migration: bool
+ :param hofMigration: Whether to have the hall of fame migrate.
+ :type hofMigration: bool
+ :param shouldOptimizeConstants: Whether to numerically optimize constants (Nelder-Mead/Newton) at the end of each iteration.
+ :type shouldOptimizeConstants: bool
+ :param topn: How many top individuals migrate from each population.
+ :type topn: int
+ :param perturbationFactor: Constants are perturbed by a max factor of (perturbationFactor*T + 1). Either multiplied by this or divided by this.
+ :type perturbationFactor: float
+ :param weightAddNode: Relative likelihood for mutation to add a node.
+ :type weightAddNode: float
+ :param weightInsertNode: Relative likelihood for mutation to insert a node.
+ :type weightInsertNode: float
+ :param weightDeleteNode: Relative likelihood for mutation to delete a node.
+ :type weightDeleteNode: float
+ :param weightDoNothing: Relative likelihood for mutation to leave the individual.
+ :type weightDoNothing: float
+ :param weightMutateConstant: Relative likelihood for mutation to change the constant slightly in a random direction.
+ :type weightMutateConstant: float
+ :param weightMutateOperator: Relative likelihood for mutation to swap an operator.
+ :type weightMutateOperator: float
+ :param weightRandomize: Relative likelihood for mutation to completely delete and then randomly generate the equation.
+ :type weightRandomize: float
+ :param weightSimplify: Relative likelihood for mutation to simplify constant parts by evaluation.
+ :type weightSimplify: float
+ :param equation_file: Where to save the files (.csv separated by |).
+ :type equation_file: str
+ :param verbosity: What verbosity level to use. 0 means minimal print statements.
+ :type verbosity: int
+ :param progress: Whether to use a progress bar instead of printing to stdout.
+ :type progress: bool
+ :param maxdepth: Max depth of an equation. You can use both maxsize and maxdepth. maxdepth is by default set to maxsize, which means that it is redundant.
+ :type maxdepth: int
+ :param fast_cycle: (experimental) - batch over population subsamples. This is a slightly different algorithm than regularized evolution, but does cycles 15% faster. May be algorithmically less efficient.
+ :type fast_cycle: bool
+ :param variable_names: a list of names for the variables, other than "x0", "x1", etc.
+ :type variable_names: list
+ :param warmupMaxsizeBy: whether to slowly increase max size from a small number up to the maxsize (if greater than 0). If greater than 0, says the fraction of training time at which the current maxsize will reach the user-passed maxsize.
+ :type warmupMaxsizeBy: float
+ :param constraints: dictionary of int (unary) or 2-tuples (binary), this enforces maxsize constraints on the individual arguments of operators. E.g., `'pow': (-1, 1)` says that power laws can have any complexity left argument, but only 1 complexity exponent. Use this to force more interpretable solutions.
+ :type constraints: dict
+ :param useFrequency: whether to measure the frequency of complexities, and use that instead of parsimony to explore equation space. Will naturally find equations of all complexities.
+ :type useFrequency: bool
+ :param tempdir: directory for the temporary files.
+ :type tempdir: str/None
+ :param delete_tempfiles: whether to delete the temporary files after finishing.
+ :type delete_tempfiles: bool
+ :param julia_project: a Julia environment location containing a Project.toml (and potentially the source code for SymbolicRegression.jl). Default gives the Python package directory, where a Project.toml file should be present from the install.
+ :type julia_project: str/None
+ :param update: Whether to automatically update Julia packages.
+ :type update: bool
+ :param temp_equation_file: Whether to put the hall of fame file in the temp directory. Deletion is then controlled with the delete_tempfiles argument.
+ :type temp_equation_file: bool
+ :param output_jax_format: Whether to create a 'jax_format' column in the output, containing jax-callable functions and the default parameters in a jax array.
+ :type output_jax_format: bool
+ :param output_torch_format: Whether to create a 'torch_format' column in the output, containing a torch module with trainable parameters.
+ :type output_torch_format: bool
+ :param tournament_selection_n: Number of expressions to consider in each tournament.
+ :type tournament_selection_n: int
+ :param tournament_selection_p: Probability of selecting the best expression in each tournament. The probability will decay as p*(1-p)^n for other expressions, sorted by loss.
+ :type tournament_selection_p: float
+ :param precision: What precision to use for the data. By default this is 32 (float32), but you can select 64 or 16 as well.
+ :type precision: int
+ :param **kwargs: Other options passed to SymbolicRegression.Options, for example, if you modify SymbolicRegression.jl to include additional arguments.
+ :type **kwargs: dict
+ :returns: Results dataframe, giving complexity, MSE, and equations (as strings), as well as functional forms. If list, each element corresponds to a dataframe of equations for each output.
+ :type: pd.DataFrame/list
+ """
+ super().__init__()
+ self.model_selection = model_selection
+
+ if binary_operators is None:
+ binary_operators = "+ * - /".split(" ")
+ if unary_operators is None:
+ unary_operators = []
+ if extra_sympy_mappings is None:
+ extra_sympy_mappings = {}
+ if variable_names is None:
+ variable_names = []
+ if constraints is None:
+ constraints = {}
+ if multithreading is None:
+ # Default is multithreading=True, unless explicitly set,
+ # or procs is set to 0 (serial mode).
+ multithreading = procs != 0
+
+ buffer_available = "buffer" in sys.stdout.__dir__()
+
+ if progress is not None:
+ if progress and not buffer_available:
+ warnings.warn(
+ "Note: it looks like you are running in Jupyter. The progress bar will be turned off."
+ )
+ progress = False
+ else:
+ progress = buffer_available
+
+ assert optimizer_algorithm in ["NelderMead", "BFGS"]
+ assert tournament_selection_n < npop
+
+ if extra_jax_mappings is not None:
+ for value in extra_jax_mappings.values():
+ if not isinstance(value, str):
+ raise NotImplementedError(
+ "extra_jax_mappings must have values that are strings! e.g., {sympy.sqrt: 'jnp.sqrt'}."
+ )
+ else:
+ extra_jax_mappings = {}
+
+ if extra_torch_mappings is not None:
+ for value in extra_torch_mappings.values():
+ if not callable(value):
+ raise NotImplementedError(
+ "extra_torch_mappings must be callable functions! e.g., {sympy.sqrt: torch.sqrt}."
+ )
+ else:
+ extra_torch_mappings = {}
+
+ if maxsize > 40:
+ warnings.warn(
+ "Note: Using a large maxsize for the equation search will be exponentially slower and use significant memory. You should consider turning `useFrequency` to False, and perhaps use `warmupMaxsizeBy`."
+ )
+ elif maxsize < 7:
+ raise NotImplementedError("PySR requires a maxsize of at least 7")
+
+ if maxdepth is None:
+ maxdepth = maxsize
+
+ if isinstance(binary_operators, str):
+ binary_operators = [binary_operators]
+ if isinstance(unary_operators, str):
+ unary_operators = [unary_operators]
+
+ self.params = {
+ **dict(
+ weights=weights,
+ binary_operators=binary_operators,
+ unary_operators=unary_operators,
+ procs=procs,
+ loss=loss,
+ populations=populations,
+ niterations=niterations,
+ ncyclesperiteration=ncyclesperiteration,
+ alpha=alpha,
+ annealing=annealing,
+ fractionReplaced=fractionReplaced,
+ fractionReplacedHof=fractionReplacedHof,
+ npop=npop,
+ parsimony=float(parsimony),
+ migration=migration,
+ hofMigration=hofMigration,
+ shouldOptimizeConstants=shouldOptimizeConstants,
+ topn=topn,
+ weightAddNode=weightAddNode,
+ weightInsertNode=weightInsertNode,
+ weightDeleteNode=weightDeleteNode,
+ weightDoNothing=weightDoNothing,
+ weightMutateConstant=weightMutateConstant,
+ weightMutateOperator=weightMutateOperator,
+ weightRandomize=weightRandomize,
+ weightSimplify=weightSimplify,
+ perturbationFactor=perturbationFactor,
+ verbosity=verbosity,
+ progress=progress,
+ maxsize=maxsize,
+ fast_cycle=fast_cycle,
+ maxdepth=maxdepth,
+ batching=batching,
+ batchSize=batchSize,
+ select_k_features=select_k_features,
+ warmupMaxsizeBy=warmupMaxsizeBy,
+ constraints=constraints,
+ useFrequency=useFrequency,
+ tempdir=tempdir,
+ delete_tempfiles=delete_tempfiles,
+ update=update,
+ temp_equation_file=temp_equation_file,
+ optimizer_algorithm=optimizer_algorithm,
+ optimizer_nrestarts=optimizer_nrestarts,
+ optimize_probability=optimize_probability,
+ optimizer_iterations=optimizer_iterations,
+ tournament_selection_n=tournament_selection_n,
+ tournament_selection_p=tournament_selection_p,
+ denoise=denoise,
+ Xresampled=Xresampled,
+ precision=precision,
+ multithreading=multithreading,
+ ),
+ **kwargs,
+ }
+
+ # Stored equations:
+ self.equations = None
+
+ self.multioutput = None
+ self.raw_julia_output = None
+ self.equation_file = equation_file
+ self.n_features = None
+ self.extra_sympy_mappings = extra_sympy_mappings
+ self.extra_torch_mappings = extra_torch_mappings
+ self.extra_jax_mappings = extra_jax_mappings
+ self.output_jax_format = output_jax_format
+ self.output_torch_format = output_torch_format
+ self.nout = 1
+ self.selection = None
+ self.variable_names = variable_names
+ self.julia_project = julia_project
+
+ self.surface_parameters = [
+ "model_selection",
+ "multioutput",
+ "raw_julia_output",
+ "equation_file",
+ "n_features",
+ "extra_sympy_mappings",
+ "extra_torch_mappings",
+ "extra_jax_mappings",
+ "output_jax_format",
+ "output_torch_format",
+ "nout",
+ "selection",
+ "variable_names",
+ "julia_project",
+ ]
+
+ def __repr__(self):
+ """Prints all current equations fitted by the model.
+
+ The string `>>>>` denotes which equation is selected by the
+ `model_selection`.
+ """
+ if self.equations is None:
+ return "PySRRegressor.equations = None"
+
+ output = "PySRRegressor.equations = [\n"
+
+ equations = self.equations
+ if not isinstance(equations, list):
+ all_equations = [equations]
+ else:
+ all_equations = equations
+
+ for i, equations in enumerate(all_equations):
+ selected = ["" for _ in range(len(equations))]
+ if self.model_selection == "accuracy":
+ chosen_row = -1
+ elif self.model_selection == "best":
+ chosen_row = equations["score"].idxmax()
+ else:
+ raise NotImplementedError
+ selected[chosen_row] = ">>>>"
+ repr_equations = pd.DataFrame(
+ dict(
+ pick=selected,
+ score=equations["score"],
+ equation=equations["equation"],
+ loss=equations["loss"],
+ complexity=equations["complexity"],
+ )
+ )
+
+ if len(all_equations) > 1:
+ output += "[\n"
+
+ for line in repr_equations.__repr__().split("\n"):
+ output += "\t" + line + "\n"
+
+ if len(all_equations) > 1:
+ output += "]"
+
+ if i < len(all_equations) - 1:
+ output += ", "
+
+ output += "]"
+ return output
+
+ def set_params(self, **params):
+ """Set parameters for equation search."""
+ for key, value in params.items():
+ if key in self.surface_parameters:
+ self.__setattr__(key, value)
+ else:
+ self.params[key] = value
+
+ self.refresh()
+ return self
+
+ def get_params(self, deep=True):
+ """Get parameters for equation search."""
+ del deep
+ return {
+ **self.params,
+ **{key: self.__getattribute__(key) for key in self.surface_parameters},
+ }
+
+ def get_best(self):
+ """Get best equation using `model_selection`."""
+ if self.equations is None:
+ raise ValueError("No equations have been generated yet.")
+ if self.model_selection == "accuracy":
+ if isinstance(self.equations, list):
+ return [eq.iloc[-1] for eq in self.equations]
+ return self.equations.iloc[-1]
+ elif self.model_selection == "best":
+ if isinstance(self.equations, list):
+ return [eq.iloc[eq["score"].idxmax()] for eq in self.equations]
+ return self.equations.iloc[self.equations["score"].idxmax()]
+ else:
+ raise NotImplementedError(
+ f"{self.model_selection} is not a valid model selection strategy."
+ )
+
+ def fit(self, X, y, weights=None, variable_names=None):
+ """Search for equations to fit the dataset and store them in `self.equations`.
+
+ :param X: 2D array. Rows are examples, columns are features. If pandas DataFrame, the columns are used for variable names (so make sure they don't contain spaces).
+ :type X: np.ndarray/pandas.DataFrame
+ :param y: 1D array (rows are examples) or 2D array (rows are examples, columns are outputs). Putting in a 2D array will trigger a search for equations for each feature of y.
+ :type y: np.ndarray
+ :param weights: Optional. Same shape as y. Each element is how to weight the mean-square-error loss for that particular element of y.
+ :type weights: np.ndarray
+ :param variable_names: a list of names for the variables, other than "x0", "x1", etc.
+ You can also pass a pandas DataFrame for X.
+ :type variable_names: list
+ """
+ if variable_names is None:
+ variable_names = self.variable_names
+
+ self._run(
+ X=X,
+ y=y,
+ weights=weights,
+ variable_names=variable_names,
+ )
+
+ return self
+
+ def refresh(self):
+ # Updates self.equations with any new options passed,
+ # such as extra_sympy_mappings.
+ self.equations = self.get_hof()
+
+ def predict(self, X):
+ """Predict y from input X using the equation chosen by `model_selection`.
+
+ You may see what equation is used by printing this object. X should have the same
+ columns as the training data.
+
+ :param X: 2D array. Rows are examples, columns are features. If pandas DataFrame, the columns are used for variable names (so make sure they don't contain spaces).
+ :type X: np.ndarray/pandas.DataFrame
+ :return: 1D array (rows are examples) or 2D array (rows are examples, columns are outputs).
+ """
+ self.refresh()
+ best = self.get_best()
+ if self.multioutput:
+ return np.stack([eq["lambda_format"](X) for eq in best], axis=1)
+ return best["lambda_format"](X)
+
+ def sympy(self):
+ """Return sympy representation of the equation(s) chosen by `model_selection`."""
+ self.refresh()
+ best = self.get_best()
+ if self.multioutput:
+ return [eq["sympy_format"] for eq in best]
+ return best["sympy_format"]
+
+ def latex(self):
+ """Return latex representation of the equation(s) chosen by `model_selection`."""
+ self.refresh()
+ sympy_representation = self.sympy()
+ if self.multioutput:
+ return [sympy.latex(s) for s in sympy_representation]
+ return sympy.latex(sympy_representation)
+
+ def jax(self):
+ """Return jax representation of the equation(s) chosen by `model_selection`.
+
+ Each equation (multiple given if there are multiple outputs) is a dictionary
+ containing {"callable": func, "parameters": params}. To call `func`, pass
+ func(X, params). This function is differentiable using `jax.grad`.
+ """
+ if self.using_pandas:
+ warnings.warn(
+ "PySR's JAX modules are not set up to work with a "
+ "model that was trained on pandas dataframes. "
+ "Train on an array instead to ensure everything works as planned."
+ )
+ self.set_params(output_jax_format=True)
+ self.refresh()
+ best = self.get_best()
+ if self.multioutput:
+ return [eq["jax_format"] for eq in best]
+ return best["jax_format"]
+
+ def pytorch(self):
+ """Return pytorch representation of the equation(s) chosen by `model_selection`.
+
+ Each equation (multiple given if there are multiple outputs) is a PyTorch module
+ containing the parameters as trainable attributes. You can use the module like
+ any other PyTorch module: `module(X)`, where `X` is a tensor with the same
+ column ordering as trained with.
+ """
+ if self.using_pandas:
+ warnings.warn(
+ "PySR's PyTorch modules are not set up to work with a "
+ "model that was trained on pandas dataframes. "
+ "Train on an array instead to ensure everything works as planned."
+ )
+ self.set_params(output_torch_format=True)
+ self.refresh()
+ best = self.get_best()
+ if self.multioutput:
+ return [eq["torch_format"] for eq in best]
+ return best["torch_format"]
+
+ def _run(self, X, y, weights, variable_names):
+ global already_ran
+ global Main
+
+ for key in self.surface_parameters:
+ if key in self.params:
+ raise ValueError(
+ f"{key} is a surface parameter, and cannot be in self.params"
+ )
+
+ multithreading = self.params["multithreading"]
+ procs = self.params["procs"]
+ binary_operators = self.params["binary_operators"]
+ unary_operators = self.params["unary_operators"]
+ batching = self.params["batching"]
+ maxsize = self.params["maxsize"]
+ select_k_features = self.params["select_k_features"]
+ Xresampled = self.params["Xresampled"]
+ denoise = self.params["denoise"]
+ constraints = self.params["constraints"]
+ update = self.params["update"]
+ loss = self.params["loss"]
+ weightMutateConstant = self.params["weightMutateConstant"]
+ weightMutateOperator = self.params["weightMutateOperator"]
+ weightAddNode = self.params["weightAddNode"]
+ weightInsertNode = self.params["weightInsertNode"]
+ weightDeleteNode = self.params["weightDeleteNode"]
+ weightSimplify = self.params["weightSimplify"]
+ weightRandomize = self.params["weightRandomize"]
+ weightDoNothing = self.params["weightDoNothing"]
+
+ if Main is None:
+ if multithreading:
+ os.environ["JULIA_NUM_THREADS"] = str(procs)
+
+ Main = init_julia()
+
+ if isinstance(X, pd.DataFrame):
+ if variable_names is not None:
+ warnings.warn("Resetting variable_names from X.columns")
+
+ variable_names = list(X.columns)
+ X = np.array(X)
+ self.using_pandas = True
+ else:
+ self.using_pandas = False
+
+ if len(X.shape) == 1:
+ X = X[:, None]
+
+ assert not isinstance(y, pd.DataFrame)
+
+ if len(variable_names) == 0:
+ variable_names = [f"x{i}" for i in range(X.shape[1])]
+
+ use_custom_variable_names = len(variable_names) != 0
+ # TODO: this is always true.
+
+ _check_assertions(
+ X,
+ binary_operators,
+ unary_operators,
+ use_custom_variable_names,
+ variable_names,
+ weights,
+ y,
+ )
+
+ self.n_features = X.shape[1]
+
+ if len(X) > 10000 and not batching:
+ warnings.warn(
+ "Note: you are running with more than 10,000 datapoints. You should consider turning on batching (https://pysr.readthedocs.io/en/latest/docs/options/#batching). You should also reconsider if you need that many datapoints. Unless you have a large amount of noise (in which case you should smooth your dataset first), generally < 10,000 datapoints is enough to find a functional form with symbolic regression. More datapoints will lower the search speed."
+ )
+
+ X, selection = _handle_feature_selection(
+ X, select_k_features, y, variable_names
+ )
+
+ if len(y.shape) == 1 or (len(y.shape) == 2 and y.shape[1] == 1):
+ self.multioutput = False
+ self.nout = 1
+ y = y.reshape(-1)
+ elif len(y.shape) == 2:
+ self.multioutput = True
+ self.nout = y.shape[1]
+ else:
+ raise NotImplementedError("y shape not supported!")
+
+ if denoise:
+ if weights is not None:
+ raise NotImplementedError(
+ "No weights for denoising - the weights are learned."
+ )
+ if Xresampled is not None:
+ # Select among only the selected features:
+ if isinstance(Xresampled, pd.DataFrame):
+ # Handle the case where Xresampled is a pandas DataFrame:
+ if selection is not None:
+ Xresampled = Xresampled[[variable_names[i] for i in selection]]
+ else:
+ Xresampled = Xresampled[variable_names]
+ Xresampled = np.array(Xresampled)
+ else:
+ if selection is not None:
+ Xresampled = Xresampled[:, selection]
+ if self.multioutput:
+ y = np.stack(
+ [
+ _denoise(X, y[:, i], Xresampled=Xresampled)[1]
+ for i in range(self.nout)
+ ],
+ axis=1,
+ )
+ if Xresampled is not None:
+ X = Xresampled
+ else:
+ X, y = _denoise(X, y, Xresampled=Xresampled)
+
+ self.julia_project = _get_julia_project(self.julia_project)
+
+ tmpdir = Path(tempfile.mkdtemp(dir=self.params["tempdir"]))
+
+ if self.params["temp_equation_file"]:
+ self.equation_file = tmpdir / "hall_of_fame.csv"
+ elif self.equation_file is None:
+ date_time = datetime.now().strftime("%Y-%m-%d_%H%M%S.%f")[:-3]
+ self.equation_file = "hall_of_fame_" + date_time + ".csv"
+
+ _create_inline_operators(
+ binary_operators=binary_operators, unary_operators=unary_operators
+ )
+ _handle_constraints(
+ binary_operators=binary_operators,
+ unary_operators=unary_operators,
+ constraints=constraints,
+ )
+
+ una_constraints = [constraints[op] for op in unary_operators]
+ bin_constraints = [constraints[op] for op in binary_operators]
+
+ try:
+ # TODO: is this needed since Julia now prints directly to stdout?
+ term_width = shutil.get_terminal_size().columns
+ except:
+ _, term_width = subprocess.check_output(["stty", "size"]).split()
+
+ if not already_ran:
+ from julia import Pkg
+
+ Pkg.activate(f"{_escape_filename(self.julia_project)}")
+ try:
+ if update:
+ Pkg.resolve()
+ Pkg.instantiate()
+ else:
+ Pkg.instantiate()
+ except RuntimeError as e:
+ raise ImportError(
+ f"""
+ Required dependencies are not installed or built. Run the following code in the Python REPL:
+
+ >>> import pysr
+ >>> pysr.install()
+
+ Tried to activate project {self.julia_project} but failed."""
+ ) from e
+ Main.eval("using SymbolicRegression")
+
+ Main.plus = Main.eval("(+)")
+ Main.sub = Main.eval("(-)")
+ Main.mult = Main.eval("(*)")
+ Main.pow = Main.eval("(^)")
+ Main.div = Main.eval("(/)")
+
+ Main.custom_loss = Main.eval(loss)
+
+ mutationWeights = [
+ float(weightMutateConstant),
+ float(weightMutateOperator),
+ float(weightAddNode),
+ float(weightInsertNode),
+ float(weightDeleteNode),
+ float(weightSimplify),
+ float(weightRandomize),
+ float(weightDoNothing),
+ ]
+
+ options = Main.Options(
+ binary_operators=Main.eval(str(tuple(binary_operators)).replace("'", "")),
+ unary_operators=Main.eval(str(tuple(unary_operators)).replace("'", "")),
+ bin_constraints=bin_constraints,
+ una_constraints=una_constraints,
+ loss=Main.custom_loss,
+ maxsize=int(maxsize),
+ hofFile=_escape_filename(self.equation_file),
+ npopulations=int(self.params["populations"]),
+ batching=batching,
+ batchSize=int(
+ min([self.params["batchSize"], len(X)]) if batching else len(X)
+ ),
+ mutationWeights=mutationWeights,
+ terminal_width=int(term_width),
+ probPickFirst=self.params["tournament_selection_p"],
+ ns=self.params["tournament_selection_n"],
+ # These have the same name:
+ parsimony=self.params["parsimony"],
+ alpha=self.params["alpha"],
+ maxdepth=self.params["maxdepth"],
+ fast_cycle=self.params["fast_cycle"],
+ migration=self.params["migration"],
+ hofMigration=self.params["hofMigration"],
+ fractionReplacedHof=self.params["fractionReplacedHof"],
+ shouldOptimizeConstants=self.params["shouldOptimizeConstants"],
+ warmupMaxsizeBy=self.params["warmupMaxsizeBy"],
+ useFrequency=self.params["useFrequency"],
+ npop=self.params["npop"],
+ ncyclesperiteration=self.params["ncyclesperiteration"],
+ fractionReplaced=self.params["fractionReplaced"],
+ topn=self.params["topn"],
+ verbosity=self.params["verbosity"],
+ optimizer_algorithm=self.params["optimizer_algorithm"],
+ optimizer_nrestarts=self.params["optimizer_nrestarts"],
+ optimize_probability=self.params["optimize_probability"],
+ optimizer_iterations=self.params["optimizer_iterations"],
+ perturbationFactor=self.params["perturbationFactor"],
+ annealing=self.params["annealing"],
+ )
+
+ np_dtype = {16: np.float16, 32: np.float32, 64: np.float64}[
+ self.params["precision"]
+ ]
+
+ Main.X = np.array(X, dtype=np_dtype).T
+ if len(y.shape) == 1:
+ Main.y = np.array(y, dtype=np_dtype)
+ else:
+ Main.y = np.array(y, dtype=np_dtype).T
+ if weights is not None:
+ if len(weights.shape) == 1:
+ Main.weights = np.array(weights, dtype=np_dtype)
+ else:
+ Main.weights = np.array(weights, dtype=np_dtype).T
+ else:
+ Main.weights = None
+
+ cprocs = 0 if multithreading else procs
+
+ self.raw_julia_output = Main.EquationSearch(
+ Main.X,
+ Main.y,
+ weights=Main.weights,
+ niterations=int(self.params["niterations"]),
+ varMap=(
+ variable_names
+ if selection is None
+ else [variable_names[i] for i in selection]
+ ),
+ options=options,
+ numprocs=int(cprocs),
+ multithreading=bool(multithreading),
+ )
+
+ self.variable_names = variable_names
+ self.selection = selection
+
+ # Not in params:
+ # selection, variable_names, multioutput
+
+ self.equations = self.get_hof()
+
+ if self.params["delete_tempfiles"]:
+ shutil.rmtree(tmpdir)
+
+ already_ran = True
+
+ def get_hof(self):
+ """Get the equations from a hall of fame file."""
+
+ try:
+ if self.multioutput:
+ all_outputs = []
+ for i in range(1, self.nout + 1):
+ df = pd.read_csv(
+ str(self.equation_file) + f".out{i}" + ".bkup",
+ sep="|",
+ )
+ # Rename Complexity column to complexity:
+ df.rename(
+ columns={
+ "Complexity": "complexity",
+ "MSE": "loss",
+ "Equation": "equation",
+ },
+ inplace=True,
+ )
+
+ all_outputs.append(df)
+ else:
+ all_outputs = [pd.read_csv(str(self.equation_file) + ".bkup", sep="|")]
+ all_outputs[-1].rename(
+ columns={
+ "Complexity": "complexity",
+ "MSE": "loss",
+ "Equation": "equation",
+ },
+ inplace=True,
+ )
+ except FileNotFoundError:
+ raise RuntimeError(
+ "Couldn't find equation file! The equation search likely exited before a single iteration completed."
+ )
+
+ ret_outputs = []
+
+ for output in all_outputs:
+
+ scores = []
+ lastMSE = None
+ lastComplexity = 0
+ sympy_format = []
+ lambda_format = []
+ if self.output_jax_format:
+ jax_format = []
+ if self.output_torch_format:
+ torch_format = []
+ use_custom_variable_names = len(self.variable_names) != 0
+ local_sympy_mappings = {
+ **self.extra_sympy_mappings,
+ **sympy_mappings,
+ }
+
+ if use_custom_variable_names:
+ sympy_symbols = [
+ sympy.Symbol(self.variable_names[i]) for i in range(self.n_features)
+ ]
+ else:
+ sympy_symbols = [
+ sympy.Symbol("x%d" % i) for i in range(self.n_features)
+ ]
+
+ for _, eqn_row in output.iterrows():
+ eqn = sympify(eqn_row["equation"], locals=local_sympy_mappings)
+ sympy_format.append(eqn)
+
+ # Numpy:
+ lambda_format.append(
+ CallableEquation(
+ sympy_symbols, eqn, self.selection, self.variable_names
+ )
+ )
+
+ # JAX:
+ if self.output_jax_format:
+ from .export_jax import sympy2jax
+
+ func, params = sympy2jax(
+ eqn,
+ sympy_symbols,
+ selection=self.selection,
+ extra_jax_mappings=self.extra_jax_mappings,
+ )
+ jax_format.append({"callable": func, "parameters": params})
+
+ # Torch:
+ if self.output_torch_format:
+ from .export_torch import sympy2torch
+
+ module = sympy2torch(
+ eqn,
+ sympy_symbols,
+ selection=self.selection,
+ extra_torch_mappings=self.extra_torch_mappings,
+ )
+ torch_format.append(module)
+
+ curMSE = eqn_row["loss"]
+ curComplexity = eqn_row["complexity"]
+
+ if lastMSE is None:
+ cur_score = 0.0
+ else:
+ if curMSE > 0.0:
+ cur_score = -np.log(curMSE / lastMSE) / (
+ curComplexity - lastComplexity
+ )
+ else:
+ cur_score = np.inf
+
+ scores.append(cur_score)
+ lastMSE = curMSE
+ lastComplexity = curComplexity
+
+ output["score"] = np.array(scores)
+ output["sympy_format"] = sympy_format
+ output["lambda_format"] = lambda_format
+ output_cols = [
+ "complexity",
+ "loss",
+ "score",
+ "equation",
+ "sympy_format",
+ "lambda_format",
+ ]
+ if self.output_jax_format:
+ output_cols += ["jax_format"]
+ output["jax_format"] = jax_format
+ if self.output_torch_format:
+ output_cols += ["torch_format"]
+ output["torch_format"] = torch_format
+
+ ret_outputs.append(output[output_cols])
+
+ if self.multioutput:
+ return ret_outputs
+ return ret_outputs[0]
+
+ def score(self, X, y):
+ del X
+ del y
+ raise NotImplementedError
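
With the class above as the new public entry point, the end-to-end workflow reads like any other scikit-learn estimator. A compact sketch of the usage it enables (the data and operator choices here are illustrative only):

```python
import numpy as np
from pysr import PySRRegressor

X = np.random.randn(200, 3)
y = np.sin(X[:, 0]) + X[:, 1] ** 2  # toy target

model = PySRRegressor(
    niterations=5,
    binary_operators=["+", "*"],
    unary_operators=["sin"],
)
model.fit(X, y)            # runs EquationSearch and fills model.equations

print(model)               # equation table; ">>>>" marks the model_selection pick
y_pred = model.predict(X)  # evaluate the selected equation
expr = model.sympy()       # sympy form of the selected equation
tex = model.latex()        # latex form of the selected equation
```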
setup.py CHANGED
@@ -1,7 +1,10 @@
 import setuptools

- with open("README.md", "r") as fh:
- long_description = fh.read()

 setuptools.setup(
 name="pysr",
@@ -12,7 +15,7 @@ setuptools.setup(
 long_description=long_description,
 long_description_content_type="text/markdown",
 url="https://github.com/MilesCranmer/pysr",
- install_requires=["julia", "numpy", "pandas", "sympy"],
 packages=setuptools.find_packages(),
 package_data={"pysr": ["../Project.toml", "../datasets/*"]},
 include_package_data=False,
 import setuptools

+ try:
+ with open("README.md", "r") as fh:
+ long_description = fh.read()
+ except FileNotFoundError:
+ long_description = ""

 setuptools.setup(
 name="pysr",

 long_description=long_description,
 long_description_content_type="text/markdown",
 url="https://github.com/MilesCranmer/pysr",
+ install_requires=["julia", "numpy", "pandas", "sympy", "scikit-learn"],
 packages=setuptools.find_packages(),
 package_data={"pysr": ["../Project.toml", "../datasets/*"]},
 include_package_data=False,
test/test.py CHANGED
@@ -1,8 +1,8 @@
 import unittest
 from unittest.mock import patch
 import numpy as np
- from pysr import pysr, get_hof, best, best_tex, best_callable, best_row
- from pysr.sr import run_feature_selection, _handle_feature_selection, _yesno
 import sympy
 from sympy import lambdify
 import pandas as pd
@@ -21,32 +21,33 @@ class TestPipeline(unittest.TestCase):

 def test_linear_relation(self):
 y = self.X[:, 0]
- equations = pysr(self.X, y, **self.default_test_kwargs)
- print(equations)
- self.assertLessEqual(equations.iloc[-1]["MSE"], 1e-4)

 def test_multiprocessing(self):
 y = self.X[:, 0]
- equations = pysr(
- self.X, y, **self.default_test_kwargs, procs=2, multithreading=False
- )
- print(equations)
- self.assertLessEqual(equations.iloc[-1]["MSE"], 1e-4)

 def test_multioutput_custom_operator(self):
 y = self.X[:, [0, 1]] ** 2
- equations = pysr(
- self.X,
- y,
 unary_operators=["sq(x) = x^2"],
- binary_operators=["plus"],
 extra_sympy_mappings={"sq": lambda x: x ** 2},
 **self.default_test_kwargs,
 procs=0,
 )
 print(equations)
- self.assertLessEqual(equations[0].iloc[-1]["MSE"], 1e-4)
- self.assertLessEqual(equations[1].iloc[-1]["MSE"], 1e-4)

 def test_multioutput_weighted_with_callable_temp_equation(self):
 y = self.X[:, [0, 1]] ** 2
@@ -58,10 +59,7 @@ class TestPipeline(unittest.TestCase):
 y = (2 - w) * y
 # Thus, pysr needs to use the weights to find the right equation!

- pysr(
- self.X,
- y,
- weights=w,
 unary_operators=["sq(x) = x^2"],
 binary_operators=["plus"],
 extra_sympy_mappings={"sq": lambda x: x ** 2},
@@ -70,34 +68,46 @@ class TestPipeline(unittest.TestCase):
 temp_equation_file=True,
 delete_tempfiles=False,
 )

 np.testing.assert_almost_equal(
- best_callable()[0](self.X), self.X[:, 0] ** 2, decimal=4
 )
 np.testing.assert_almost_equal(
- best_callable()[1](self.X), self.X[:, 1] ** 2, decimal=4
 )

- def test_empty_operators_single_input(self):
 X = np.random.randn(100, 1)
 y = X[:, 0] + 3.0
- equations = pysr(
- X,
- y,
 unary_operators=[],
 binary_operators=["plus"],
 **self.default_test_kwargs,
 )

- self.assertLessEqual(equations.iloc[-1]["MSE"], 1e-4)

 def test_noisy(self):

 np.random.seed(1)
 y = self.X[:, [0, 1]] ** 2 + np.random.randn(self.X.shape[0], 1) * 0.05
- equations = pysr(
- self.X,
- y,
 # Test that passing a single operator works:
 unary_operators="sq(x) = x^2",
 binary_operators="plus",
@@ -106,8 +116,9 @@ class TestPipeline(unittest.TestCase):
 procs=0,
 denoise=True,
 )
- self.assertLessEqual(best_row(equations=equations)[0]["MSE"], 1e-2)
- self.assertLessEqual(best_row(equations=equations)[1]["MSE"], 1e-2)

 def test_pandas_resample(self):
 np.random.seed(1)
@@ -130,9 +141,7 @@ class TestPipeline(unittest.TestCase):
 "T": np.random.randn(100),
 }
 )
- equations = pysr(
- X,
- y,
 unary_operators=[],
 binary_operators=["+", "*", "/", "-"],
 **self.default_test_kwargs,
@@ -140,11 +149,12 @@ class TestPipeline(unittest.TestCase):
 denoise=True,
 select_k_features=2,
 )
- self.assertNotIn("unused_feature", best_tex())
- self.assertIn("T", best_tex())
- self.assertIn("x", best_tex())
- self.assertLessEqual(equations.iloc[-1]["MSE"], 1e-2)
- fn = best_callable()
 self.assertListEqual(list(sorted(fn._selection)), [0, 1])
 X2 = pd.DataFrame(
@@ -154,44 +164,45 @@ class TestPipeline(unittest.TestCase):
 }
 )
 self.assertLess(np.average((fn(X2) - true_fn(X2)) ** 2), 1e-2)


 class TestBest(unittest.TestCase):
 def setUp(self):
 equations = pd.DataFrame(
 {
- "Equation": ["1.0", "cos(x0)", "square(cos(x0))"],
- "MSE": [1.0, 0.1, 1e-5],
- "Complexity": [1, 2, 3],
 }
 )

- equations["Complexity MSE Equation".split(" ")].to_csv(
 "equation_file.csv.bkup", sep="|"
 )

- self.equations = get_hof(
- "equation_file.csv",
- n_features=2,
 variables_names="x0 x1".split(" "),
 extra_sympy_mappings={},
 output_jax_format=False,
 multioutput=False,
 nout=1,
 )

 def test_best(self):
- self.assertEqual(best(self.equations), sympy.cos(sympy.Symbol("x0")) ** 2)
- self.assertEqual(best(), sympy.cos(sympy.Symbol("x0")) ** 2)

 def test_best_tex(self):
- self.assertEqual(best_tex(self.equations), "\\cos^{2}{\\left(x_{0} \\right)}")
- self.assertEqual(best_tex(), "\\cos^{2}{\\left(x_{0} \\right)}")

 def test_best_lambda(self):
 X = np.random.randn(10, 2)
 y = np.cos(X[:, 0]) ** 2
- for f in [best_callable(), best_callable(self.equations)]:
 np.testing.assert_almost_equal(f(X), y, decimal=4)

@@ -221,11 +232,3 @@ class TestFeatureSelection(unittest.TestCase):
 np.testing.assert_array_equal(
 np.sort(selected_X, axis=1), np.sort(X[:, [2, 3]], axis=1)
 )
-
-
- class TestHelperFunctions(unittest.TestCase):
- @patch("builtins.input", side_effect=["y", "n"])
- def test_yesno(self, mock_input):
- # Assert that the yes/no function correctly deals with y/n
- self.assertEqual(_yesno("Test"), True)
- self.assertEqual(_yesno("Test"), False)

 
 import unittest
 from unittest.mock import patch
 import numpy as np
+from pysr import PySRRegressor
+from pysr.sr import run_feature_selection, _handle_feature_selection
 import sympy
 from sympy import lambdify
 import pandas as pd
 
     def test_linear_relation(self):
         y = self.X[:, 0]
+        model = PySRRegressor(**self.default_test_kwargs)
+        model.fit(self.X, y)
+        model.set_params(model_selection="accuracy")
+        print(model.equations)
+        self.assertLessEqual(model.get_best()["loss"], 1e-4)
 
     def test_multiprocessing(self):
         y = self.X[:, 0]
+        model = PySRRegressor(**self.default_test_kwargs, procs=2, multithreading=False)
+        model.fit(self.X, y)
+        print(model.equations)
+        self.assertLessEqual(model.equations.iloc[-1]["loss"], 1e-4)
 
     def test_multioutput_custom_operator(self):
         y = self.X[:, [0, 1]] ** 2
+        model = PySRRegressor(
             unary_operators=["sq(x) = x^2"],
             extra_sympy_mappings={"sq": lambda x: x ** 2},
+            binary_operators=["plus"],
             **self.default_test_kwargs,
             procs=0,
         )
+        model.fit(self.X, y)
+        equations = model.equations
         print(equations)
+        self.assertLessEqual(equations[0].iloc[-1]["loss"], 1e-4)
+        self.assertLessEqual(equations[1].iloc[-1]["loss"], 1e-4)
 
     def test_multioutput_weighted_with_callable_temp_equation(self):
         y = self.X[:, [0, 1]] ** 2
 
         y = (2 - w) * y
         # Thus, pysr needs to use the weights to find the right equation!
 
+        model = PySRRegressor(
             unary_operators=["sq(x) = x^2"],
             binary_operators=["plus"],
             extra_sympy_mappings={"sq": lambda x: x ** 2},
 
             temp_equation_file=True,
             delete_tempfiles=False,
         )
+        model.fit(self.X, y, weights=w)
 
         np.testing.assert_almost_equal(
+            model.predict(self.X)[:, 0], self.X[:, 0] ** 2, decimal=4
         )
         np.testing.assert_almost_equal(
+            model.predict(self.X)[:, 1], self.X[:, 1] ** 2, decimal=4
         )
 
+    def test_empty_operators_single_input_sklearn(self):
         X = np.random.randn(100, 1)
         y = X[:, 0] + 3.0
+        regressor = PySRRegressor(
+            model_selection="accuracy",
             unary_operators=[],
             binary_operators=["plus"],
             **self.default_test_kwargs,
         )
+        self.assertTrue("None" in regressor.__repr__())
+        regressor.fit(X, y)
+        self.assertTrue("None" not in regressor.__repr__())
+        self.assertTrue(">>>>" in regressor.__repr__())
+
+        self.assertLessEqual(regressor.equations.iloc[-1]["loss"], 1e-4)
+        np.testing.assert_almost_equal(regressor.predict(X), y, decimal=1)
+
+        # Tweak model selection:
+        regressor.set_params(model_selection="best")
+        self.assertEqual(regressor.get_params()["model_selection"], "best")
+        self.assertTrue("None" not in regressor.__repr__())
+        self.assertTrue(">>>>" in regressor.__repr__())
+
+        # "best" model_selection should also give a decent loss:
+        np.testing.assert_almost_equal(regressor.predict(X), y, decimal=1)
 
     def test_noisy(self):
         np.random.seed(1)
         y = self.X[:, [0, 1]] ** 2 + np.random.randn(self.X.shape[0], 1) * 0.05
+        model = PySRRegressor(
             # Test that passing a single operator works:
             unary_operators="sq(x) = x^2",
             binary_operators="plus",
 
             procs=0,
             denoise=True,
         )
+        model.fit(self.X, y)
+        self.assertLessEqual(model.get_best()[0]["loss"], 1e-2)
+        self.assertLessEqual(model.get_best()[1]["loss"], 1e-2)
 
     def test_pandas_resample(self):
         np.random.seed(1)
 
                 "T": np.random.randn(100),
             }
         )
+        model = PySRRegressor(
             unary_operators=[],
             binary_operators=["+", "*", "/", "-"],
             **self.default_test_kwargs,
 
             denoise=True,
             select_k_features=2,
         )
+        model.fit(X, y)
+        self.assertNotIn("unused_feature", model.latex())
+        self.assertIn("T", model.latex())
+        self.assertIn("x", model.latex())
+        self.assertLessEqual(model.get_best()["loss"], 1e-2)
+        fn = model.get_best()["lambda_format"]
         self.assertListEqual(list(sorted(fn._selection)), [0, 1])
         X2 = pd.DataFrame(
             {
 
             }
         )
         self.assertLess(np.average((fn(X2) - true_fn(X2)) ** 2), 1e-2)
+        self.assertLess(np.average((model.predict(X2) - true_fn(X2)) ** 2), 1e-2)
 
 
 class TestBest(unittest.TestCase):
     def setUp(self):
         equations = pd.DataFrame(
             {
+                "equation": ["1.0", "cos(x0)", "square(cos(x0))"],
+                "loss": [1.0, 0.1, 1e-5],
+                "complexity": [1, 2, 3],
             }
         )
 
+        equations["complexity loss equation".split(" ")].to_csv(
             "equation_file.csv.bkup", sep="|"
         )
 
+        self.model = PySRRegressor(
+            equation_file="equation_file.csv",
             variables_names="x0 x1".split(" "),
             extra_sympy_mappings={},
             output_jax_format=False,
             multioutput=False,
             nout=1,
         )
+        self.model.n_features = 2
+        self.model.refresh()
+        self.equations = self.model.equations
 
     def test_best(self):
+        self.assertEqual(self.model.sympy(), sympy.cos(sympy.Symbol("x0")) ** 2)
 
     def test_best_tex(self):
+        self.assertEqual(self.model.latex(), "\\cos^{2}{\\left(x_{0} \\right)}")
 
     def test_best_lambda(self):
         X = np.random.randn(10, 2)
         y = np.cos(X[:, 0]) ** 2
+        for f in [self.model.predict, self.equations.iloc[-1]["lambda_format"]]:
             np.testing.assert_almost_equal(f(X), y, decimal=4)
 
 
         np.testing.assert_array_equal(
             np.sort(selected_X, axis=1), np.sort(X[:, [2, 3]], axis=1)
         )
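Taken together, the hunks above are the heart of this PR: the functional `pysr(X, y, ...)` entry point and the `best`/`best_tex`/`best_callable`/`best_row`/`get_hof` helpers are replaced by a single scikit-learn-style `PySRRegressor`, and the hall-of-fame columns move from `Equation`/`MSE`/`Complexity` to `equation`/`loss`/`complexity`. A minimal sketch of the new call pattern, restricted to methods exercised by the tests above (the data and operator choices here are illustrative, not taken from the diff):

```python
import numpy as np
from pysr import PySRRegressor

# Illustrative data, in the spirit of the tests:
X = np.random.randn(100, 5)
y = X[:, 0] ** 2

# Configure once; then fit/predict like any sklearn estimator:
model = PySRRegressor(
    model_selection="accuracy",  # or "best", toggled via set_params()
    unary_operators=["sq(x) = x^2"],
    binary_operators=["plus"],
    extra_sympy_mappings={"sq": lambda x: x ** 2},
)
model.fit(X, y)  # per-sample weights may be passed as fit(X, y, weights=w)

print(model.equations)           # hall of fame, with "complexity"/"loss"/"equation"
print(model.get_best()["loss"])  # row chosen according to model_selection
print(model.latex())             # LaTeX for the chosen equation
y_pred = model.predict(X)        # evaluates the chosen equation
```

Note that, as the `test_empty_operators_single_input_sklearn` test shows, `set_params(model_selection=...)` changes which equation `predict` uses without refitting.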
test/test_jax.py CHANGED
@@ -1,6 +1,6 @@
 import unittest
 import numpy as np
-from pysr import sympy2jax, get_hof
 import pandas as pd
 from jax import numpy as jnp
 from jax import random
@@ -25,7 +25,7 @@ class TestJAX(unittest.TestCase):
         X = np.random.randn(100, 10)
         equations = pd.DataFrame(
             {
-                "Equation": ["1.0", "cos(x0)", "square(cos(x0))"],
                 "MSE": [1.0, 0.1, 1e-5],
                 "Complexity": [1, 2, 3],
             }
@@ -35,18 +35,20 @@ class TestJAX(unittest.TestCase):
             "equation_file.csv.bkup", sep="|"
         )
 
-        equations = get_hof(
-            "equation_file.csv",
-            n_features=2,
-            variables_names="x1 x2 x3".split(" "),
-            extra_sympy_mappings={},
             output_jax_format=True,
             multioutput=False,
             nout=1,
             selection=[1, 2, 3],
         )
 
-        jformat = equations.iloc[-1].jax_format
         np.testing.assert_almost_equal(
             np.array(jformat["callable"](jnp.array(X), jformat["parameters"])),
             np.square(np.cos(X[:, 1])),  # Select feature 1
 
 import unittest
 import numpy as np
+from pysr import sympy2jax, PySRRegressor
 import pandas as pd
 from jax import numpy as jnp
 from jax import random
 
         X = np.random.randn(100, 10)
         equations = pd.DataFrame(
             {
+                "Equation": ["1.0", "cos(x1)", "square(cos(x1))"],
                 "MSE": [1.0, 0.1, 1e-5],
                 "Complexity": [1, 2, 3],
             }
 
             "equation_file.csv.bkup", sep="|"
         )
 
+        model = PySRRegressor(
+            equation_file="equation_file.csv",
             output_jax_format=True,
+            variables_names="x1 x2 x3".split(" "),
             multioutput=False,
             nout=1,
             selection=[1, 2, 3],
         )
 
+        model.n_features = 2
+        model.using_pandas = False
+        model.refresh()
+        jformat = model.jax()
+
         np.testing.assert_almost_equal(
             np.array(jformat["callable"](jnp.array(X), jformat["parameters"])),
             np.square(np.cos(X[:, 1])),  # Select feature 1
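The JAX test shows the same migration for exports: instead of reading a `jax_format` column off the DataFrame returned by `get_hof`, it rebuilds a `PySRRegressor` around an existing equation file and calls `model.jax()`. A sketch of that flow; note that `n_features`, `using_pandas`, and `refresh()` are internal state that the test sets by hand, not a documented public API:

```python
import numpy as np
from jax import numpy as jnp
from pysr import PySRRegressor

# Rebuild a fitted state from a previously written hall-of-fame file:
model = PySRRegressor(
    equation_file="equation_file.csv",
    output_jax_format=True,
)
model.n_features = 2        # internal attributes, set manually as the test does
model.using_pandas = False
model.refresh()             # re-parses equation_file into model.equations

jformat = model.jax()       # {"callable": ..., "parameters": ...} for the best row
X = np.random.randn(100, 10)
y = jformat["callable"](jnp.array(X), jformat["parameters"])
```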
test/test_torch.py CHANGED
@@ -1,7 +1,7 @@
 import unittest
 import numpy as np
 import pandas as pd
-from pysr import sympy2torch, get_hof
 import torch
 import sympy
 
@@ -24,7 +24,7 @@ class TestTorch(unittest.TestCase):
         X = np.random.randn(100, 10)
         equations = pd.DataFrame(
             {
-                "Equation": ["1.0", "cos(x0)", "square(cos(x0))"],
                 "MSE": [1.0, 0.1, 1e-5],
                 "Complexity": [1, 2, 3],
             }
@@ -34,9 +34,9 @@ class TestTorch(unittest.TestCase):
             "equation_file.csv.bkup", sep="|"
         )
 
-        equations = get_hof(
-            "equation_file.csv",
-            n_features=2,  # TODO: Why is this 2 and not 3?
             variables_names="x1 x2 x3".split(" "),
             extra_sympy_mappings={},
             output_torch_format=True,
@@ -44,8 +44,12 @@ class TestTorch(unittest.TestCase):
             nout=1,
             selection=[1, 2, 3],
         )
 
-        tformat = equations.iloc[-1].torch_format
         np.testing.assert_almost_equal(
             tformat(torch.tensor(X)).detach().numpy(),
             np.square(np.cos(X[:, 1])),  # Selection 1st feature
@@ -84,9 +88,9 @@ class TestTorch(unittest.TestCase):
             "equation_file_custom_operator.csv.bkup", sep="|"
         )
 
-        equations = get_hof(
-            "equation_file_custom_operator.csv",
-            n_features=3,
             variables_names="x1 x2 x3".split(" "),
             extra_sympy_mappings={"mycustomoperator": sympy.sin},
             extra_torch_mappings={"mycustomoperator": torch.sin},
@@ -95,8 +99,13 @@ class TestTorch(unittest.TestCase):
             nout=1,
             selection=[0, 1, 2],
         )
 
-        tformat = equations.iloc[-1].torch_format
         np.testing.assert_almost_equal(
             tformat(torch.tensor(X)).detach().numpy(),
             np.sin(X[:, 0]),  # Selection 1st feature
 
 import unittest
 import numpy as np
 import pandas as pd
+from pysr import sympy2torch, PySRRegressor
 import torch
 import sympy
 
         X = np.random.randn(100, 10)
         equations = pd.DataFrame(
             {
+                "Equation": ["1.0", "cos(x1)", "square(cos(x1))"],
                 "MSE": [1.0, 0.1, 1e-5],
                 "Complexity": [1, 2, 3],
             }
 
             "equation_file.csv.bkup", sep="|"
         )
 
+        model = PySRRegressor(
+            model_selection="accuracy",
+            equation_file="equation_file.csv",
             variables_names="x1 x2 x3".split(" "),
             extra_sympy_mappings={},
             output_torch_format=True,
 
             nout=1,
             selection=[1, 2, 3],
         )
+        model.n_features = 2  # TODO: Why is this 2 and not 3?
+        model.using_pandas = False
+        model.refresh()
 
+        tformat = model.pytorch()
+        self.assertEqual(str(tformat), "_SingleSymPyModule(expression=cos(x1)**2)")
         np.testing.assert_almost_equal(
             tformat(torch.tensor(X)).detach().numpy(),
             np.square(np.cos(X[:, 1])),  # Selection 1st feature
 
             "equation_file_custom_operator.csv.bkup", sep="|"
         )
 
+        model = PySRRegressor(
+            model_selection="accuracy",
+            equation_file="equation_file_custom_operator.csv",
             variables_names="x1 x2 x3".split(" "),
             extra_sympy_mappings={"mycustomoperator": sympy.sin},
             extra_torch_mappings={"mycustomoperator": torch.sin},
 
             nout=1,
             selection=[0, 1, 2],
         )
+        model.n_features = 3
+        model.using_pandas = False
+        model.refresh()
+        # Will automatically use the set global state from get_hof.
+        tformat = model.pytorch()
+        self.assertEqual(str(tformat), "_SingleSymPyModule(expression=sin(x0))")
 
         np.testing.assert_almost_equal(
             tformat(torch.tensor(X)).detach().numpy(),
             np.sin(X[:, 0]),  # Selection 1st feature
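`model.pytorch()` is the torch-side counterpart: it replaces the old `torch_format` column and returns a torch module (a `_SingleSymPyModule`) that evaluates the selected expression; for custom operators, the test supplies both `extra_sympy_mappings` and `extra_torch_mappings` so the sympy expression can be lowered to torch calls. A sketch under the same caveats as the JAX example above:

```python
import torch
from pysr import PySRRegressor

# Rebuild a fitted state from a previously written hall-of-fame file:
model = PySRRegressor(
    model_selection="accuracy",
    equation_file="equation_file.csv",
    output_torch_format=True,
)
model.n_features = 2        # internal attributes, set manually as the test does
model.using_pandas = False
model.refresh()

expr_module = model.pytorch()  # torch module wrapping the selected equation
X = torch.randn(100, 10)
y = expr_module(X)             # evaluated like any torch module
```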