Spaces:
Running
Running
MilesCranmer
committed on
Commit
•
c28a133
1
Parent(s):
a30a767
Add weighted sum
Browse files- README.md +3 -2
- julia/sr.jl +5 -1
- pysr/sr.py +12 -3
README.md
CHANGED
@@ -131,7 +131,7 @@ which is `hall_of_fame.csv` by default. It also prints the
|
|
131 |
equations to stdout.
|
132 |
|
133 |
```python
|
134 |
-
pysr(X=None, y=None, threads=4, niterations=100, ncyclesperiteration=300, binary_operators=["plus", "mult"], unary_operators=["cos", "exp", "sin"], alpha=0.1, annealing=True, fractionReplaced=0.10, fractionReplacedHof=0.10, npop=1000, parsimony=1e-4, migration=True, hofMigration=True, shouldOptimizeConstants=True, topn=10, weightAddNode=1, weightInsertNode=3, weightDeleteNode=3, weightDoNothing=1, weightMutateConstant=10, weightMutateOperator=1, weightRandomize=1, weightSimplify=0.01, perturbationFactor=1.0, nrestarts=3, timeout=None, equation_file='hall_of_fame.csv', test='simple1', verbosity=1e9, maxsize=20)
|
135 |
```
|
136 |
|
137 |
Run symbolic regression to fit f(X[i, :]) ~ y[i] for all i.
|
@@ -143,6 +143,7 @@ equations, but you should adjust `threads`, `niterations`,
|
|
143 |
|
144 |
- `X`: np.ndarray, 2D array. Rows are examples, columns are features.
|
145 |
- `y`: np.ndarray, 1D array. Rows are examples.
|
|
|
146 |
- `threads`: int, Number of threads (=number of populations running).
|
147 |
You can have more threads than cores - it actually makes it more
|
148 |
efficient.
|
@@ -196,7 +197,6 @@ pd.DataFrame, Results dataframe, giving complexity, MSE, and equations
|
|
196 |
|
197 |
# TODO
|
198 |
|
199 |
-
- [ ] Add error bar capability (thanks Johannes Buchner)
|
200 |
- [ ] Add ability to save state from python
|
201 |
- [ ] Calculate feature importances of future mutations, by looking at correlation between residual of model, and the features.
|
202 |
- Store feature importances of future, and periodically update it.
|
@@ -211,6 +211,7 @@ pd.DataFrame, Results dataframe, giving complexity, MSE, and equations
|
|
211 |
- Current most expensive operations:
|
212 |
- [ ] Calculating the loss function - there are duplicate calculations happening.
|
213 |
- [x] Declaration of the weights array every iteration
|
|
|
214 |
- [x] Why don't the constants continually change? It should optimize them every time the equation appears.
|
215 |
- Restart the optimizer to help with this.
|
216 |
- [x] Add several common unary and binary operators; list these.
|
|
|
131 |
equations to stdout.
|
132 |
|
133 |
```python
|
134 |
+
pysr(X=None, y=None, weights=None, threads=4, niterations=100, ncyclesperiteration=300, binary_operators=["plus", "mult"], unary_operators=["cos", "exp", "sin"], alpha=0.1, annealing=True, fractionReplaced=0.10, fractionReplacedHof=0.10, npop=1000, parsimony=1e-4, migration=True, hofMigration=True, shouldOptimizeConstants=True, topn=10, weightAddNode=1, weightInsertNode=3, weightDeleteNode=3, weightDoNothing=1, weightMutateConstant=10, weightMutateOperator=1, weightRandomize=1, weightSimplify=0.01, perturbationFactor=1.0, nrestarts=3, timeout=None, equation_file='hall_of_fame.csv', test='simple1', verbosity=1e9, maxsize=20)
|
135 |
```
|
136 |
|
137 |
Run symbolic regression to fit f(X[i, :]) ~ y[i] for all i.
|
|
|
143 |
|
144 |
- `X`: np.ndarray, 2D array. Rows are examples, columns are features.
|
145 |
- `y`: np.ndarray, 1D array. Rows are examples.
|
146 |
+
- `weights`: np.ndarray, 1D array. Same shape as `y`. Optional per-example weights applied to the loss (e.g., 1/error^2).
|
147 |
- `threads`: int, Number of threads (=number of populations running).
|
148 |
You can have more threads than cores - it actually makes it more
|
149 |
efficient.
|
|
|
197 |
|
198 |
# TODO
|
199 |
|
|
|
200 |
- [ ] Add ability to save state from python
|
201 |
- [ ] Calculate feature importances of future mutations, by looking at correlation between residual of model, and the features.
|
202 |
- Store feature importances of future, and periodically update it.
|
|
|
211 |
- Current most expensive operations:
|
212 |
- [ ] Calculating the loss function - there are duplicate calculations happening.
|
213 |
- [x] Declaration of the weights array every iteration
|
214 |
+
- [x] Add error bar capability (thanks Johannes Buchner for the suggestion)
|
215 |
- [x] Why don't the constants continually change? It should optimize them every time the equation appears.
|
216 |
- Restart the optimizer to help with this.
|
217 |
- [x] Add several common unary and binary operators; list these.
|
julia/sr.jl
CHANGED
@@ -6,7 +6,11 @@ const actualMaxsize = maxsize + maxdegree
|
|
6 |
|
7 |
# Sum of square error between two arrays
|
8 |
function SSE(x::Array{Float32}, y::Array{Float32})::Float32
|
9 |
-
|
|
|
|
|
|
|
|
|
10 |
return sum(diff .* diff)
|
11 |
end
|
12 |
|
|
|
6 |
|
7 |
# Sum of squared errors between two arrays.
#
# Arguments:
#   x, y - Float32 arrays of the same shape (e.g. predictions and targets).
#
# Returns the Float32 value sum((x - y).^2). When the global flag `weighted`
# is true, each squared residual is first multiplied by the matching entry of
# the global `weights` array, i.e. sum(weights .* (x - y).^2).
#
# `weighted` and `weights` are not defined in this function; they are read as
# globals (the Python wrapper emits `const weighted` and, when weights are
# supplied, `const weights` into the generated Julia files).
function SSE(x::Array{Float32}, y::Array{Float32})::Float32
    diff = x - y
    if weighted
        # Apply the weight to the *squared* residual. Scaling the residual
        # before squaring would square the weights as well, which does not
        # match the documented usage of weights such as 1/error^2.
        return sum(diff .* diff .* weights)
    end
    return sum(diff .* diff)
end
|
16 |
|
pysr/sr.py
CHANGED
@@ -5,7 +5,7 @@ import pathlib
|
|
5 |
import numpy as np
|
6 |
import pandas as pd
|
7 |
|
8 |
-
def pysr(X=None, y=None, threads=4,
|
9 |
niterations=100,
|
10 |
ncyclesperiteration=300,
|
11 |
binary_operators=["plus", "mult"],
|
@@ -131,6 +131,7 @@ const nthreads = {threads:d}
|
|
131 |
const nrestarts = {nrestarts:d}
|
132 |
const perturbationFactor = {perturbationFactor:f}f0
|
133 |
const annealing = {"true" if annealing else "false"}
|
|
|
134 |
const mutationWeights = [
|
135 |
{weightMutateConstant:f},
|
136 |
{weightMutateOperator:f},
|
@@ -145,13 +146,21 @@ const mutationWeights = [
|
|
145 |
|
146 |
assert len(X.shape) == 2
|
147 |
assert len(y.shape) == 1
|
|
|
|
|
|
|
|
|
148 |
|
149 |
X_str = str(X.tolist()).replace('],', '];').replace(',', '')
|
150 |
y_str = str(y.tolist())
|
151 |
|
152 |
def_datasets = """const X = convert(Array{Float32, 2}, """f"{X_str})""""
|
153 |
-
const y = convert(Array{Float32, 1}, """f"{y_str})"
|
154 |
-
|
|
|
|
|
|
|
|
|
155 |
|
156 |
with open(f'/tmp/.hyperparams_{rand_string}.jl', 'w') as f:
|
157 |
print(def_hyperparams, file=f)
|
|
|
5 |
import numpy as np
|
6 |
import pandas as pd
|
7 |
|
8 |
+
def pysr(X=None, y=None, weights=None, threads=4,
|
9 |
niterations=100,
|
10 |
ncyclesperiteration=300,
|
11 |
binary_operators=["plus", "mult"],
|
|
|
131 |
const nrestarts = {nrestarts:d}
|
132 |
const perturbationFactor = {perturbationFactor:f}f0
|
133 |
const annealing = {"true" if annealing else "false"}
|
134 |
+
const weighted = {"true" if weights is not None else "false"}
|
135 |
const mutationWeights = [
|
136 |
{weightMutateConstant:f},
|
137 |
{weightMutateOperator:f},
|
|
|
146 |
|
147 |
assert len(X.shape) == 2
|
148 |
assert len(y.shape) == 1
|
149 |
+
assert X.shape[0] == y.shape[0]
|
150 |
+
if weights is not None:
|
151 |
+
assert len(weights.shape) == 1
|
152 |
+
assert X.shape[0] == weights.shape[0]
|
153 |
|
154 |
X_str = str(X.tolist()).replace('],', '];').replace(',', '')
|
155 |
y_str = str(y.tolist())
|
156 |
|
157 |
def_datasets = """const X = convert(Array{Float32, 2}, """f"{X_str})""""
|
158 |
+
const y = convert(Array{Float32, 1}, """f"{y_str})"
|
159 |
+
|
160 |
+
if weights is not None:
|
161 |
+
weight_str = str(weights.tolist())
|
162 |
+
def_datasets += """
|
163 |
+
const weights = convert(Array{Float32, 1}, """f"{weight_str})"
|
164 |
|
165 |
with open(f'/tmp/.hyperparams_{rand_string}.jl', 'w') as f:
|
166 |
print(def_hyperparams, file=f)
|