Spaces:

MilesCranmer
/

PySR

Sleeping

App Files Files Community

MilesCranmer committed on Sep 21, 2020

Commit

c28a133

1 Parent(s): a30a767

Add weighted sum

Browse files

Files changed (3) hide show

README.md +3 -2
julia/sr.jl +5 -1
pysr/sr.py +12 -3

README.md CHANGED Viewed

@@ -131,7 +131,7 @@ which is `hall_of_fame.csv` by default. It also prints the
 equations to stdout.
 ```python
-pysr(X=None, y=None, threads=4, niterations=100, ncyclesperiteration=300, binary_operators=["plus", "mult"], unary_operators=["cos", "exp", "sin"], alpha=0.1, annealing=True, fractionReplaced=0.10, fractionReplacedHof=0.10, npop=1000, parsimony=1e-4, migration=True, hofMigration=True, shouldOptimizeConstants=True, topn=10, weightAddNode=1, weightInsertNode=3, weightDeleteNode=3, weightDoNothing=1, weightMutateConstant=10, weightMutateOperator=1, weightRandomize=1, weightSimplify=0.01, perturbationFactor=1.0, nrestarts=3, timeout=None, equation_file='hall_of_fame.csv', test='simple1', verbosity=1e9, maxsize=20)
 ```
 Run symbolic regression to fit f(X[i, :]) ~ y[i] for all i.
@@ -143,6 +143,7 @@ equations, but you should adjust `threads`, `niterations`,
 - `X`: np.ndarray, 2D array. Rows are examples, columns are features.
 - `y`: np.ndarray, 1D array. Rows are examples.
 - `threads`: int, Number of threads (=number of populations running).
 You can have more threads than cores - it actually makes it more
 efficient.
@@ -196,7 +197,6 @@ pd.DataFrame, Results dataframe, giving complexity, MSE, and equations
 # TODO
-- [ ] Add error bar capability (thanks Johannes Buchner)
 - [ ] Add ability to save state from python
 - [ ] Calculate feature importances of future mutations, by looking at correlation between residual of model, and the features.
     - Store feature importances of future, and periodically update it.
@@ -211,6 +211,7 @@ pd.DataFrame, Results dataframe, giving complexity, MSE, and equations
     - Current most expensive operations:
         - [ ] Calculating the loss function - there are duplicate calculations happening.
         - [x] Declaration of the weights array every iteration
 - [x] Why don't the constants continually change? It should optimize them every time the equation appears.
     - Restart the optimizer to help with this.
 - [x] Add several common unary and binary operators; list these.

 equations to stdout.
 ```python
+pysr(X=None, y=None, weights=None, threads=4, niterations=100, ncyclesperiteration=300, binary_operators=["plus", "mult"], unary_operators=["cos", "exp", "sin"], alpha=0.1, annealing=True, fractionReplaced=0.10, fractionReplacedHof=0.10, npop=1000, parsimony=1e-4, migration=True, hofMigration=True, shouldOptimizeConstants=True, topn=10, weightAddNode=1, weightInsertNode=3, weightDeleteNode=3, weightDoNothing=1, weightMutateConstant=10, weightMutateOperator=1, weightRandomize=1, weightSimplify=0.01, perturbationFactor=1.0, nrestarts=3, timeout=None, equation_file='hall_of_fame.csv', test='simple1', verbosity=1e9, maxsize=20)
 ```
 Run symbolic regression to fit f(X[i, :]) ~ y[i] for all i.
 - `X`: np.ndarray, 2D array. Rows are examples, columns are features.
 - `y`: np.ndarray, 1D array. Rows are examples.
+- `weights`: np.ndarray, 1D array. Same shape as `y`. Optional per-example weights for the loss (e.g., 1/error^2).
 - `threads`: int, Number of threads (=number of populations running).
 You can have more threads than cores - it actually makes it more
 efficient.
 # TODO
 - [ ] Add ability to save state from python
 - [ ] Calculate feature importances of future mutations, by looking at correlation between residual of model, and the features.
     - Store feature importances of future, and periodically update it.
     - Current most expensive operations:
         - [ ] Calculating the loss function - there are duplicate calculations happening.
         - [x] Declaration of the weights array every iteration
+- [x] Add error bar capability (thanks Johannes Buchner for suggestion)
 - [x] Why don't the constants continually change? It should optimize them every time the equation appears.
     - Restart the optimizer to help with this.
 - [x] Add several common unary and binary operators; list these.

julia/sr.jl CHANGED Viewed

@@ -6,7 +6,11 @@ const actualMaxsize = maxsize + maxdegree
 # Sum of square error between two arrays
 function SSE(x::Array{Float32}, y::Array{Float32})::Float32
-    diff = (x - y)
     return sum(diff .* diff)
 end

 # Sum of squared errors between a prediction `x` and a target `y`.
 #
 # Reads two globals that are defined outside this function:
 #   - `weighted`: when true, per-element weights are applied.
 #   - `weights`:  the weight array, only read when `weighted` is true.
 #
 # Weighted case:   sum(weights .* (x - y).^2)
 # Unweighted case: sum((x - y).^2)
 #
 # Each weight multiplies the *squared* residual exactly once, so that
 # weights of the form 1/error^2 yield the usual chi-squared statistic.
 # (Multiplying the residual by the weight before squaring would apply
 # weights^2 instead.)
 function SSE(x::Array{Float32}, y::Array{Float32})::Float32
     # `x - y` (not `x .- y`) so mismatched shapes raise instead of broadcasting.
     # Named `residual` to avoid shadowing `Base.diff`.
     residual = x - y
     if weighted
         return sum(weights .* abs2.(residual))
     end
     return sum(abs2, residual)
 end

pysr/sr.py CHANGED Viewed

@@ -5,7 +5,7 @@ import pathlib
 import numpy as np
 import pandas as pd
-def pysr(X=None, y=None, threads=4,
             niterations=100,
             ncyclesperiteration=300,
             binary_operators=["plus", "mult"],
@@ -131,6 +131,7 @@ const nthreads = {threads:d}
 const nrestarts = {nrestarts:d}
 const perturbationFactor = {perturbationFactor:f}f0
 const annealing = {"true" if annealing else "false"}
 const mutationWeights = [
     {weightMutateConstant:f},
     {weightMutateOperator:f},
@@ -145,13 +146,21 @@ const mutationWeights = [
     assert len(X.shape) == 2
     assert len(y.shape) == 1
     X_str = str(X.tolist()).replace('],', '];').replace(',', '')
     y_str = str(y.tolist())
     def_datasets = """const X = convert(Array{Float32, 2}, """f"{X_str})""""
-const y = convert(Array{Float32, 1}, """f"{y_str})""""
-    """
     with open(f'/tmp/.hyperparams_{rand_string}.jl', 'w') as f:
         print(def_hyperparams, file=f)

 import numpy as np
 import pandas as pd
+def pysr(X=None, y=None, weights=None, threads=4,
             niterations=100,
             ncyclesperiteration=300,
             binary_operators=["plus", "mult"],
 const nrestarts = {nrestarts:d}
 const perturbationFactor = {perturbationFactor:f}f0
 const annealing = {"true" if annealing else "false"}
+const weighted = {"true" if weights is not None else "false"}
 const mutationWeights = [
     {weightMutateConstant:f},
     {weightMutateOperator:f},
     assert len(X.shape) == 2
     assert len(y.shape) == 1
+    assert X.shape[0] == y.shape[0]
+    if weights is not None:
+        assert len(weights.shape) == 1
+        assert X.shape[0] == weights.shape[0]
     X_str = str(X.tolist()).replace('],', '];').replace(',', '')
     y_str = str(y.tolist())
     def_datasets = """const X = convert(Array{Float32, 2}, """f"{X_str})""""
+const y = convert(Array{Float32, 1}, """f"{y_str})"
+    if weights is not None:
+        weight_str = str(weights.tolist())
+        def_datasets += """
+const weights = convert(Array{Float32, 1}, """f"{weight_str})"
     with open(f'/tmp/.hyperparams_{rand_string}.jl', 'w') as f:
         print(def_hyperparams, file=f)