MilesCranmer committed on
Commit
c28a133
1 Parent(s): a30a767

Add weighted sum

Browse files
Files changed (3) hide show
  1. README.md +3 -2
  2. julia/sr.jl +5 -1
  3. pysr/sr.py +12 -3
README.md CHANGED
@@ -131,7 +131,7 @@ which is `hall_of_fame.csv` by default. It also prints the
131
  equations to stdout.
132
 
133
  ```python
134
- pysr(X=None, y=None, threads=4, niterations=100, ncyclesperiteration=300, binary_operators=["plus", "mult"], unary_operators=["cos", "exp", "sin"], alpha=0.1, annealing=True, fractionReplaced=0.10, fractionReplacedHof=0.10, npop=1000, parsimony=1e-4, migration=True, hofMigration=True, shouldOptimizeConstants=True, topn=10, weightAddNode=1, weightInsertNode=3, weightDeleteNode=3, weightDoNothing=1, weightMutateConstant=10, weightMutateOperator=1, weightRandomize=1, weightSimplify=0.01, perturbationFactor=1.0, nrestarts=3, timeout=None, equation_file='hall_of_fame.csv', test='simple1', verbosity=1e9, maxsize=20)
135
  ```
136
 
137
  Run symbolic regression to fit f(X[i, :]) ~ y[i] for all i.
@@ -143,6 +143,7 @@ equations, but you should adjust `threads`, `niterations`,
143
 
144
  - `X`: np.ndarray, 2D array. Rows are examples, columns are features.
145
  - `y`: np.ndarray, 1D array. Rows are examples.
 
146
  - `threads`: int, Number of threads (=number of populations running).
147
  You can have more threads than cores - it actually makes it more
148
  efficient.
@@ -196,7 +197,6 @@ pd.DataFrame, Results dataframe, giving complexity, MSE, and equations
196
 
197
  # TODO
198
 
199
- - [ ] Add error bar capability (thanks Johannes Buchner)
200
  - [ ] Add ability to save state from python
201
  - [ ] Calculate feature importances of future mutations, by looking at correlation between residual of model, and the features.
202
  - Store feature importances of future, and periodically update it.
@@ -211,6 +211,7 @@ pd.DataFrame, Results dataframe, giving complexity, MSE, and equations
211
  - Current most expensive operations:
212
  - [ ] Calculating the loss function - there are duplicate calculations happening.
213
  - [x] Declaration of the weights array every iteration
 
214
  - [x] Why don't the constants continually change? It should optimize them every time the equation appears.
215
  - Restart the optimizer to help with this.
216
  - [x] Add several common unary and binary operators; list these.
 
131
  equations to stdout.
132
 
133
  ```python
134
+ pysr(X=None, y=None, weights=None, threads=4, niterations=100, ncyclesperiteration=300, binary_operators=["plus", "mult"], unary_operators=["cos", "exp", "sin"], alpha=0.1, annealing=True, fractionReplaced=0.10, fractionReplacedHof=0.10, npop=1000, parsimony=1e-4, migration=True, hofMigration=True, shouldOptimizeConstants=True, topn=10, weightAddNode=1, weightInsertNode=3, weightDeleteNode=3, weightDoNothing=1, weightMutateConstant=10, weightMutateOperator=1, weightRandomize=1, weightSimplify=0.01, perturbationFactor=1.0, nrestarts=3, timeout=None, equation_file='hall_of_fame.csv', test='simple1', verbosity=1e9, maxsize=20)
135
  ```
136
 
137
  Run symbolic regression to fit f(X[i, :]) ~ y[i] for all i.
 
143
 
144
  - `X`: np.ndarray, 2D array. Rows are examples, columns are features.
145
  - `y`: np.ndarray, 1D array. Rows are examples.
146
+ - `weights`: np.ndarray, 1D array. Same shape as `y`. Optional per-example weights for the weighted sum of squared errors (e.g., 1/error^2).
147
  - `threads`: int, Number of threads (=number of populations running).
148
  You can have more threads than cores - it actually makes it more
149
  efficient.
 
197
 
198
  # TODO
199
 
 
200
  - [ ] Add ability to save state from python
201
  - [ ] Calculate feature importances of future mutations, by looking at correlation between residual of model, and the features.
202
  - Store feature importances of future, and periodically update it.
 
211
  - Current most expensive operations:
212
  - [ ] Calculating the loss function - there are duplicate calculations happening.
213
  - [x] Declaration of the weights array every iteration
214
+ - [x] Add error bar capability (thanks Johannes Buchner for suggestion)
215
  - [x] Why don't the constants continually change? It should optimize them every time the equation appears.
216
  - Restart the optimizer to help with this.
217
  - [x] Add several common unary and binary operators; list these.
julia/sr.jl CHANGED
@@ -6,7 +6,11 @@ const actualMaxsize = maxsize + maxdegree
6
 
7
  # Sum of square error between two arrays
8
  function SSE(x::Array{Float32}, y::Array{Float32})::Float32
9
- diff = (x - y)
 
 
 
 
10
  return sum(diff .* diff)
11
  end
12
 
 
6
 
7
  # Sum of square error between two arrays
8
  function SSE(x::Array{Float32}, y::Array{Float32})::Float32
9
+ if weighted
10
+ diff = (x - y) .* weights
11
+ else
12
+ diff = (x - y)
13
+ end
14
  return sum(diff .* diff)
15
  end
16
 
pysr/sr.py CHANGED
@@ -5,7 +5,7 @@ import pathlib
5
  import numpy as np
6
  import pandas as pd
7
 
8
- def pysr(X=None, y=None, threads=4,
9
  niterations=100,
10
  ncyclesperiteration=300,
11
  binary_operators=["plus", "mult"],
@@ -131,6 +131,7 @@ const nthreads = {threads:d}
131
  const nrestarts = {nrestarts:d}
132
  const perturbationFactor = {perturbationFactor:f}f0
133
  const annealing = {"true" if annealing else "false"}
 
134
  const mutationWeights = [
135
  {weightMutateConstant:f},
136
  {weightMutateOperator:f},
@@ -145,13 +146,21 @@ const mutationWeights = [
145
 
146
  assert len(X.shape) == 2
147
  assert len(y.shape) == 1
 
 
 
 
148
 
149
  X_str = str(X.tolist()).replace('],', '];').replace(',', '')
150
  y_str = str(y.tolist())
151
 
152
  def_datasets = """const X = convert(Array{Float32, 2}, """f"{X_str})""""
153
- const y = convert(Array{Float32, 1}, """f"{y_str})""""
154
- """
 
 
 
 
155
 
156
  with open(f'/tmp/.hyperparams_{rand_string}.jl', 'w') as f:
157
  print(def_hyperparams, file=f)
 
5
  import numpy as np
6
  import pandas as pd
7
 
8
+ def pysr(X=None, y=None, weights=None, threads=4,
9
  niterations=100,
10
  ncyclesperiteration=300,
11
  binary_operators=["plus", "mult"],
 
131
  const nrestarts = {nrestarts:d}
132
  const perturbationFactor = {perturbationFactor:f}f0
133
  const annealing = {"true" if annealing else "false"}
134
+ const weighted = {"true" if weights is not None else "false"}
135
  const mutationWeights = [
136
  {weightMutateConstant:f},
137
  {weightMutateOperator:f},
 
146
 
147
  assert len(X.shape) == 2
148
  assert len(y.shape) == 1
149
+ assert X.shape[0] == y.shape[0]
150
+ if weights is not None:
151
+ assert len(weights.shape) == 1
152
+ assert X.shape[0] == weights.shape[0]
153
 
154
  X_str = str(X.tolist()).replace('],', '];').replace(',', '')
155
  y_str = str(y.tolist())
156
 
157
  def_datasets = """const X = convert(Array{Float32, 2}, """f"{X_str})""""
158
+ const y = convert(Array{Float32, 1}, """f"{y_str})"
159
+
160
+ if weights is not None:
161
+ weight_str = str(weights.tolist())
162
+ def_datasets += """
163
+ const weights = convert(Array{Float32, 1}, """f"{weight_str})"
164
 
165
  with open(f'/tmp/.hyperparams_{rand_string}.jl', 'w') as f:
166
  print(def_hyperparams, file=f)