MilesCranmer committed
Commit 964082a
Parent: 3662fae

Add feature selection based on gradient boosting

Files changed (1):
  1. pysr/sr.py +35 -4
pysr/sr.py CHANGED
@@ -76,6 +76,7 @@ def pysr(X=None, y=None, weights=None,
         fast_cycle=False,
         maxdepth=None,
         variable_names=[],
+        select_k_features=None,
         threads=None, #deprecated
         julia_optimization=3,
         ):
@@ -140,6 +141,11 @@ def pysr(X=None, y=None, weights=None,
         15% faster. May be algorithmically less efficient.
     :param variable_names: list, a list of names for the variables, other
         than "x0", "x1", etc.
+    :param feature_selection: bool,
+    :param select_k_features: (None, int), whether to run feature selection in
+        Python using random forests, before passing to the symbolic regression
+        code. None means no feature selection; an int means select that many
+        features.
     :param julia_optimization: int, Optimization level (0, 1, 2, 3)
     :returns: pd.DataFrame, Results dataframe, giving complexity, MSE, and equations
         (as strings).
@@ -154,6 +160,8 @@ def pysr(X=None, y=None, weights=None,
         variable_names = list(X.columns)
         X = np.array(X)
 
+    use_custom_variable_names = (len(variable_names) != 0)
+
     # Check for potential errors before they happen
     assert len(unary_operators) + len(binary_operators) > 0
     assert len(X.shape) == 2
@@ -162,9 +170,17 @@ def pysr(X=None, y=None, weights=None,
     if weights is not None:
         assert len(weights.shape) == 1
         assert X.shape[0] == weights.shape[0]
-    if len(variable_names) != 0:
+    if use_custom_variable_names:
         assert len(variable_names) == X.shape[1]
 
+    if select_k_features is not None:
+        selection = run_feature_selection(X, y, select_k_features)
+        print(f"Using features {selection}")
+        X = X[:, selection]
+
+        if use_custom_variable_names:
+            variable_names = variable_names[selection]
+
     if populations is None:
         populations = procs
 
@@ -233,7 +249,7 @@ const nrestarts = {nrestarts:d}
 const perturbationFactor = {perturbationFactor:f}f0
 const annealing = {"true" if annealing else "false"}
 const weighted = {"true" if weights is not None else "false"}
-const useVarMap = {"false" if len(variable_names) == 0 else "true"}
+const useVarMap = {"true" if use_custom_variable_names else "false"}
 const mutationWeights = [
 {weightMutateConstant:f},
 {weightMutateOperator:f},
@@ -260,7 +276,7 @@ const y = convert(Array{Float32, 1}, """f"{y_str})"
         def_datasets += """
 const weights = convert(Array{Float32, 1}, """f"{weight_str})"
 
-    if len(variable_names) != 0:
+    if use_custom_variable_names:
         def_hyperparams += f"""
 const varMap = {'["' + '", "'.join(variable_names) + '"]'}"""
 
@@ -299,7 +315,7 @@ const varMap = {'["' + '", "'.join(variable_names) + '"]'}"""
     lastComplexity = 0
     sympy_format = []
     lambda_format = []
-    if len(variable_names) != 0:
+    if use_custom_variable_names:
         sympy_symbols = [sympy.Symbol(variable_names[i]) for i in range(X.shape[1])]
     else:
         sympy_symbols = [sympy.Symbol('x%d'%i) for i in range(X.shape[1])]
@@ -326,3 +342,18 @@ const varMap = {'["' + '", "'.join(variable_names) + '"]'}"""
     return output[['Complexity', 'MSE', 'score', 'Equation', 'sympy_format', 'lambda_format']]
 
 
+def run_feature_selection(X, y, select_k_features):
+    """Use a gradient boosting tree regressor as a proxy for finding
+    the k most important features in X, returning indices for those
+    features as output."""
+
+    from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
+    from sklearn.feature_selection import SelectFromModel, SelectKBest
+
+    clf = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=1, random_state=0, loss='ls') #RandomForestRegressor()
+    clf.fit(X, y)
+    selector = SelectFromModel(clf, threshold=-np.inf,
+                               max_features=select_k_features, prefit=True)
+    return selector.get_support(indices=True)
+
+
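For reference, the selection logic added above is scikit-learn's SelectFromModel driven by a shallow gradient-boosting fit: depth-1 stumps are trained on (X, y), and threshold=-np.inf combined with max_features keeps exactly the k columns with the largest importances. Below is a minimal, self-contained sketch of that same pattern on synthetic data; the toy target and array shapes are illustrative and not part of the commit:

import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import SelectFromModel

# Toy data: five candidate features, but y depends only on columns 0 and 3.
rng = np.random.RandomState(0)
X = rng.randn(500, 5)
y = 2.0 * X[:, 0] + np.cos(X[:, 3])

# Same pattern as run_feature_selection: shallow boosted stumps rank the
# features, and SelectFromModel keeps the top-k by importance.
clf = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,
                                max_depth=1, random_state=0)
clf.fit(X, y)
selector = SelectFromModel(clf, threshold=-np.inf, max_features=2, prefit=True)
selection = selector.get_support(indices=True)

print(selection)              # expected to recover columns [0 3]
print(X[:, selection].shape)  # (500, 2): the reduced matrix passed on to the search

Using max_depth=1 keeps each tree to a single split, so the importances mostly reflect per-feature (marginal) signal and the fit stays cheap even when X has many columns.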
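From the caller's side, the only new surface is the select_k_features keyword on pysr(). A hedged usage sketch follows; the operator names and data sizes are illustrative rather than taken from this commit, and actually running it requires the Julia backend that pysr drives:

import numpy as np
from pysr import pysr

X = np.random.randn(200, 5)
y = X[:, 0] ** 2 + 2.0 * np.cos(X[:, 3])

# Keep only the 2 most informative columns (chosen by gradient boosting in
# Python) before the data is handed to the symbolic-regression search.
equations = pysr(X, y,
                 binary_operators=["plus", "mult"],
                 unary_operators=["cos"],
                 select_k_features=2)

# Results dataframe with Complexity, MSE, score, Equation, sympy_format, lambda_format.
print(equations)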