Spaces:
Running
Running
MilesCranmer
committed on
Commit
·
964082a
1
Parent(s):
3662fae
Add feature selection based on gradient boosting
Browse files- pysr/sr.py +35 -4
pysr/sr.py
CHANGED
@@ -76,6 +76,7 @@ def pysr(X=None, y=None, weights=None,
|
|
76 |
fast_cycle=False,
|
77 |
maxdepth=None,
|
78 |
variable_names=[],
|
|
|
79 |
threads=None, #deprecated
|
80 |
julia_optimization=3,
|
81 |
):
|
@@ -140,6 +141,11 @@ def pysr(X=None, y=None, weights=None,
|
|
140 |
15% faster. May be algorithmically less efficient.
|
141 |
:param variable_names: list, a list of names for the variables, other
|
142 |
than "x0", "x1", etc.
|
|
|
|
|
|
|
|
|
|
|
143 |
:param julia_optimization: int, Optimization level (0, 1, 2, 3)
|
144 |
:returns: pd.DataFrame, Results dataframe, giving complexity, MSE, and equations
|
145 |
(as strings).
|
@@ -154,6 +160,8 @@ def pysr(X=None, y=None, weights=None,
|
|
154 |
variable_names = list(X.columns)
|
155 |
X = np.array(X)
|
156 |
|
|
|
|
|
157 |
# Check for potential errors before they happen
|
158 |
assert len(unary_operators) + len(binary_operators) > 0
|
159 |
assert len(X.shape) == 2
|
@@ -162,9 +170,17 @@ def pysr(X=None, y=None, weights=None,
|
|
162 |
if weights is not None:
|
163 |
assert len(weights.shape) == 1
|
164 |
assert X.shape[0] == weights.shape[0]
|
165 |
-
if
|
166 |
assert len(variable_names) == X.shape[1]
|
167 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
168 |
if populations is None:
|
169 |
populations = procs
|
170 |
|
@@ -233,7 +249,7 @@ const nrestarts = {nrestarts:d}
|
|
233 |
const perturbationFactor = {perturbationFactor:f}f0
|
234 |
const annealing = {"true" if annealing else "false"}
|
235 |
const weighted = {"true" if weights is not None else "false"}
|
236 |
-
const useVarMap = {"
|
237 |
const mutationWeights = [
|
238 |
{weightMutateConstant:f},
|
239 |
{weightMutateOperator:f},
|
@@ -260,7 +276,7 @@ const y = convert(Array{Float32, 1}, """f"{y_str})"
|
|
260 |
def_datasets += """
|
261 |
const weights = convert(Array{Float32, 1}, """f"{weight_str})"
|
262 |
|
263 |
-
if
|
264 |
def_hyperparams += f"""
|
265 |
const varMap = {'["' + '", "'.join(variable_names) + '"]'}"""
|
266 |
|
@@ -299,7 +315,7 @@ const varMap = {'["' + '", "'.join(variable_names) + '"]'}"""
|
|
299 |
lastComplexity = 0
|
300 |
sympy_format = []
|
301 |
lambda_format = []
|
302 |
-
if
|
303 |
sympy_symbols = [sympy.Symbol(variable_names[i]) for i in range(X.shape[1])]
|
304 |
else:
|
305 |
sympy_symbols = [sympy.Symbol('x%d'%i) for i in range(X.shape[1])]
|
@@ -326,3 +342,18 @@ const varMap = {'["' + '", "'.join(variable_names) + '"]'}"""
|
|
326 |
return output[['Complexity', 'MSE', 'score', 'Equation', 'sympy_format', 'lambda_format']]
|
327 |
|
328 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
fast_cycle=False,
|
77 |
maxdepth=None,
|
78 |
variable_names=[],
|
79 |
+
select_k_features=None,
|
80 |
threads=None, #deprecated
|
81 |
julia_optimization=3,
|
82 |
):
|
|
|
141 |
15% faster. May be algorithmically less efficient.
|
142 |
:param variable_names: list, a list of names for the variables, other
|
143 |
than "x0", "x1", etc.
|
144 |
+
:param feature_selection: bool,
|
145 |
+
:param select_k_features: (None, int), whether to run feature selection in
|
146 |
+
Python using a gradient boosting regressor, before passing to the symbolic regression
|
147 |
+
code. None means no feature selection; an int means select that many
|
148 |
+
features.
|
149 |
:param julia_optimization: int, Optimization level (0, 1, 2, 3)
|
150 |
:returns: pd.DataFrame, Results dataframe, giving complexity, MSE, and equations
|
151 |
(as strings).
|
|
|
160 |
variable_names = list(X.columns)
|
161 |
X = np.array(X)
|
162 |
|
163 |
+
use_custom_variable_names = (len(variable_names) != 0)
|
164 |
+
|
165 |
# Check for potential errors before they happen
|
166 |
assert len(unary_operators) + len(binary_operators) > 0
|
167 |
assert len(X.shape) == 2
|
|
|
170 |
if weights is not None:
|
171 |
assert len(weights.shape) == 1
|
172 |
assert X.shape[0] == weights.shape[0]
|
173 |
+
if use_custom_variable_names:
|
174 |
assert len(variable_names) == X.shape[1]
|
175 |
|
176 |
+
if select_k_features is not None:
|
177 |
+
selection = run_feature_selection(X, y, select_k_features)
|
178 |
+
print(f"Using features {selection}")
|
179 |
+
X = X[:, selection]
|
180 |
+
|
181 |
+
if use_custom_variable_names:
|
182 |
+
variable_names = variable_names[selection]
|
183 |
+
|
184 |
if populations is None:
|
185 |
populations = procs
|
186 |
|
|
|
249 |
const perturbationFactor = {perturbationFactor:f}f0
|
250 |
const annealing = {"true" if annealing else "false"}
|
251 |
const weighted = {"true" if weights is not None else "false"}
|
252 |
+
const useVarMap = {"true" if use_custom_variable_names else "false"}
|
253 |
const mutationWeights = [
|
254 |
{weightMutateConstant:f},
|
255 |
{weightMutateOperator:f},
|
|
|
276 |
def_datasets += """
|
277 |
const weights = convert(Array{Float32, 1}, """f"{weight_str})"
|
278 |
|
279 |
+
if use_custom_variable_names:
|
280 |
def_hyperparams += f"""
|
281 |
const varMap = {'["' + '", "'.join(variable_names) + '"]'}"""
|
282 |
|
|
|
315 |
lastComplexity = 0
|
316 |
sympy_format = []
|
317 |
lambda_format = []
|
318 |
+
if use_custom_variable_names:
|
319 |
sympy_symbols = [sympy.Symbol(variable_names[i]) for i in range(X.shape[1])]
|
320 |
else:
|
321 |
sympy_symbols = [sympy.Symbol('x%d'%i) for i in range(X.shape[1])]
|
|
|
342 |
return output[['Complexity', 'MSE', 'score', 'Equation', 'sympy_format', 'lambda_format']]
|
343 |
|
344 |
|
345 |
+
def run_feature_selection(X, y, select_k_features):
    """Use a gradient boosting tree regressor as a proxy for finding
    the k most important features in X, returning indices for those
    features as output.

    :param X: feature matrix handed to the regressor's ``fit`` (the caller
        indexes it afterwards as ``X[:, selection]``, so columns are features).
    :param y: regression targets handed to the regressor's ``fit``.
    :param select_k_features: int, maximum number of features to keep.
    :returns: integer indices of the selected columns of X, as produced by
        ``SelectFromModel.get_support(indices=True)``.
    """
    # Imported locally so scikit-learn is only required when feature
    # selection is actually requested.
    from sklearn.ensemble import GradientBoostingRegressor
    from sklearn.feature_selection import SelectFromModel

    # `loss` is deliberately left at its default (squared error): the old
    # 'ls' spelling is rejected by newer scikit-learn releases, while the
    # default means the same thing on both old and new versions.
    clf = GradientBoostingRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=1,
        random_state=0,  # fixed seed so the selected features are reproducible
    )
    clf.fit(X, y)

    # threshold=-inf disables importance thresholding, so the selection is
    # governed purely by max_features (the k most important features).
    selector = SelectFromModel(
        clf,
        threshold=-np.inf,
        max_features=select_k_features,
        prefit=True,
    )
    return selector.get_support(indices=True)
|
358 |
+
|
359 |
+
|