MilesCranmer committed
Commit 964082a
Parent: 3662fae

Add feature selection based on gradient boosting

Files changed (1):
  1. pysr/sr.py +35 -4
pysr/sr.py CHANGED
@@ -76,6 +76,7 @@ def pysr(X=None, y=None, weights=None,
         fast_cycle=False,
         maxdepth=None,
         variable_names=[],
+        select_k_features=None,
         threads=None, #deprecated
         julia_optimization=3,
         ):
@@ -140,6 +141,11 @@ def pysr(X=None, y=None, weights=None,
         15% faster. May be algorithmically less efficient.
     :param variable_names: list, a list of names for the variables, other
         than "x0", "x1", etc.
+    :param feature_selection: bool,
+    :param select_k_features: (None, int), whether to run feature selection in
+        Python using random forests, before passing to the symbolic regression
+        code. None means no feature selection; an int means select that many
+        features.
     :param julia_optimization: int, Optimization level (0, 1, 2, 3)
     :returns: pd.DataFrame, Results dataframe, giving complexity, MSE, and equations
         (as strings).
@@ -154,6 +160,8 @@ def pysr(X=None, y=None, weights=None,
         variable_names = list(X.columns)
         X = np.array(X)
 
+    use_custom_variable_names = (len(variable_names) != 0)
+
     # Check for potential errors before they happen
     assert len(unary_operators) + len(binary_operators) > 0
     assert len(X.shape) == 2
@@ -162,9 +170,17 @@ def pysr(X=None, y=None, weights=None,
     if weights is not None:
         assert len(weights.shape) == 1
         assert X.shape[0] == weights.shape[0]
-    if len(variable_names) != 0:
+    if use_custom_variable_names:
         assert len(variable_names) == X.shape[1]
 
+    if select_k_features is not None:
+        selection = run_feature_selection(X, y, select_k_features)
+        print(f"Using features {selection}")
+        X = X[:, selection]
+
+        if use_custom_variable_names:
+            variable_names = variable_names[selection]
+
     if populations is None:
         populations = procs
 
@@ -233,7 +249,7 @@ const nrestarts = {nrestarts:d}
 const perturbationFactor = {perturbationFactor:f}f0
 const annealing = {"true" if annealing else "false"}
 const weighted = {"true" if weights is not None else "false"}
-const useVarMap = {"false" if len(variable_names) == 0 else "true"}
+const useVarMap = {"true" if use_custom_variable_names else "false"}
 const mutationWeights = [
 {weightMutateConstant:f},
 {weightMutateOperator:f},
@@ -260,7 +276,7 @@ const y = convert(Array{Float32, 1}, """f"{y_str})"
         def_datasets += """
 const weights = convert(Array{Float32, 1}, """f"{weight_str})"
 
-    if len(variable_names) != 0:
+    if use_custom_variable_names:
         def_hyperparams += f"""
 const varMap = {'["' + '", "'.join(variable_names) + '"]'}"""
 
@@ -299,7 +315,7 @@ const varMap = {'["' + '", "'.join(variable_names) + '"]'}"""
     lastComplexity = 0
     sympy_format = []
     lambda_format = []
-    if len(variable_names) != 0:
+    if use_custom_variable_names:
         sympy_symbols = [sympy.Symbol(variable_names[i]) for i in range(X.shape[1])]
     else:
         sympy_symbols = [sympy.Symbol('x%d'%i) for i in range(X.shape[1])]
@@ -326,3 +342,18 @@ const varMap = {'["' + '", "'.join(variable_names) + '"]'}"""
     return output[['Complexity', 'MSE', 'score', 'Equation', 'sympy_format', 'lambda_format']]
 
 
+def run_feature_selection(X, y, select_k_features):
+    """Use a gradient boosting tree regressor as a proxy for finding
+    the k most important features in X, returning indices for those
+    features as output."""
+
+    from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
+    from sklearn.feature_selection import SelectFromModel, SelectKBest
+
+    clf = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=1, random_state=0, loss='ls') #RandomForestRegressor()
+    clf.fit(X, y)
+    selector = SelectFromModel(clf, threshold=-np.inf,
+                               max_features=select_k_features, prefit=True)
+    return selector.get_support(indices=True)
+
+
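For reference, the selection logic added above is scikit-learn's SelectFromModel driven by a shallow gradient-boosting fit: depth-1 stumps are trained on (X, y), and threshold=-np.inf combined with max_features keeps exactly the k columns with the largest importances. Below is a minimal, self-contained sketch of that same pattern on synthetic data; the toy target and array shapes are illustrative and not part of the commit:

import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import SelectFromModel

# Toy data: five candidate features, but y depends only on columns 0 and 3.
rng = np.random.RandomState(0)
X = rng.randn(500, 5)
y = 2.0 * X[:, 0] + np.cos(X[:, 3])

# Same pattern as run_feature_selection: shallow boosted stumps rank the
# features, and SelectFromModel keeps the top-k by importance.
clf = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,
                                max_depth=1, random_state=0)
clf.fit(X, y)
selector = SelectFromModel(clf, threshold=-np.inf, max_features=2, prefit=True)
selection = selector.get_support(indices=True)

print(selection)              # expected to recover columns [0 3]
print(X[:, selection].shape)  # (500, 2): the reduced matrix passed on to the search

Using max_depth=1 keeps each tree to a single split, so the importances mostly reflect per-feature (marginal) signal and the fit stays cheap even when X has many columns.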
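From the caller's side, the only new surface is the select_k_features keyword on pysr(). A hedged usage sketch follows; the operator names and data sizes are illustrative rather than taken from this commit, and actually running it requires the Julia backend that pysr drives:

import numpy as np
from pysr import pysr

X = np.random.randn(200, 5)
y = X[:, 0] ** 2 + 2.0 * np.cos(X[:, 3])

# Keep only the 2 most informative columns (chosen by gradient boosting in
# Python) before the data is handed to the symbolic-regression search.
equations = pysr(X, y,
                 binary_operators=["plus", "mult"],
                 unary_operators=["cos"],
                 select_k_features=2)

# Results dataframe with Complexity, MSE, score, Equation, sympy_format, lambda_format.
print(equations)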