Spaces:

MilesCranmer
/

PySR

Running

App Files Community

MilesCranmer committed on May 30, 2021

Commit

0087907

unverified ·

2 Parent(s): 569f4ba 14dfd82

Merge pull request #46 from MilesCranmer/multi-output

Browse files

Files changed (3) hide show

Project.toml +1 -1
pysr/sr.py +123 -61
test/test.py +4 -3

Project.toml CHANGED Viewed

@@ -2,5 +2,5 @@
 SymbolicRegression = "8254be44-1295-4e6a-a16d-46603ac705cb"
 [compat]
-SymbolicRegression = "0.5.16"
 julia = "1.5"

 SymbolicRegression = "8254be44-1295-4e6a-a16d-46603ac705cb"
 [compat]
+SymbolicRegression = "0.6.0"
 julia = "1.5"

pysr/sr.py CHANGED Viewed

@@ -19,6 +19,8 @@ global_equation_file = 'hall_of_fame.csv'
 global_n_features = None
 global_variable_names = []
 global_extra_sympy_mappings = {}
 sympy_mappings = {
     'div':  lambda x, y : x/y,
@@ -276,6 +278,16 @@ def pysr(X=None, y=None, weights=None,
     if X is None:
         X, y = _using_test_input(X, test, y)
     kwargs = dict(X=X, y=y, weights=weights,
                  alpha=alpha, annealing=annealing, batchSize=batchSize,
                  batching=batching, binary_operators=binary_operators,
@@ -309,7 +321,8 @@ def pysr(X=None, y=None, weights=None,
                  constraints=constraints,
                  extra_sympy_mappings=extra_sympy_mappings,
                  julia_project=julia_project, loss=loss,
-                 output_jax_format=output_jax_format)
     kwargs = {**_set_paths(tempdir), **kwargs}
@@ -358,15 +371,20 @@ def pysr(X=None, y=None, weights=None,
-def _set_globals(X, equation_file, extra_sympy_mappings, variable_names, **kwargs):
     global global_n_features
     global global_equation_file
     global global_variable_names
     global global_extra_sympy_mappings
     global_n_features = X.shape[1]
     global_equation_file = equation_file
     global_variable_names = variable_names
     global_extra_sympy_mappings = extra_sympy_mappings
 def _final_pysr_process(julia_optimization, runfile_filename, timeout, **kwargs):
@@ -393,9 +411,7 @@ def _cmd_runner(command, **kwargs):
                                 .replace('\\r',      '\r')
                                 .encode(sys.stdout.encoding, errors='replace'))
-            print(decoded_line, end='')
         process.stdout.close()
         process.wait()
@@ -438,17 +454,35 @@ def _create_julia_files(dataset_filename, def_datasets,  hyperparam_filename, de
             print(f'EquationSearch(X, y, niterations={niterations:d}, varMap={varMap}, options=options, numprocs={procs})', file=f)
-def _make_datasets_julia_str(X, X_filename, weights, weights_filename, y, y_filename, **kwargs):
     def_datasets = """using DelimitedFiles"""
     np.savetxt(X_filename, X.astype(np.float32), delimiter=',')
-    np.savetxt(y_filename, y.reshape(-1, 1).astype(np.float32), delimiter=',')
     if weights is not None:
-        np.savetxt(weights_filename, weights.reshape(-1, 1), delimiter=',')
     def_datasets += f"""
-X = copy(transpose(readdlm("{_escape_filename(X_filename)}", ',', Float32, '\\n')))
 y = readdlm("{_escape_filename(y_filename)}", ',', Float32, '\\n')[:, 1]"""
     if weights is not None:
-        def_datasets += f"""
 weights = readdlm("{_escape_filename(weights_filename)}", ',', Float32, '\\n')[:, 1]"""
     return def_datasets
@@ -656,10 +690,10 @@ def _check_assertions(X, binary_operators, unary_operators, use_custom_variable_
     # Check for potential errors before they happen
     assert len(unary_operators) + len(binary_operators) > 0
     assert len(X.shape) == 2
-    assert len(y.shape) == 1
     assert X.shape[0] == y.shape[0]
     if weights is not None:
-        assert len(weights.shape) == 1
         assert X.shape[0] == weights.shape[0]
     if use_custom_variable_names:
         assert len(variable_names) == X.shape[1]
@@ -693,7 +727,8 @@ def run_feature_selection(X, y, select_k_features):
     return selector.get_support(indices=True)
 def get_hof(equation_file=None, n_features=None, variable_names=None,
-            extra_sympy_mappings=None, output_jax_format=False, **kwargs):
     """Get the equations from a hall of fame file. If no arguments
     entered, the ones used previously from a call to PySR will be used."""
@@ -701,99 +736,126 @@ def get_hof(equation_file=None, n_features=None, variable_names=None,
     global global_equation_file
     global global_variable_names
     global global_extra_sympy_mappings
     if equation_file is None: equation_file = global_equation_file
     if n_features is None: n_features = global_n_features
     if variable_names is None: variable_names = global_variable_names
     if extra_sympy_mappings is None: extra_sympy_mappings = global_extra_sympy_mappings
     global_equation_file = equation_file
     global_n_features = n_features
     global_variable_names = variable_names
     global_extra_sympy_mappings = extra_sympy_mappings
     try:
-        output = pd.read_csv(str(equation_file) + '.bkup', sep="|")
     except FileNotFoundError:
         raise RuntimeError("Couldn't find equation file! The equation search likely exited before a single iteration completed.")
-    scores = []
-    lastMSE = None
-    lastComplexity = 0
-    sympy_format = []
-    lambda_format = []
-    if output_jax_format:
-        jax_format = []
-    use_custom_variable_names = (len(variable_names) != 0)
-    local_sympy_mappings = {
-            **extra_sympy_mappings,
-            **sympy_mappings
-    }
-    if use_custom_variable_names:
-        sympy_symbols = [sympy.Symbol(variable_names[i]) for i in range(n_features)]
-    else:
-        sympy_symbols = [sympy.Symbol('x%d'%i) for i in range(n_features)]
-    for i in range(len(output)):
-        eqn = sympify(output.loc[i, 'Equation'], locals=local_sympy_mappings)
-        sympy_format.append(eqn)
         if output_jax_format:
-            func, params = sympy2jax(eqn, sympy_symbols)
-            jax_format.append({'callable': func, 'parameters': params})
-        lambda_format.append(lambdify(sympy_symbols, eqn))
-        curMSE = output.loc[i, 'MSE']
-        curComplexity = output.loc[i, 'Complexity']
-        if lastMSE is None:
-            cur_score = 0.0
-        else:
-            cur_score = - np.log(curMSE/lastMSE)/(curComplexity - lastComplexity)
-        scores.append(cur_score)
-        lastMSE = curMSE
-        lastComplexity = curComplexity
-    output['score'] = np.array(scores)
-    output['sympy_format'] = sympy_format
-    output['lambda_format'] = lambda_format
-    output_cols = ['Complexity', 'MSE', 'score', 'Equation', 'sympy_format', 'lambda_format']
-    if output_jax_format:
-        output_cols += ['jax_format']
-        output['jax_format'] = jax_format
-    return output[output_cols]
 def best_row(equations=None):
     """Return the best row of a hall of fame file using the score column.
     By default this uses the last equation file.
     """
     if equations is None: equations = get_hof()
-    best_idx = np.argmax(equations['score'])
-    return equations.iloc[best_idx]
 def best_tex(equations=None):
     """Return the equation with the best score, in latex format
     By default this uses the last equation file.
     """
     if equations is None: equations = get_hof()
-    best_sympy = best_row(equations)['sympy_format']
-    return sympy.latex(best_sympy.simplify())
 def best(equations=None):
     """Return the equation with the best score, in sympy format.
     By default this uses the last equation file.
     """
     if equations is None: equations = get_hof()
-    best_sympy = best_row(equations)['sympy_format']
-    return best_sympy.simplify()
 def best_callable(equations=None):
     """Return the equation with the best score, in callable format.
     By default this uses the last equation file.
     """
     if equations is None: equations = get_hof()
-    return best_row(equations)['lambda_format']
 def _escape_filename(filename):
     """Turns a file into a string representation with correctly escaped backslashes"""

 global_n_features = None
 global_variable_names = []
 global_extra_sympy_mappings = {}
+global_multioutput = False
+global_nout = 1
 sympy_mappings = {
     'div':  lambda x, y : x/y,
     if X is None:
         X, y = _using_test_input(X, test, y)
+    if len(y.shape) == 1 or (len(y.shape) == 2 and y.shape[1] == 1):
+        multioutput = False
+        nout = 1
+        y = y.reshape(-1)
+    elif len(y.shape) == 2:
+        multioutput = True
+        nout = y.shape[1]
+    else:
+        raise NotImplementedError("y shape not supported!")
     kwargs = dict(X=X, y=y, weights=weights,
                  alpha=alpha, annealing=annealing, batchSize=batchSize,
                  batching=batching, binary_operators=binary_operators,
                  constraints=constraints,
                  extra_sympy_mappings=extra_sympy_mappings,
                  julia_project=julia_project, loss=loss,
+                 output_jax_format=output_jax_format,
+                 multioutput=multioutput, nout=nout)
     kwargs = {**_set_paths(tempdir), **kwargs}
+def _set_globals(X, equation_file, extra_sympy_mappings, variable_names,
+                multioutput, nout, **kwargs):
     global global_n_features
     global global_equation_file
     global global_variable_names
     global global_extra_sympy_mappings
+    global global_multioutput
+    global global_nout
     global_n_features = X.shape[1]
     global_equation_file = equation_file
     global_variable_names = variable_names
     global_extra_sympy_mappings = extra_sympy_mappings
+    global_multioutput = multioutput
+    global_nout = nout
 def _final_pysr_process(julia_optimization, runfile_filename, timeout, **kwargs):
                                 .replace('\\r',      '\r')
                                 .encode(sys.stdout.encoding, errors='replace'))
+            sys.stdout.buffer.write(decoded_line)
         process.stdout.close()
         process.wait()
             print(f'EquationSearch(X, y, niterations={niterations:d}, varMap={varMap}, options=options, numprocs={procs})', file=f)
+def _make_datasets_julia_str(X, X_filename, weights, weights_filename, y, y_filename,
+                            multioutput, **kwargs):
     def_datasets = """using DelimitedFiles"""
     np.savetxt(X_filename, X.astype(np.float32), delimiter=',')
+    if multioutput:
+        np.savetxt(y_filename, y.astype(np.float32), delimiter=',')
+    else:
+        np.savetxt(y_filename, y.reshape(-1, 1).astype(np.float32), delimiter=',')
     if weights is not None:
+        if multioutput:
+            np.savetxt(weights_filename, weights.astype(np.float32), delimiter=',')
+        else:
+            np.savetxt(weights_filename, weights.reshape(-1, 1).astype(np.float32), delimiter=',')
     def_datasets += f"""
+X = copy(transpose(readdlm("{_escape_filename(X_filename)}", ',', Float32, '\\n')))"""
+    if multioutput:
+        def_datasets+= f"""
+y = copy(transpose(readdlm("{_escape_filename(y_filename)}", ',', Float32, '\\n')))"""
+    else:
+        def_datasets+= f"""
 y = readdlm("{_escape_filename(y_filename)}", ',', Float32, '\\n')[:, 1]"""
     if weights is not None:
+        if multioutput:
+            def_datasets += f"""
+weights = copy(transpose(readdlm("{_escape_filename(weights_filename)}", ',', Float32, '\\n')))"""
+        else:
+            def_datasets += f"""
 weights = readdlm("{_escape_filename(weights_filename)}", ',', Float32, '\\n')[:, 1]"""
     return def_datasets
     # Check for potential errors before they happen
     assert len(unary_operators) + len(binary_operators) > 0
     assert len(X.shape) == 2
+    assert len(y.shape) in [1, 2]
     assert X.shape[0] == y.shape[0]
     if weights is not None:
+        assert weights.shape == y.shape
         assert X.shape[0] == weights.shape[0]
     if use_custom_variable_names:
         assert len(variable_names) == X.shape[1]
     return selector.get_support(indices=True)
 def get_hof(equation_file=None, n_features=None, variable_names=None,
+            extra_sympy_mappings=None, output_jax_format=False,
+            multioutput=None, nout=None, **kwargs):
     """Get the equations from a hall of fame file. If no arguments
     entered, the ones used previously from a call to PySR will be used."""
     global global_equation_file
     global global_variable_names
     global global_extra_sympy_mappings
+    global global_multioutput
+    global global_nout
     if equation_file is None: equation_file = global_equation_file
     if n_features is None: n_features = global_n_features
     if variable_names is None: variable_names = global_variable_names
     if extra_sympy_mappings is None: extra_sympy_mappings = global_extra_sympy_mappings
+    if multioutput is None: multioutput = global_multioutput
+    if nout is None: nout = global_nout
     global_equation_file = equation_file
     global_n_features = n_features
     global_variable_names = variable_names
     global_extra_sympy_mappings = extra_sympy_mappings
+    global_multioutput = multioutput
+    global_nout = nout
     try:
+        if multioutput:
+            all_outputs = [pd.read_csv(f'out{i}_' + str(equation_file) + '.bkup', sep="|") for i in range(1, nout+1)]
+        else:
+            all_outputs = [pd.read_csv(str(equation_file) + '.bkup', sep="|")]
     except FileNotFoundError:
         raise RuntimeError("Couldn't find equation file! The equation search likely exited before a single iteration completed.")
+    ret_outputs = []
+    for output in all_outputs:
+        scores = []
+        lastMSE = None
+        lastComplexity = 0
+        sympy_format = []
+        lambda_format = []
         if output_jax_format:
+            jax_format = []
+        use_custom_variable_names = (len(variable_names) != 0)
+        local_sympy_mappings = {
+                **extra_sympy_mappings,
+                **sympy_mappings
+        }
+        if use_custom_variable_names:
+            sympy_symbols = [sympy.Symbol(variable_names[i]) for i in range(n_features)]
+        else:
+            sympy_symbols = [sympy.Symbol('x%d'%i) for i in range(n_features)]
+        for i in range(len(output)):
+            eqn = sympify(output.loc[i, 'Equation'], locals=local_sympy_mappings)
+            sympy_format.append(eqn)
+            if output_jax_format:
+                func, params = sympy2jax(eqn, sympy_symbols)
+                jax_format.append({'callable': func, 'parameters': params})
+            lambda_format.append(lambdify(sympy_symbols, eqn))
+            curMSE = output.loc[i, 'MSE']
+            curComplexity = output.loc[i, 'Complexity']
+            if lastMSE is None:
+                cur_score = 0.0
+            else:
+                cur_score = - np.log(curMSE/lastMSE)/(curComplexity - lastComplexity)
+            scores.append(cur_score)
+            lastMSE = curMSE
+            lastComplexity = curComplexity
+        output['score'] = np.array(scores)
+        output['sympy_format'] = sympy_format
+        output['lambda_format'] = lambda_format
+        output_cols = ['Complexity', 'MSE', 'score', 'Equation', 'sympy_format', 'lambda_format']
+        if output_jax_format:
+            output_cols += ['jax_format']
+            output['jax_format'] = jax_format
+        ret_outputs.append(output[output_cols])
+    if multioutput:
+        return ret_outputs
+    else:
+        return ret_outputs[0]
 def best_row(equations=None):
     """Return the best row of a hall of fame file using the score column.
     By default this uses the last equation file.
     """
     if equations is None: equations = get_hof()
+    if isinstance(equations, list):
+        return [eq.iloc[np.argmax(eq['score'])] for eq in equations]
+    else:
+        return equations.iloc[np.argmax(equations['score'])]
 def best_tex(equations=None):
     """Return the equation with the best score, in latex format
     By default this uses the last equation file.
     """
     if equations is None: equations = get_hof()
+    if isinstance(equations, list):
+        return [sympy.latex(best_row(eq)['sympy_format'].simplify()) for eq in equations]
+    else:
+        return sympy.latex(best_row(equations)['sympy_format'].simplify())
 def best(equations=None):
     """Return the equation with the best score, in sympy format.
     By default this uses the last equation file.
     """
     if equations is None: equations = get_hof()
+    if isinstance(equations, list):
+        return [best_row(eq)['sympy_format'].simplify() for eq in equations]
+    else:
+        return best_row(equations)['sympy_format'].simplify()
 def best_callable(equations=None):
     """Return the equation with the best score, in callable format.
     By default this uses the last equation file.
     """
     if equations is None: equations = get_hof()
+    if isinstance(equations, list):
+        return [best_row(eq)['lambda_format'] for eq in equations]
+    else:
+        return best_row(equations)['lambda_format']
 def _escape_filename(filename):
     """Turns a file into a string representation with correctly escaped backslashes"""

test/test.py CHANGED Viewed

@@ -17,14 +17,15 @@ equations = pysr(X, y, **default_test_kwargs)
 print(equations)
 assert equations.iloc[-1]['MSE'] < 1e-4
-print("Test 2 - test custom operator")
-y = X[:, 0]**2
 equations = pysr(X, y,
                  unary_operators=["sq(x) = x^2"], binary_operators=["plus"],
                  extra_sympy_mappings={'square': lambda x: x**2},
                  **default_test_kwargs)
 print(equations)
-assert equations.iloc[-1]['MSE'] < 1e-4
 X = np.random.randn(100, 1)
 y = X[:, 0] + 3.0

 print(equations)
 assert equations.iloc[-1]['MSE'] < 1e-4
+print("Test 2 - test custom operator, and multiple outputs")
+y = X[:, [0, 1]]**2
 equations = pysr(X, y,
                  unary_operators=["sq(x) = x^2"], binary_operators=["plus"],
                  extra_sympy_mappings={'square': lambda x: x**2},
                  **default_test_kwargs)
 print(equations)
+assert equations[0].iloc[-1]['MSE'] < 1e-4
+assert equations[1].iloc[-1]['MSE'] < 1e-4
 X = np.random.randn(100, 1)
 y = X[:, 0] + 3.0