Spaces:
Sleeping
Sleeping
MilesCranmer
committed on
Commit
•
bf37f2a
1
Parent(s):
ac2e8e0
Allow early quit + can pretty-print best equation
Browse files
- pysr/__init__.py +1 -1
- pysr/sr.py +103 -34
pysr/__init__.py
CHANGED
@@ -1 +1 @@
|
|
1 |
-
from .sr import pysr
|
|
|
1 |
+
from .sr import pysr, get_hof, best, best_tex, best_function
|
pysr/sr.py
CHANGED
@@ -6,13 +6,19 @@ import numpy as np
|
|
6 |
import pandas as pd
|
7 |
import sympy
|
8 |
from sympy import sympify, Symbol, lambdify
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
sympy_mappings = {
|
11 |
'div': lambda x, y : x/y,
|
12 |
'mult': lambda x, y : x*y,
|
13 |
'plus': lambda x, y : x + y,
|
14 |
'neg': lambda x : -x,
|
15 |
-
'pow': lambda x, y : sympy.sign(x)*
|
16 |
'cos': lambda x : sympy.cos(x),
|
17 |
'sin': lambda x : sympy.sin(x),
|
18 |
'tan': lambda x : sympy.tan(x),
|
@@ -26,13 +32,13 @@ sympy_mappings = {
|
|
26 |
'acosh':lambda x : sympy.acosh(x),
|
27 |
'asinh':lambda x : sympy.asinh(x),
|
28 |
'atanh':lambda x : sympy.atanh(x),
|
29 |
-
'abs': lambda x :
|
30 |
'mod': lambda x, y : sympy.Mod(x, y),
|
31 |
'erf': lambda x : sympy.erf(x),
|
32 |
'erfc': lambda x : sympy.erfc(x),
|
33 |
-
'logm': lambda x : sympy.log(
|
34 |
-
'logm10':lambda x : sympy.log10(
|
35 |
-
'logm2': lambda x : sympy.log2(
|
36 |
'log1p': lambda x : sympy.log(x + 1),
|
37 |
'floor': lambda x : sympy.floor(x),
|
38 |
'ceil': lambda x : sympy.ceil(x),
|
@@ -189,11 +195,6 @@ def pysr(X=None, y=None, weights=None,
|
|
189 |
if populations is None:
|
190 |
populations = procs
|
191 |
|
192 |
-
local_sympy_mappings = {
|
193 |
-
**extra_sympy_mappings,
|
194 |
-
**sympy_mappings
|
195 |
-
}
|
196 |
-
|
197 |
rand_string = f'{"".join([str(np.random.rand())[2] for i in range(20)])}'
|
198 |
|
199 |
if isinstance(binary_operators, str): binary_operators = [binary_operators]
|
@@ -302,17 +303,64 @@ const varMap = {'["' + '", "'.join(variable_names) + '"]'}"""
|
|
302 |
|
303 |
|
304 |
command = [
|
305 |
-
f'julia -O{julia_optimization:d}',
|
306 |
-
f'-p {procs}',
|
307 |
f'/tmp/.runfile_{rand_string}.jl',
|
308 |
]
|
309 |
if timeout is not None:
|
310 |
-
command = [f'timeout {timeout}'] + command
|
311 |
-
|
312 |
-
|
313 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
314 |
try:
|
315 |
-
output = pd.read_csv(equation_file, sep="|")
|
316 |
except FileNotFoundError:
|
317 |
print("Couldn't find equation file!")
|
318 |
return pd.DataFrame()
|
@@ -322,10 +370,17 @@ const varMap = {'["' + '", "'.join(variable_names) + '"]'}"""
|
|
322 |
lastComplexity = 0
|
323 |
sympy_format = []
|
324 |
lambda_format = []
|
|
|
|
|
|
|
|
|
|
|
|
|
325 |
if use_custom_variable_names:
|
326 |
-
sympy_symbols = [sympy.Symbol(variable_names[i]) for i in range(
|
327 |
else:
|
328 |
-
sympy_symbols = [sympy.Symbol('x%d'%i) for i in range(
|
|
|
329 |
for i in range(len(output)):
|
330 |
eqn = sympify(output.loc[i, 'Equation'], locals=local_sympy_mappings)
|
331 |
sympy_format.append(eqn)
|
@@ -342,25 +397,39 @@ const varMap = {'["' + '", "'.join(variable_names) + '"]'}"""
|
|
342 |
lastMSE = curMSE
|
343 |
lastComplexity = curComplexity
|
344 |
|
345 |
-
|
346 |
output['score'] = np.array(scores)
|
347 |
output['sympy_format'] = sympy_format
|
348 |
output['lambda_format'] = lambda_format
|
349 |
-
return output[['Complexity', 'MSE', 'score', 'Equation', 'sympy_format', 'lambda_format']]
|
350 |
-
|
351 |
|
352 |
-
|
353 |
-
"""Use a gradient boosting tree regressor as a proxy for finding
|
354 |
-
the k most important features in X, returning indices for those
|
355 |
-
features as output."""
|
356 |
-
|
357 |
-
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
|
358 |
-
from sklearn.feature_selection import SelectFromModel, SelectKBest
|
359 |
|
360 |
-
|
361 |
-
|
362 |
-
|
363 |
-
|
364 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
365 |
|
366 |
|
|
|
6 |
import pandas as pd
|
7 |
import sympy
|
8 |
from sympy import sympify, Symbol, lambdify
|
9 |
+
import subprocess
|
10 |
+
|
11 |
+
global_equation_file = 'hall_of_fame.csv'
|
12 |
+
global_n_features = None
|
13 |
+
global_variable_names = []
|
14 |
+
global_extra_sympy_mappings = {}
|
15 |
|
16 |
sympy_mappings = {
|
17 |
'div': lambda x, y : x/y,
|
18 |
'mult': lambda x, y : x*y,
|
19 |
'plus': lambda x, y : x + y,
|
20 |
'neg': lambda x : -x,
|
21 |
+
'pow': lambda x, y : sympy.sign(x)*abs(x)**y,
|
22 |
'cos': lambda x : sympy.cos(x),
|
23 |
'sin': lambda x : sympy.sin(x),
|
24 |
'tan': lambda x : sympy.tan(x),
|
|
|
32 |
'acosh':lambda x : sympy.acosh(x),
|
33 |
'asinh':lambda x : sympy.asinh(x),
|
34 |
'atanh':lambda x : sympy.atanh(x),
|
35 |
+
'abs': lambda x : abs(x),
|
36 |
'mod': lambda x, y : sympy.Mod(x, y),
|
37 |
'erf': lambda x : sympy.erf(x),
|
38 |
'erfc': lambda x : sympy.erfc(x),
|
39 |
+
'logm': lambda x : sympy.log(abs(x)),
|
40 |
+
'logm10':lambda x : sympy.log10(abs(x)),
|
41 |
+
'logm2': lambda x : sympy.log2(abs(x)),
|
42 |
'log1p': lambda x : sympy.log(x + 1),
|
43 |
'floor': lambda x : sympy.floor(x),
|
44 |
'ceil': lambda x : sympy.ceil(x),
|
|
|
195 |
if populations is None:
|
196 |
populations = procs
|
197 |
|
|
|
|
|
|
|
|
|
|
|
198 |
rand_string = f'{"".join([str(np.random.rand())[2] for i in range(20)])}'
|
199 |
|
200 |
if isinstance(binary_operators, str): binary_operators = [binary_operators]
|
|
|
303 |
|
304 |
|
305 |
command = [
|
306 |
+
f'julia', f'-O{julia_optimization:d}',
|
307 |
+
f'-p', f'{procs}',
|
308 |
f'/tmp/.runfile_{rand_string}.jl',
|
309 |
]
|
310 |
if timeout is not None:
|
311 |
+
command = [f'timeout', f'{timeout}'] + command
|
312 |
+
|
313 |
+
global global_n_features
|
314 |
+
global global_equation_file
|
315 |
+
global global_variable_names
|
316 |
+
global global_extra_sympy_mappings
|
317 |
+
|
318 |
+
global_n_features = X.shape[1]
|
319 |
+
global_equation_file = equation_file
|
320 |
+
global_variable_names = variable_names
|
321 |
+
global_extra_sympy_mappings = extra_sympy_mappings
|
322 |
+
|
323 |
+
print("Running on", ' '.join(command))
|
324 |
+
process = subprocess.Popen(command)
|
325 |
+
while True:
|
326 |
+
try:
|
327 |
+
process.wait()
|
328 |
+
except KeyboardInterrupt:
|
329 |
+
process.kill()
|
330 |
+
|
331 |
+
return get_hof()
|
332 |
+
|
333 |
+
|
334 |
+
def run_feature_selection(X, y, select_k_features):
|
335 |
+
"""Use a gradient boosting tree regressor as a proxy for finding
|
336 |
+
the k most important features in X, returning indices for those
|
337 |
+
features as output."""
|
338 |
+
|
339 |
+
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
|
340 |
+
from sklearn.feature_selection import SelectFromModel, SelectKBest
|
341 |
+
|
342 |
+
clf = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=1, random_state=0, loss='ls') #RandomForestRegressor()
|
343 |
+
clf.fit(X, y)
|
344 |
+
selector = SelectFromModel(clf, threshold=-np.inf,
|
345 |
+
max_features=select_k_features, prefit=True)
|
346 |
+
return selector.get_support(indices=True)
|
347 |
+
|
348 |
+
def get_hof(equation_file=None, n_features=None, variable_names=None, extra_sympy_mappings=None):
|
349 |
+
"""Get the equations from a hall of fame file. If no arguments
|
350 |
+
entered, the ones used previously from a call to PySR will be used."""
|
351 |
+
|
352 |
+
global global_n_features
|
353 |
+
global global_equation_file
|
354 |
+
global global_variable_names
|
355 |
+
global global_extra_sympy_mappings
|
356 |
+
|
357 |
+
if equation_file is None: equation_file = global_equation_file
|
358 |
+
if n_features is None: n_features = global_n_features
|
359 |
+
if variable_names is None: variable_names = global_variable_names
|
360 |
+
if extra_sympy_mappings is None: extra_sympy_mappings = global_extra_sympy_mappings
|
361 |
+
|
362 |
try:
|
363 |
+
output = pd.read_csv(equation_file + '.bkup', sep="|")
|
364 |
except FileNotFoundError:
|
365 |
print("Couldn't find equation file!")
|
366 |
return pd.DataFrame()
|
|
|
370 |
lastComplexity = 0
|
371 |
sympy_format = []
|
372 |
lambda_format = []
|
373 |
+
use_custom_variable_names = (len(variable_names) != 0)
|
374 |
+
local_sympy_mappings = {
|
375 |
+
**extra_sympy_mappings,
|
376 |
+
**sympy_mappings
|
377 |
+
}
|
378 |
+
|
379 |
if use_custom_variable_names:
|
380 |
+
sympy_symbols = [sympy.Symbol(variable_names[i]) for i in range(n_features)]
|
381 |
else:
|
382 |
+
sympy_symbols = [sympy.Symbol('x%d'%i) for i in range(n_features)]
|
383 |
+
|
384 |
for i in range(len(output)):
|
385 |
eqn = sympify(output.loc[i, 'Equation'], locals=local_sympy_mappings)
|
386 |
sympy_format.append(eqn)
|
|
|
397 |
lastMSE = curMSE
|
398 |
lastComplexity = curComplexity
|
399 |
|
|
|
400 |
output['score'] = np.array(scores)
|
401 |
output['sympy_format'] = sympy_format
|
402 |
output['lambda_format'] = lambda_format
|
|
|
|
|
403 |
|
404 |
+
return output[['Complexity', 'MSE', 'score', 'Equation', 'sympy_format', 'lambda_format']]
|
|
|
|
|
|
|
|
|
|
|
|
|
405 |
|
406 |
+
def best_row(equations=None):
|
407 |
+
"""Return the best columns of a hall of fame file using the score column."""
|
408 |
+
if equations is None: equations = get_hof()
|
409 |
+
best_idx = np.argmax(equations['score'])
|
410 |
+
return equations.iloc[best_idx]
|
411 |
+
|
412 |
+
def best_tex(equations=None):
|
413 |
+
"""Return the equation with the best score, in latex format"""
|
414 |
+
if equations is None: equations = get_hof()
|
415 |
+
best_sympy = best_row(equations)['sympy_format']
|
416 |
+
return sympy.latex(best_sympy.simplify())
|
417 |
+
|
418 |
+
def best(equations=None):
|
419 |
+
"""Return the equation with the best score, in latex format"""
|
420 |
+
if equations is None: equations = get_hof()
|
421 |
+
best_sympy = best_row(equations)['sympy_format']
|
422 |
+
return best_sympy.simplify()
|
423 |
+
|
424 |
+
def best_tex(equations=None):
|
425 |
+
"""Return the equation with the best score, in latex format"""
|
426 |
+
if equations is None: equations = get_hof()
|
427 |
+
best_sympy = best_row(equations)['sympy_format']
|
428 |
+
return sympy.latex(best_sympy.simplify())
|
429 |
+
|
430 |
+
def best_function(equations=None):
|
431 |
+
"""Return the equation with the best score, in callable format"""
|
432 |
+
if equations is None: equations = get_hof()
|
433 |
+
return best_row(equations)['lambda_format']
|
434 |
|
435 |
|