MilesCranmer committed on
Commit
bf37f2a
1 Parent(s): ac2e8e0

Allow early quit + can pretty-print best equation

Browse files
Files changed (2) hide show
  1. pysr/__init__.py +1 -1
  2. pysr/sr.py +103 -34
pysr/__init__.py CHANGED
@@ -1 +1 @@
1
- from .sr import pysr
 
1
+ from .sr import pysr, get_hof, best, best_tex, best_function
pysr/sr.py CHANGED
@@ -6,13 +6,19 @@ import numpy as np
6
  import pandas as pd
7
  import sympy
8
  from sympy import sympify, Symbol, lambdify
 
 
 
 
 
 
9
 
10
  sympy_mappings = {
11
  'div': lambda x, y : x/y,
12
  'mult': lambda x, y : x*y,
13
  'plus': lambda x, y : x + y,
14
  'neg': lambda x : -x,
15
- 'pow': lambda x, y : sympy.sign(x)*sympy.Abs(x)**y,
16
  'cos': lambda x : sympy.cos(x),
17
  'sin': lambda x : sympy.sin(x),
18
  'tan': lambda x : sympy.tan(x),
@@ -26,13 +32,13 @@ sympy_mappings = {
26
  'acosh':lambda x : sympy.acosh(x),
27
  'asinh':lambda x : sympy.asinh(x),
28
  'atanh':lambda x : sympy.atanh(x),
29
- 'abs': lambda x : sympy.Abs(x),
30
  'mod': lambda x, y : sympy.Mod(x, y),
31
  'erf': lambda x : sympy.erf(x),
32
  'erfc': lambda x : sympy.erfc(x),
33
- 'logm': lambda x : sympy.log(sympy.Abs(x)),
34
- 'logm10':lambda x : sympy.log10(sympy.Abs(x)),
35
- 'logm2': lambda x : sympy.log2(sympy.Abs(x)),
36
  'log1p': lambda x : sympy.log(x + 1),
37
  'floor': lambda x : sympy.floor(x),
38
  'ceil': lambda x : sympy.ceil(x),
@@ -189,11 +195,6 @@ def pysr(X=None, y=None, weights=None,
189
  if populations is None:
190
  populations = procs
191
 
192
- local_sympy_mappings = {
193
- **extra_sympy_mappings,
194
- **sympy_mappings
195
- }
196
-
197
  rand_string = f'{"".join([str(np.random.rand())[2] for i in range(20)])}'
198
 
199
  if isinstance(binary_operators, str): binary_operators = [binary_operators]
@@ -302,17 +303,64 @@ const varMap = {'["' + '", "'.join(variable_names) + '"]'}"""
302
 
303
 
304
  command = [
305
- f'julia -O{julia_optimization:d}',
306
- f'-p {procs}',
307
  f'/tmp/.runfile_{rand_string}.jl',
308
  ]
309
  if timeout is not None:
310
- command = [f'timeout {timeout}'] + command
311
- cur_cmd = ' '.join(command)
312
- print("Running on", cur_cmd)
313
- os.system(cur_cmd)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
  try:
315
- output = pd.read_csv(equation_file, sep="|")
316
  except FileNotFoundError:
317
  print("Couldn't find equation file!")
318
  return pd.DataFrame()
@@ -322,10 +370,17 @@ const varMap = {'["' + '", "'.join(variable_names) + '"]'}"""
322
  lastComplexity = 0
323
  sympy_format = []
324
  lambda_format = []
 
 
 
 
 
 
325
  if use_custom_variable_names:
326
- sympy_symbols = [sympy.Symbol(variable_names[i]) for i in range(X.shape[1])]
327
  else:
328
- sympy_symbols = [sympy.Symbol('x%d'%i) for i in range(X.shape[1])]
 
329
  for i in range(len(output)):
330
  eqn = sympify(output.loc[i, 'Equation'], locals=local_sympy_mappings)
331
  sympy_format.append(eqn)
@@ -342,25 +397,39 @@ const varMap = {'["' + '", "'.join(variable_names) + '"]'}"""
342
  lastMSE = curMSE
343
  lastComplexity = curComplexity
344
 
345
-
346
  output['score'] = np.array(scores)
347
  output['sympy_format'] = sympy_format
348
  output['lambda_format'] = lambda_format
349
- return output[['Complexity', 'MSE', 'score', 'Equation', 'sympy_format', 'lambda_format']]
350
-
351
 
352
- def run_feature_selection(X, y, select_k_features):
353
- """Use a gradient boosting tree regressor as a proxy for finding
354
- the k most important features in X, returning indices for those
355
- features as output."""
356
-
357
- from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
358
- from sklearn.feature_selection import SelectFromModel, SelectKBest
359
 
360
- clf = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=1, random_state=0, loss='ls') #RandomForestRegressor()
361
- clf.fit(X, y)
362
- selector = SelectFromModel(clf, threshold=-np.inf,
363
- max_features=select_k_features, prefit=True)
364
- return selector.get_support(indices=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
365
 
366
 
 
6
  import pandas as pd
7
  import sympy
8
  from sympy import sympify, Symbol, lambdify
9
+ import subprocess
10
+
11
+ global_equation_file = 'hall_of_fame.csv'
12
+ global_n_features = None
13
+ global_variable_names = []
14
+ global_extra_sympy_mappings = {}
15
 
16
  sympy_mappings = {
17
  'div': lambda x, y : x/y,
18
  'mult': lambda x, y : x*y,
19
  'plus': lambda x, y : x + y,
20
  'neg': lambda x : -x,
21
+ 'pow': lambda x, y : sympy.sign(x)*abs(x)**y,
22
  'cos': lambda x : sympy.cos(x),
23
  'sin': lambda x : sympy.sin(x),
24
  'tan': lambda x : sympy.tan(x),
 
32
  'acosh':lambda x : sympy.acosh(x),
33
  'asinh':lambda x : sympy.asinh(x),
34
  'atanh':lambda x : sympy.atanh(x),
35
+ 'abs': lambda x : abs(x),
36
  'mod': lambda x, y : sympy.Mod(x, y),
37
  'erf': lambda x : sympy.erf(x),
38
  'erfc': lambda x : sympy.erfc(x),
39
+ 'logm': lambda x : sympy.log(abs(x)),
40
+ 'logm10':lambda x : sympy.log10(abs(x)),
41
+ 'logm2': lambda x : sympy.log2(abs(x)),
42
  'log1p': lambda x : sympy.log(x + 1),
43
  'floor': lambda x : sympy.floor(x),
44
  'ceil': lambda x : sympy.ceil(x),
 
195
  if populations is None:
196
  populations = procs
197
 
 
 
 
 
 
198
  rand_string = f'{"".join([str(np.random.rand())[2] for i in range(20)])}'
199
 
200
  if isinstance(binary_operators, str): binary_operators = [binary_operators]
 
303
 
304
 
305
  command = [
306
+ f'julia', f'-O{julia_optimization:d}',
307
+ f'-p', f'{procs}',
308
  f'/tmp/.runfile_{rand_string}.jl',
309
  ]
310
  if timeout is not None:
311
+ command = [f'timeout', f'{timeout}'] + command
312
+
313
+ global global_n_features
314
+ global global_equation_file
315
+ global global_variable_names
316
+ global global_extra_sympy_mappings
317
+
318
+ global_n_features = X.shape[1]
319
+ global_equation_file = equation_file
320
+ global_variable_names = variable_names
321
+ global_extra_sympy_mappings = extra_sympy_mappings
322
+
323
+ print("Running on", ' '.join(command))
324
+ process = subprocess.Popen(command)
325
+ while True:
326
+ try:
327
+ process.wait()
328
+ except KeyboardInterrupt:
329
+ process.kill()
330
+
331
+ return get_hof()
332
+
333
+
334
def run_feature_selection(X, y, select_k_features):
    """Use a gradient boosting tree regressor as a proxy for finding
    the k most important features in X, returning indices for those
    features as output.

    :param X: 2D array-like of shape (n_samples, n_features).
    :param y: 1D array-like of targets, length n_samples.
    :param select_k_features: number of feature columns to keep.
    :returns: integer indices of the selected columns of X.
    """
    # Imported lazily so sklearn is only required when feature
    # selection is actually requested. (Removed the unused
    # RandomForestRegressor and SelectKBest imports.)
    from sklearn.ensemble import GradientBoostingRegressor
    from sklearn.feature_selection import SelectFromModel

    # Shallow stumps (max_depth=1) give per-feature importances.
    clf = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,
                                    max_depth=1, random_state=0, loss='ls')
    clf.fit(X, y)
    # threshold=-inf disables the importance cutoff, so max_features
    # alone limits the selection to exactly select_k_features columns.
    selector = SelectFromModel(clf, threshold=-np.inf,
                               max_features=select_k_features, prefit=True)
    return selector.get_support(indices=True)
347
+
348
+ def get_hof(equation_file=None, n_features=None, variable_names=None, extra_sympy_mappings=None):
349
+ """Get the equations from a hall of fame file. If no arguments
350
+ entered, the ones used previously from a call to PySR will be used."""
351
+
352
+ global global_n_features
353
+ global global_equation_file
354
+ global global_variable_names
355
+ global global_extra_sympy_mappings
356
+
357
+ if equation_file is None: equation_file = global_equation_file
358
+ if n_features is None: n_features = global_n_features
359
+ if variable_names is None: variable_names = global_variable_names
360
+ if extra_sympy_mappings is None: extra_sympy_mappings = global_extra_sympy_mappings
361
+
362
  try:
363
+ output = pd.read_csv(equation_file + '.bkup', sep="|")
364
  except FileNotFoundError:
365
  print("Couldn't find equation file!")
366
  return pd.DataFrame()
 
370
  lastComplexity = 0
371
  sympy_format = []
372
  lambda_format = []
373
+ use_custom_variable_names = (len(variable_names) != 0)
374
+ local_sympy_mappings = {
375
+ **extra_sympy_mappings,
376
+ **sympy_mappings
377
+ }
378
+
379
  if use_custom_variable_names:
380
+ sympy_symbols = [sympy.Symbol(variable_names[i]) for i in range(n_features)]
381
  else:
382
+ sympy_symbols = [sympy.Symbol('x%d'%i) for i in range(n_features)]
383
+
384
  for i in range(len(output)):
385
  eqn = sympify(output.loc[i, 'Equation'], locals=local_sympy_mappings)
386
  sympy_format.append(eqn)
 
397
  lastMSE = curMSE
398
  lastComplexity = curComplexity
399
 
 
400
  output['score'] = np.array(scores)
401
  output['sympy_format'] = sympy_format
402
  output['lambda_format'] = lambda_format
 
 
403
 
404
+ return output[['Complexity', 'MSE', 'score', 'Equation', 'sympy_format', 'lambda_format']]
 
 
 
 
 
 
405
 
406
def best_row(equations=None):
    """Return the hall-of-fame row with the highest score.

    :param equations: DataFrame with a 'score' column; defaults to the
        table from the most recent PySR run (via get_hof()).
    :returns: the pandas Series for the best-scoring row.
    """
    if equations is None:
        equations = get_hof()
    return equations.iloc[np.argmax(equations['score'])]
411
+
412
def best_tex(equations=None):
    """Return the equation with the best score, in latex format"""
    if equations is None:
        equations = get_hof()
    expr = best_row(equations)['sympy_format']
    return sympy.latex(expr.simplify())
417
+
418
def best(equations=None):
    """Return the equation with the best score, as a simplified sympy
    expression.

    (Fixes the copy-pasted docstring, which wrongly claimed latex
    output; use best_tex() for the latex string.)

    :param equations: DataFrame with 'score' and 'sympy_format'
        columns; defaults to the most recent PySR run via get_hof().
    """
    if equations is None:
        equations = get_hof()
    best_sympy = best_row(equations)['sympy_format']
    return best_sympy.simplify()
423
+
424
def best_tex(equations=None):
    """Return the equation with the best score, in latex format

    NOTE(review): this is an exact duplicate of the best_tex defined
    earlier in this file; one of the two copies should be deleted.
    """
    if equations is None:
        equations = get_hof()
    best_sympy = best_row(equations)['sympy_format']
    return sympy.latex(best_sympy.simplify())
429
+
430
def best_function(equations=None):
    """Return the equation with the best score, in callable format

    :param equations: DataFrame with 'score' and 'lambda_format'
        columns; defaults to the most recent PySR run via get_hof().
    """
    if equations is None:
        equations = get_hof()
    row = best_row(equations)
    return row['lambda_format']
434
 
435