import multiprocessing as mp
import shutil
import tempfile
import time
from pathlib import Path

import numpy as np
import pandas as pd

from .data import generate_data
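

# Factory for an empty hall-of-fame table, returned when input validation fails.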
EMPTY_DF = lambda: pd.DataFrame(
    {
        "Equation": [],
        "Loss": [],
        "Complexity": [],
    }
)


def process(
    file_input,
    force_run,
    test_equation,
    num_points,
    noise_level,
    data_seed,
    niterations,
    maxsize,
    binary_operators,
    unary_operators,
    plot_update_delay,
    parsimony,
    populations,
    population_size,
    ncycles_per_iteration,
    elementwise_loss,
    adaptive_parsimony_scaling,
    optimizer_algorithm,
    optimizer_iterations,
    batching,
    batch_size,
):
"""Load data, then spawn a process to run the greet function.""" | |
    if file_input is not None:
        # Look at some statistics of the file:
        df = pd.read_csv(file_input)
        if len(df) == 0:
            return (
                EMPTY_DF(),
                "The file is empty!",
            )
        if len(df.columns) == 1:
            return (
                EMPTY_DF(),
                "The file has only one column!",
            )
        if len(df) > 10_000 and not force_run:
            return (
                EMPTY_DF(),
                "You have uploaded a file with more than 10,000 rows. "
                "This will take a very long time to run. "
                "Please upload a subsample of the data, "
                "or check the box 'Ignore Warnings'.",
            )
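        # Treat the last column as the target and all remaining columns as features.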
        col_to_fit = df.columns[-1]
        y = np.array(df[col_to_fit])
        X = df.drop([col_to_fit], axis=1)
    else:
        X, y = generate_data(test_equation, num_points, noise_level, data_seed)

    with tempfile.TemporaryDirectory() as tmpdirname:
        base = Path(tmpdirname)
        equation_file = base / "hall_of_fame.csv"
        equation_file_bkup = base / "hall_of_fame.csv.bkup"
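        # PySR periodically saves its current hall of fame to the ".bkup" file
        # while the search is still running; the loop below polls that file to
        # stream partial results back to the caller.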
        process = mp.Process(
            target=pysr_fit,
            kwargs=dict(
                X=X,
                y=y,
                niterations=niterations,
                maxsize=maxsize,
                binary_operators=binary_operators,
                unary_operators=unary_operators,
                equation_file=equation_file,
                parsimony=parsimony,
                populations=populations,
                population_size=population_size,
                ncycles_per_iteration=ncycles_per_iteration,
                elementwise_loss=elementwise_loss,
                adaptive_parsimony_scaling=adaptive_parsimony_scaling,
                optimizer_algorithm=optimizer_algorithm,
                optimizer_iterations=optimizer_iterations,
                batching=batching,
                batch_size=batch_size,
            ),
        )
        process.start()
        last_yield_time = None
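        # Poll while the search is running: each pass re-reads the backup file
        # and yields the current Pareto front for plotting.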
        while process.is_alive():
            if equation_file_bkup.exists():
                try:
                    # Copy the backup to a snapshot file first, so we read a
                    # stable version even if PySR is mid-write.
                    equation_file_copy = base / "hall_of_fame_copy.csv"
                    shutil.copyfile(equation_file_bkup, equation_file_copy)
                    equations = pd.read_csv(equation_file_copy)
                    # Keep only the Pareto frontier: as complexity increases,
                    # loss must strictly decrease. Drop any dominated rows.
                    # TODO: Not sure why dominated rows occur; could be the result of a late copy?
                    equations.sort_values("Complexity", ascending=True, inplace=True)
                    equations.reset_index(inplace=True)
                    bad_idx = []
                    min_loss = None
                    for i in equations.index:
                        if min_loss is None or equations.loc[i, "Loss"] < min_loss:
                            min_loss = float(equations.loc[i, "Loss"])
                        else:
                            bad_idx.append(i)
                    equations.drop(index=bad_idx, inplace=True)
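                    # Throttle updates: wait until at least plot_update_delay
                    # seconds have passed since the last yield.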
                    while (
                        last_yield_time is not None
                        and time.time() - last_yield_time < plot_update_delay
                    ):
                        time.sleep(0.1)
                    yield equations[["Complexity", "Loss", "Equation"]]
                    last_yield_time = time.time()
                except pd.errors.EmptyDataError:
                    # The backup file may be empty or mid-write; try again on the next pass.
                    pass

        process.join()


def pysr_fit(
    *,
    X,
    y,
    **pysr_kwargs,
):
    """Fit a PySRRegressor in the worker process; progress is written to the equation file."""
    # Import here so PySR is loaded inside the worker process rather than the parent.
    import pysr

    model = pysr.PySRRegressor(
        progress=False,
        timeout_in_seconds=1000,
        **pysr_kwargs,
    )
    model.fit(X, y)
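

# Minimal usage sketch (assumptions: in the real app these arguments come from a
# UI layer, and every value below is an illustrative placeholder; in particular,
# the accepted test_equation format depends on generate_data() in the local .data
# module). Kept as a comment because the relative import above means this module
# only runs as part of its package.
#
# for frontier in process(
#     file_input=None,  # use the synthetic-data branch
#     force_run=False,
#     test_equation="2 * x0 + cos(3 * x1)",  # hypothetical demo equation
#     num_points=200,
#     noise_level=0.1,
#     data_seed=0,
#     niterations=10,
#     maxsize=20,
#     binary_operators=["+", "-", "*", "/"],
#     unary_operators=["cos"],
#     plot_update_delay=1.0,
#     parsimony=0.001,
#     populations=15,
#     population_size=33,
#     ncycles_per_iteration=550,
#     elementwise_loss="L2DistLoss()",
#     adaptive_parsimony_scaling=20.0,
#     optimizer_algorithm="BFGS",
#     optimizer_iterations=8,
#     batching=False,
#     batch_size=50,
# ):
#     print(frontier)  # latest Pareto front: Complexity, Loss, Equation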