Spaces:
Running
Running
File size: 4,670 Bytes
9fa2182 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 |
import multiprocessing as mp
import os
import tempfile
import time
from pathlib import Path
import numpy as np
import pandas as pd
from .data import generate_data
def EMPTY_DF():
    """Return a new, zero-row equations table.

    The frame has the columns "Equation", "Loss" and "Complexity" (in that
    order). A separate DataFrame is built on every call, so callers never
    share a mutable object.
    """
    column_names = ("Equation", "Loss", "Complexity")
    return pd.DataFrame({name: [] for name in column_names})
def process(
    file_input,
    force_run,
    test_equation,
    num_points,
    noise_level,
    data_seed,
    niterations,
    maxsize,
    binary_operators,
    unary_operators,
    plot_update_delay,
    parsimony,
    populations,
    population_size,
    ncycles_per_iteration,
    elementwise_loss,
    adaptive_parsimony_scaling,
    optimizer_algorithm,
    optimizer_iterations,
    batching,
    batch_size,
):
    """Load data, run ``pysr_fit`` in a child process, and stream its results.

    This is a generator. While the child process is alive it polls the
    backup hall-of-fame CSV inside a temporary directory, filters the rows,
    and yields the filtered table.

    Args:
        file_input: Path or file object passed to ``pd.read_csv``. The last
            column is used as the target ``y`` and the remaining columns as
            ``X``. When ``None``, data comes from ``generate_data``.
        force_run: When true, skips the refusal of files with more than
            10,000 rows.
        test_equation, num_points, noise_level, data_seed: Forwarded to
            ``generate_data`` when ``file_input`` is ``None``.
        plot_update_delay: Minimum number of seconds between two
            consecutive yields.
        niterations, maxsize, binary_operators, unary_operators, parsimony,
        populations, population_size, ncycles_per_iteration,
        elementwise_loss, adaptive_parsimony_scaling, optimizer_algorithm,
        optimizer_iterations, batching, batch_size: Forwarded unchanged as
            keyword arguments to ``pysr_fit``.

    Yields:
        pandas.DataFrame: The columns "Complexity", "Loss" and "Equation",
        sorted by complexity and restricted to rows whose loss is strictly
        lower than that of every simpler row.

    Returns:
        On invalid file input, a ``(empty DataFrame, message)`` tuple. See
        the NOTE(review) below: in a generator this value is not yielded.
    """
    # Imported here so the function needs no new file-level import.
    import shutil

    poll_interval = 0.1  # seconds between checks while waiting for output

    if file_input is not None:
        # NOTE(review): this function contains ``yield`` and is therefore a
        # generator. The ``return (df, message)`` statements below end the
        # generator and attach the tuple to ``StopIteration.value``; a
        # consumer that simply iterates never sees these messages. Confirm
        # how the caller is expected to surface them before changing this.
        df = pd.read_csv(file_input)
        if len(df) == 0:
            return (
                EMPTY_DF(),
                "The file is empty!",
            )
        if len(df.columns) == 1:
            return (
                EMPTY_DF(),
                "The file has only one column!",
            )
        if len(df) > 10_000 and not force_run:
            return (
                EMPTY_DF(),
                "You have uploaded a file with more than 10,000 rows. "
                "This will take very long to run. "
                "Please upload a subsample of the data, "
                "or check the box 'Ignore Warnings'.",
            )
        # The last column is the regression target; the rest are features.
        col_to_fit = df.columns[-1]
        y = np.array(df[col_to_fit])
        X = df.drop([col_to_fit], axis=1)
    else:
        X, y = generate_data(test_equation, num_points, noise_level, data_seed)

    with tempfile.TemporaryDirectory() as tmpdirname:
        base = Path(tmpdirname)
        equation_file = base / "hall_of_fame.csv"
        equation_file_bkup = base / "hall_of_fame.csv.bkup"
        equation_file_copy = base / "hall_of_fame_copy.csv"

        # Named ``worker`` so the local does not shadow this function's name.
        worker = mp.Process(
            target=pysr_fit,
            kwargs=dict(
                X=X,
                y=y,
                niterations=niterations,
                maxsize=maxsize,
                binary_operators=binary_operators,
                unary_operators=unary_operators,
                equation_file=equation_file,
                parsimony=parsimony,
                populations=populations,
                population_size=population_size,
                ncycles_per_iteration=ncycles_per_iteration,
                elementwise_loss=elementwise_loss,
                adaptive_parsimony_scaling=adaptive_parsimony_scaling,
                optimizer_algorithm=optimizer_algorithm,
                optimizer_iterations=optimizer_iterations,
                batching=batching,
                batch_size=batch_size,
            ),
        )
        worker.start()
        try:
            last_yield_time = None
            while worker.is_alive():
                if not equation_file_bkup.exists():
                    # Nothing written yet: wait instead of spinning the CPU.
                    time.sleep(poll_interval)
                    continue

                try:
                    # Read from a private copy so the child process can keep
                    # rewriting the backup file while we parse.
                    shutil.copy(equation_file_bkup, equation_file_copy)
                    equations = pd.read_csv(equation_file_copy)
                except (pd.errors.EmptyDataError, FileNotFoundError):
                    # The file was empty, or vanished between the existence
                    # check and the copy; try again shortly.
                    time.sleep(poll_interval)
                    continue

                # Keep only the Pareto front: walking from simplest to most
                # complex, a row survives only if its loss is strictly lower
                # than every loss seen so far.
                # TODO: Not sure why dominated rows occur; could be the
                # result of a late copy?
                equations.sort_values("Complexity", ascending=True, inplace=True)
                equations.reset_index(inplace=True)
                bad_idx = []
                min_loss = None
                for i in equations.index:
                    loss = equations.loc[i, "Loss"]
                    if min_loss is None or loss < min_loss:
                        min_loss = float(loss)
                    else:
                        bad_idx.append(i)
                equations.drop(index=bad_idx, inplace=True)

                # Throttle updates to at most one per plot_update_delay.
                while (
                    last_yield_time is not None
                    and time.time() - last_yield_time < plot_update_delay
                ):
                    time.sleep(poll_interval)

                yield equations[["Complexity", "Loss", "Equation"]]
                last_yield_time = time.time()
        finally:
            # If the consumer closes the generator early, or an error
            # escapes, stop the child before the temporary directory it
            # writes to is removed.
            if worker.is_alive():
                worker.terminate()
            worker.join()
def pysr_fit(*, X, y, **pysr_kwargs):
    """Fit a ``PySRRegressor`` on ``X`` and ``y``.

    All arguments are keyword-only. Everything other than ``X`` and ``y`` is
    passed straight to the ``PySRRegressor`` constructor, together with
    ``progress=False`` and ``timeout_in_seconds=1000``. Nothing is returned.
    """
    # pysr is imported inside the function rather than at module import time.
    from pysr import PySRRegressor

    regressor = PySRRegressor(
        **pysr_kwargs,
        progress=False,
        timeout_in_seconds=1000,
    )
    regressor.fit(X, y)
|