Spaces:
Sleeping
Sleeping
MilesCranmer
committed on
Commit
•
117b2c3
1
Parent(s):
61ee61c
Clean up subscriptification
Browse files- pysr/sr.py +54 -31
pysr/sr.py
CHANGED
@@ -184,7 +184,7 @@ def _check_assertions(
|
|
184 |
f"Variable name {var_name} is already a function name."
|
185 |
)
|
186 |
# Check if alphanumeric only:
|
187 |
-
if not re.match(r"^[a-zA-Z0-9_]+$", var_name):
|
188 |
raise ValueError(
|
189 |
f"Invalid variable name {var_name}. "
|
190 |
"Only alphanumeric characters, numbers, "
|
@@ -633,6 +633,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
633 |
feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
634 |
Names of features seen during :term:`fit`. Defined only when `X`
|
635 |
has feature names that are all strings.
|
|
|
|
|
636 |
nout_ : int
|
637 |
Number of output dimensions.
|
638 |
selection_mask_ : list[int] of length `select_k_features`
|
@@ -995,10 +997,12 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
995 |
model.n_features_in_ = n_features_in
|
996 |
|
997 |
if feature_names_in is None:
|
998 |
-
model.feature_names_in_ = [f"x{i}" for i in range(n_features_in)]
|
|
|
999 |
else:
|
1000 |
assert len(feature_names_in) == n_features_in
|
1001 |
model.feature_names_in_ = feature_names_in
|
|
|
1002 |
|
1003 |
if selection_mask is None:
|
1004 |
model.selection_mask_ = np.ones(n_features_in, dtype=bool)
|
@@ -1384,7 +1388,18 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1384 |
weights = check_array(weights, ensure_2d=False)
|
1385 |
check_consistent_length(weights, y)
|
1386 |
X, y = self._validate_data(X=X, y=y, reset=True, multi_output=True)
|
1387 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1388 |
variable_names = self.feature_names_in_
|
1389 |
|
1390 |
# Handle multioutput data
|
@@ -1706,7 +1721,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
1706 |
Main.y,
|
1707 |
weights=Main.weights,
|
1708 |
niterations=int(self.niterations),
|
1709 |
-
variable_names=
|
1710 |
options=options,
|
1711 |
numprocs=cprocs,
|
1712 |
parallelism=parallelism,
|
@@ -2072,17 +2087,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
2072 |
with open(cur_filename, "r") as f:
|
2073 |
buf = f.read()
|
2074 |
buf = _preprocess_julia_floats(buf)
|
2075 |
-
|
2076 |
-
|
2077 |
-
# Rename Complexity column to complexity:
|
2078 |
-
df.rename(
|
2079 |
-
columns={
|
2080 |
-
"Complexity": "complexity",
|
2081 |
-
"Loss": "loss",
|
2082 |
-
"Equation": "equation",
|
2083 |
-
},
|
2084 |
-
inplace=True,
|
2085 |
-
)
|
2086 |
|
2087 |
all_outputs.append(df)
|
2088 |
else:
|
@@ -2092,15 +2098,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
2092 |
with open(filename, "r") as f:
|
2093 |
buf = f.read()
|
2094 |
buf = _preprocess_julia_floats(buf)
|
2095 |
-
all_outputs = [
|
2096 |
-
|
2097 |
-
|
2098 |
-
"Complexity": "complexity",
|
2099 |
-
"Loss": "loss",
|
2100 |
-
"Equation": "equation",
|
2101 |
-
},
|
2102 |
-
inplace=True,
|
2103 |
-
)
|
2104 |
|
2105 |
except FileNotFoundError:
|
2106 |
raise RuntimeError(
|
@@ -2109,6 +2109,23 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
2109 |
)
|
2110 |
return all_outputs
|
2111 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2112 |
def get_hof(self):
|
2113 |
"""Get the equations from a hall of fame file.
|
2114 |
|
@@ -2411,10 +2428,16 @@ def _preprocess_julia_floats(s: str) -> str:
|
|
2411 |
return s
|
2412 |
|
2413 |
|
2414 |
-
def
|
2415 |
-
|
2416 |
-
|
2417 |
-
|
2418 |
-
|
2419 |
-
|
2420 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
184 |
f"Variable name {var_name} is already a function name."
|
185 |
)
|
186 |
# Check if alphanumeric only:
|
187 |
+
if not re.match(r"^[₀₁₂₃₄₅₆₇₈₉a-zA-Z0-9_]+$", var_name):
|
188 |
raise ValueError(
|
189 |
f"Invalid variable name {var_name}. "
|
190 |
"Only alphanumeric characters, numbers, "
|
|
|
633 |
feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
634 |
Names of features seen during :term:`fit`. Defined only when `X`
|
635 |
has feature names that are all strings.
|
636 |
+
is_default_feature_names_ : bool
|
637 |
+
Whether `feature_names_in_` was not set by the user.
|
638 |
nout_ : int
|
639 |
Number of output dimensions.
|
640 |
selection_mask_ : list[int] of length `select_k_features`
|
|
|
997 |
model.n_features_in_ = n_features_in
|
998 |
|
999 |
if feature_names_in is None:
|
1000 |
+
model.feature_names_in_ = [f"x{_subscriptify(i)}" for i in range(n_features_in)]
|
1001 |
+
model.is_default_feature_names_ = True
|
1002 |
else:
|
1003 |
assert len(feature_names_in) == n_features_in
|
1004 |
model.feature_names_in_ = feature_names_in
|
1005 |
+
model.is_default_feature_names_ = False
|
1006 |
|
1007 |
if selection_mask is None:
|
1008 |
model.selection_mask_ = np.ones(n_features_in, dtype=bool)
|
|
|
1388 |
weights = check_array(weights, ensure_2d=False)
|
1389 |
check_consistent_length(weights, y)
|
1390 |
X, y = self._validate_data(X=X, y=y, reset=True, multi_output=True)
|
1391 |
+
feature_names_in_ = _check_feature_names_in(self, variable_names, generate_names=False)
|
1392 |
+
|
1393 |
+
if feature_names_in_ is None:
|
1394 |
+
self.feature_names_in_ = [f"x{_subscriptify(i)}" for i in range(X.shape[1])]
|
1395 |
+
# We record that we have generated the feature names
|
1396 |
+
# so that we can undo the subscriptification (for
|
1397 |
+
# SymPy compatibility).
|
1398 |
+
self.is_default_feature_names_ = True
|
1399 |
+
else:
|
1400 |
+
self.feature_names_in = feature_names_in_
|
1401 |
+
self.is_default_feature_names_ = False
|
1402 |
+
|
1403 |
variable_names = self.feature_names_in_
|
1404 |
|
1405 |
# Handle multioutput data
|
|
|
1721 |
Main.y,
|
1722 |
weights=Main.weights,
|
1723 |
niterations=int(self.niterations),
|
1724 |
+
variable_names=self.feature_names_in_,
|
1725 |
options=options,
|
1726 |
numprocs=cprocs,
|
1727 |
parallelism=parallelism,
|
|
|
2087 |
with open(cur_filename, "r") as f:
|
2088 |
buf = f.read()
|
2089 |
buf = _preprocess_julia_floats(buf)
|
2090 |
+
|
2091 |
+
df = self._postprocess_dataframe(pd.read_csv(StringIO(buf)))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2092 |
|
2093 |
all_outputs.append(df)
|
2094 |
else:
|
|
|
2098 |
with open(filename, "r") as f:
|
2099 |
buf = f.read()
|
2100 |
buf = _preprocess_julia_floats(buf)
|
2101 |
+
all_outputs = [
|
2102 |
+
self._postprocess_dataframe(pd.read_csv(StringIO(buf)))
|
2103 |
+
]
|
|
|
|
|
|
|
|
|
|
|
|
|
2104 |
|
2105 |
except FileNotFoundError:
|
2106 |
raise RuntimeError(
|
|
|
2109 |
)
|
2110 |
return all_outputs
|
2111 |
|
2112 |
+
def _postprocess_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
|
2113 |
+
df = df.rename(
|
2114 |
+
columns={
|
2115 |
+
"Complexity": "complexity",
|
2116 |
+
"Loss": "loss",
|
2117 |
+
"Equation": "equation",
|
2118 |
+
},
|
2119 |
+
)
|
2120 |
+
# Regexp replace x₁₂₃ to x123 in `equation`:
|
2121 |
+
if self.is_default_feature_names_:
|
2122 |
+
df["equation"] = df["equation"].apply(
|
2123 |
+
lambda s: re.sub(r"x([₀₁₂₃₄₅₆₇₈₉]+)", lambda m: f"x{_undo_subscriptify(m.group(1))}", s)
|
2124 |
+
)
|
2125 |
+
|
2126 |
+
return df
|
2127 |
+
|
2128 |
+
|
2129 |
def get_hof(self):
|
2130 |
"""Get the equations from a hall of fame file.
|
2131 |
|
|
|
2428 |
return s
|
2429 |
|
2430 |
|
2431 |
+
def _subscriptify(i: int) -> str:
|
2432 |
+
"""Converts integer to subscript text form.
|
2433 |
+
|
2434 |
+
For example, 123 -> "₁₂₃".
|
2435 |
+
"""
|
2436 |
+
return "".join([chr(0x2080 + int(c)) for c in str(i)])
|
2437 |
+
|
2438 |
+
def _undo_subscriptify(s: str) -> int:
|
2439 |
+
"""Converts subscript text form to integer.
|
2440 |
+
|
2441 |
+
For example, "₁₂₃" -> 123.
|
2442 |
+
"""
|
2443 |
+
return int("".join([str(ord(c) - 0x2080) for c in s]))
|