MilesCranmer committed on
Commit
117b2c3
•
1 Parent(s): 61ee61c

Clean up subscriptification

Browse files
Files changed (1) hide show
  1. pysr/sr.py +54 -31
pysr/sr.py CHANGED
@@ -184,7 +184,7 @@ def _check_assertions(
184
  f"Variable name {var_name} is already a function name."
185
  )
186
  # Check if alphanumeric only:
187
- if not re.match(r"^[a-zA-Z0-9_]+$", var_name):
188
  raise ValueError(
189
  f"Invalid variable name {var_name}. "
190
  "Only alphanumeric characters, numbers, "
@@ -633,6 +633,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
633
  feature_names_in_ : ndarray of shape (`n_features_in_`,)
634
  Names of features seen during :term:`fit`. Defined only when `X`
635
  has feature names that are all strings.
 
 
636
  nout_ : int
637
  Number of output dimensions.
638
  selection_mask_ : list[int] of length `select_k_features`
@@ -995,10 +997,12 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
995
  model.n_features_in_ = n_features_in
996
 
997
  if feature_names_in is None:
998
- model.feature_names_in_ = [f"x{i}" for i in range(n_features_in)]
 
999
  else:
1000
  assert len(feature_names_in) == n_features_in
1001
  model.feature_names_in_ = feature_names_in
 
1002
 
1003
  if selection_mask is None:
1004
  model.selection_mask_ = np.ones(n_features_in, dtype=bool)
@@ -1384,7 +1388,18 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1384
  weights = check_array(weights, ensure_2d=False)
1385
  check_consistent_length(weights, y)
1386
  X, y = self._validate_data(X=X, y=y, reset=True, multi_output=True)
1387
- self.feature_names_in_ = _check_feature_names_in(self, variable_names)
 
 
 
 
 
 
 
 
 
 
 
1388
  variable_names = self.feature_names_in_
1389
 
1390
  # Handle multioutput data
@@ -1706,7 +1721,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
1706
  Main.y,
1707
  weights=Main.weights,
1708
  niterations=int(self.niterations),
1709
- variable_names=_format_feature_names(self.feature_names_in_),
1710
  options=options,
1711
  numprocs=cprocs,
1712
  parallelism=parallelism,
@@ -2072,17 +2087,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
2072
  with open(cur_filename, "r") as f:
2073
  buf = f.read()
2074
  buf = _preprocess_julia_floats(buf)
2075
- df = pd.read_csv(StringIO(buf))
2076
-
2077
- # Rename Complexity column to complexity:
2078
- df.rename(
2079
- columns={
2080
- "Complexity": "complexity",
2081
- "Loss": "loss",
2082
- "Equation": "equation",
2083
- },
2084
- inplace=True,
2085
- )
2086
 
2087
  all_outputs.append(df)
2088
  else:
@@ -2092,15 +2098,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
2092
  with open(filename, "r") as f:
2093
  buf = f.read()
2094
  buf = _preprocess_julia_floats(buf)
2095
- all_outputs = [pd.read_csv(StringIO(buf))]
2096
- all_outputs[-1].rename(
2097
- columns={
2098
- "Complexity": "complexity",
2099
- "Loss": "loss",
2100
- "Equation": "equation",
2101
- },
2102
- inplace=True,
2103
- )
2104
 
2105
  except FileNotFoundError:
2106
  raise RuntimeError(
@@ -2109,6 +2109,23 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
2109
  )
2110
  return all_outputs
2111
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2112
  def get_hof(self):
2113
  """Get the equations from a hall of fame file.
2114
 
@@ -2411,10 +2428,16 @@ def _preprocess_julia_floats(s: str) -> str:
2411
  return s
2412
 
2413
 
2414
- def _format_feature_names(feature_names_in):
2415
- if all([f"x{i}" == feature_names_in[i] for i in range(len(feature_names_in))]):
2416
- # Use `equation_search` defaults, which are
2417
- # just the unicode versions
2418
- return None
2419
- else:
2420
- return feature_names_in.tolist()
 
 
 
 
 
 
 
184
  f"Variable name {var_name} is already a function name."
185
  )
186
  # Check if alphanumeric only:
187
+ if not re.match(r"^[₀₁₂₃₄₅₆₇₈₉a-zA-Z0-9_]+$", var_name):
188
  raise ValueError(
189
  f"Invalid variable name {var_name}. "
190
  "Only alphanumeric characters, numbers, "
 
633
  feature_names_in_ : ndarray of shape (`n_features_in_`,)
634
  Names of features seen during :term:`fit`. Defined only when `X`
635
  has feature names that are all strings.
636
+ is_default_feature_names_ : bool
637
+ Whether `feature_names_in_` was not set by the user.
638
  nout_ : int
639
  Number of output dimensions.
640
  selection_mask_ : list[int] of length `select_k_features`
 
997
  model.n_features_in_ = n_features_in
998
 
999
  if feature_names_in is None:
1000
+ model.feature_names_in_ = [f"x{_subscriptify(i)}" for i in range(n_features_in)]
1001
+ model.is_default_feature_names_ = True
1002
  else:
1003
  assert len(feature_names_in) == n_features_in
1004
  model.feature_names_in_ = feature_names_in
1005
+ model.is_default_feature_names_ = False
1006
 
1007
  if selection_mask is None:
1008
  model.selection_mask_ = np.ones(n_features_in, dtype=bool)
 
1388
  weights = check_array(weights, ensure_2d=False)
1389
  check_consistent_length(weights, y)
1390
  X, y = self._validate_data(X=X, y=y, reset=True, multi_output=True)
1391
+ feature_names_in_ = _check_feature_names_in(self, variable_names, generate_names=False)
1392
+
1393
+ if feature_names_in_ is None:
1394
+ self.feature_names_in_ = [f"x{_subscriptify(i)}" for i in range(X.shape[1])]
1395
+ # We record that we have generated the feature names
1396
+ # so that we can undo the subscriptification (for
1397
+ # SymPy compatibility).
1398
+ self.is_default_feature_names_ = True
1399
+ else:
1400
+ self.feature_names_in_ = feature_names_in_
1401
+ self.is_default_feature_names_ = False
1402
+
1403
  variable_names = self.feature_names_in_
1404
 
1405
  # Handle multioutput data
 
1721
  Main.y,
1722
  weights=Main.weights,
1723
  niterations=int(self.niterations),
1724
+ variable_names=self.feature_names_in_,
1725
  options=options,
1726
  numprocs=cprocs,
1727
  parallelism=parallelism,
 
2087
  with open(cur_filename, "r") as f:
2088
  buf = f.read()
2089
  buf = _preprocess_julia_floats(buf)
2090
+
2091
+ df = self._postprocess_dataframe(pd.read_csv(StringIO(buf)))
 
 
 
 
 
 
 
 
 
2092
 
2093
  all_outputs.append(df)
2094
  else:
 
2098
  with open(filename, "r") as f:
2099
  buf = f.read()
2100
  buf = _preprocess_julia_floats(buf)
2101
+ all_outputs = [
2102
+ self._postprocess_dataframe(pd.read_csv(StringIO(buf)))
2103
+ ]
 
 
 
 
 
 
2104
 
2105
  except FileNotFoundError:
2106
  raise RuntimeError(
 
2109
  )
2110
  return all_outputs
2111
 
2112
+ def _postprocess_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
2113
+ df = df.rename(
2114
+ columns={
2115
+ "Complexity": "complexity",
2116
+ "Loss": "loss",
2117
+ "Equation": "equation",
2118
+ },
2119
+ )
2120
+ # Regexp replace x₁₂₃ to x123 in `equation`:
2121
+ if self.is_default_feature_names_:
2122
+ df["equation"] = df["equation"].apply(
2123
+ lambda s: re.sub(r"x([₀₁₂₃₄₅₆₇₈₉]+)", lambda m: f"x{_undo_subscriptify(m.group(1))}", s)
2124
+ )
2125
+
2126
+ return df
2127
+
2128
+
2129
  def get_hof(self):
2130
  """Get the equations from a hall of fame file.
2131
 
 
2428
  return s
2429
 
2430
 
2431
+ def _subscriptify(i: int) -> str:
2432
+ """Converts integer to subscript text form.
2433
+
2434
+ For example, 123 -> "₁₂₃".
2435
+ """
2436
+ return "".join([chr(0x2080 + int(c)) for c in str(i)])
2437
+
2438
+ def _undo_subscriptify(s: str) -> int:
2439
+ """Converts subscript text form to integer.
2440
+
2441
+ For example, "₁₂₃" -> 123.
2442
+ """
2443
+ return int("".join([str(ord(c) - 0x2080) for c in s]))