MilesCranmer committed on
Commit c2ab38b • 2 Parent(s): 96d6ea9 cabda12

Merge pull request #649 from MilesCranmer/var-complexity
.github/workflows/CI.yml CHANGED
@@ -90,7 +90,7 @@ jobs:
       - name: "Coveralls"
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          COVERALLS_FLAG_NAME: test-${{ matrix.julia-version }}-${{ matrix.python-version }}
+          COVERALLS_FLAG_NAME: test-${{ matrix.julia-version }}-${{ matrix.python-version }}-${{ matrix.test-id }}
           COVERALLS_PARALLEL: true
         run: coveralls --service=github

pyproject.toml CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "pysr"
-version = "0.18.4"
+version = "0.18.5"
 authors = [
     {name = "Miles Cranmer", email = "[email protected]"},
 ]
@@ -41,4 +41,5 @@ dev-dependencies = [
     "pandas-stubs>=2.2.1.240316",
     "types-pytz>=2024.1.0.20240417",
     "types-openpyxl>=3.1.0.20240428",
+    "coverage>=7.5.3",
 ]
pysr/juliapkg.json CHANGED
@@ -3,7 +3,7 @@
   "packages": {
     "SymbolicRegression": {
       "uuid": "8254be44-1295-4e6a-a16d-46603ac705cb",
-      "version": "=0.24.4"
+      "version": "=0.24.5"
     },
     "Serialization": {
       "uuid": "9e88b42a-f829-5b0c-bbe9-9e923198166b",
pysr/sr.py CHANGED
@@ -1,8 +1,6 @@
 """Define the PySRRegressor scikit-learn interface."""

 import copy
-import difflib
-import inspect
 import os
 import pickle as pkl
 import re
@@ -57,6 +55,7 @@ from .utils import (
     _preprocess_julia_floats,
     _safe_check_feature_names_in,
     _subscriptify,
+    _suggest_keywords,
 )

 ALREADY_RAN = False
@@ -122,7 +121,7 @@ def _maybe_create_inline_operators(
                 "and underscores are allowed."
             )
         if (extra_sympy_mappings is None) or (
-            not function_name in extra_sympy_mappings
+            function_name not in extra_sympy_mappings
         ):
             raise ValueError(
                 f"Custom function {function_name} is not defined in `extra_sympy_mappings`. "
@@ -139,6 +138,7 @@ def _check_assertions(
     X,
     use_custom_variable_names,
     variable_names,
+    complexity_of_variables,
     weights,
     y,
     X_units,
@@ -163,6 +163,13 @@ def _check_assertions(
                 "and underscores are allowed."
             )
             assert_valid_sympy_symbol(var_name)
+    if (
+        isinstance(complexity_of_variables, list)
+        and len(complexity_of_variables) != X.shape[1]
+    ):
+        raise ValueError(
+            "The number of elements in `complexity_of_variables` must equal the number of features in `X`."
+        )
     if X_units is not None and len(X_units) != X.shape[1]:
         raise ValueError(
             "The number of units in `X_units` must equal the number of features in `X`."
@@ -333,7 +340,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
        `idx` argument to the function, which is `nothing`
        for non-batched, and a 1D array of indices for batched.
        Default is `None`.
-    complexity_of_operators : dict[str, float]
+    complexity_of_operators : dict[str, Union[int, float]]
        If you would like to use a complexity other than 1 for an
        operator, specify the complexity here. For example,
        `{"sin": 2, "+": 1}` would give a complexity of 2 for each use
@@ -342,10 +349,13 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
        numbers for a complexity, and the total complexity of a tree
        will be rounded to the nearest integer after computing.
        Default is `None`.
-    complexity_of_constants : float
+    complexity_of_constants : int | float
        Complexity of constants. Default is `1`.
-    complexity_of_variables : float
-        Complexity of variables. Default is `1`.
+    complexity_of_variables : int | float
+        Global complexity of variables. To set different complexities for
+        different variables, pass a list of complexities to the `fit` method
+        with keyword `complexity_of_variables`. You cannot use both.
+        Default is `1`.
     parsimony : float
        Multiplicative factor for how much to punish complexity.
        Default is `0.0032`.
@@ -691,6 +701,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
     n_features_in_: int
     feature_names_in_: ArrayLike[str]
     display_feature_names_in_: ArrayLike[str]
+    complexity_of_variables_: Union[int, float, List[Union[int, float]], None]
     X_units_: Union[ArrayLike[str], None]
     y_units_: Union[str, ArrayLike[str], None]
     nout_: int
@@ -722,7 +733,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         loss_function: Optional[str] = None,
         complexity_of_operators: Optional[Dict[str, Union[int, float]]] = None,
         complexity_of_constants: Union[int, float] = 1,
-        complexity_of_variables: Union[int, float] = 1,
+        complexity_of_variables: Optional[Union[int, float]] = None,
         parsimony: float = 0.0032,
         dimensional_constraint_penalty: Optional[float] = None,
         dimensionless_constants_only: bool = False,
@@ -1344,13 +1355,22 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         return param_container

     def _validate_and_set_fit_params(
-        self, X, y, Xresampled, weights, variable_names, X_units, y_units
+        self,
+        X,
+        y,
+        Xresampled,
+        weights,
+        variable_names,
+        complexity_of_variables,
+        X_units,
+        y_units,
     ) -> Tuple[
         ndarray,
         ndarray,
         Optional[ndarray],
         Optional[ndarray],
         ArrayLike[str],
+        Union[int, float, List[Union[int, float]]],
         Optional[ArrayLike[str]],
         Optional[Union[str, ArrayLike[str]]],
     ]:
@@ -1375,6 +1395,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             for that particular element of y.
         variable_names : ndarray of length n_features
             Names of each variable in the training dataset, `X`.
+        complexity_of_variables : int | float | list[int | float]
+            Complexity of each variable in the training dataset, `X`.
         X_units : list[str] of length n_features
             Units of each variable in the training dataset, `X`.
         y_units : str | list[str] of length n_out
@@ -1422,6 +1444,22 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
                 "Please use valid names instead."
             )

+        if (
+            complexity_of_variables is not None
+            and self.complexity_of_variables is not None
+        ):
+            raise ValueError(
+                "You cannot set `complexity_of_variables` at both `fit` and `__init__`. "
+                "Pass it at `__init__` to set it to global default, OR use `fit` to set it for "
+                "each variable individually."
+            )
+        elif complexity_of_variables is not None:
+            complexity_of_variables = complexity_of_variables
+        elif self.complexity_of_variables is not None:
+            complexity_of_variables = self.complexity_of_variables
+        else:
+            complexity_of_variables = 1
+
         # Data validation and feature name fetching via sklearn
         # This method sets the n_features_in_ attribute
         if Xresampled is not None:
@@ -1452,10 +1490,20 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         else:
             raise NotImplementedError("y shape not supported!")

+        self.complexity_of_variables_ = copy.deepcopy(complexity_of_variables)
         self.X_units_ = copy.deepcopy(X_units)
         self.y_units_ = copy.deepcopy(y_units)

-        return X, y, Xresampled, weights, variable_names, X_units, y_units
+        return (
+            X,
+            y,
+            Xresampled,
+            weights,
+            variable_names,
+            complexity_of_variables,
+            X_units,
+            y_units,
+        )

     def _validate_data_X_y(self, X, y) -> Tuple[ndarray, ndarray]:
         raw_out = self._validate_data(X=X, y=y, reset=True, multi_output=True)  # type: ignore
@@ -1471,6 +1519,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         y: ndarray,
         Xresampled: Union[ndarray, None],
         variable_names: ArrayLike[str],
+        complexity_of_variables: Union[int, float, List[Union[int, float]]],
         X_units: Union[ArrayLike[str], None],
         y_units: Union[ArrayLike[str], str, None],
         random_state: np.random.RandomState,
@@ -1493,6 +1542,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         variable_names : list[str]
             Names of each variable in the training dataset, `X`.
             Of length `n_features`.
+        complexity_of_variables : int | float | list[int | float]
+            Complexity of each variable in the training dataset, `X`.
         X_units : list[str]
             Units of each variable in the training dataset, `X`.
         y_units : str | list[str]
@@ -1543,6 +1594,14 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             ],
         )

+        if isinstance(complexity_of_variables, list):
+            complexity_of_variables = [
+                complexity_of_variables[i]
+                for i in range(len(complexity_of_variables))
+                if selection_mask[i]
+            ]
+            self.complexity_of_variables_ = copy.deepcopy(complexity_of_variables)
+
         if X_units is not None:
             X_units = cast(
                 ArrayLike[str],
@@ -1567,7 +1626,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         else:
             X, y = denoise(X, y, Xresampled=Xresampled, random_state=random_state)

-        return X, y, variable_names, X_units, y_units
+        return X, y, variable_names, complexity_of_variables, X_units, y_units

     def _run(
         self,
@@ -1624,6 +1683,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):

         nested_constraints = self.nested_constraints
         complexity_of_operators = self.complexity_of_operators
+        complexity_of_variables = self.complexity_of_variables_
         cluster_manager = self.cluster_manager

         # Start julia backend processes
@@ -1668,6 +1728,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             complexity_of_operators = jl.seval(complexity_of_operators_str)
         # TODO: Refactor this into helper function

+        if isinstance(complexity_of_variables, list):
+            complexity_of_variables = jl_array(complexity_of_variables)
+
         custom_loss = jl.seval(
             str(self.elementwise_loss)
             if self.elementwise_loss is not None
@@ -1726,7 +1789,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             una_constraints=jl_array(una_constraints),
             complexity_of_operators=complexity_of_operators,
             complexity_of_constants=self.complexity_of_constants,
-            complexity_of_variables=self.complexity_of_variables,
+            complexity_of_variables=complexity_of_variables,
             nested_constraints=nested_constraints,
             elementwise_loss=custom_loss,
             loss_function=custom_full_objective,
@@ -1871,6 +1934,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         Xresampled=None,
         weights=None,
         variable_names: Optional[ArrayLike[str]] = None,
+        complexity_of_variables: Optional[
+            Union[int, float, List[Union[int, float]]]
+        ] = None,
         X_units: Optional[ArrayLike[str]] = None,
         y_units: Optional[Union[str, ArrayLike[str]]] = None,
     ) -> "PySRRegressor":
@@ -1931,6 +1997,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         self.selection_mask_ = None
         self.julia_state_stream_ = None
         self.julia_options_stream_ = None
+        self.complexity_of_variables_ = None
         self.X_units_ = None
         self.y_units_ = None

@@ -1944,10 +2011,18 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             Xresampled,
             weights,
             variable_names,
+            complexity_of_variables,
             X_units,
             y_units,
         ) = self._validate_and_set_fit_params(
-            X, y, Xresampled, weights, variable_names, X_units, y_units
+            X,
+            y,
+            Xresampled,
+            weights,
+            variable_names,
+            complexity_of_variables,
+            X_units,
+            y_units,
         )

         if X.shape[0] > 10000 and not self.batching:
@@ -1965,8 +2040,17 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         seed = cast(int, random_state.randint(0, 2**31 - 1))  # For julia random

         # Pre transformations (feature selection and denoising)
-        X, y, variable_names, X_units, y_units = self._pre_transform_training_data(
-            X, y, Xresampled, variable_names, X_units, y_units, random_state
+        X, y, variable_names, complexity_of_variables, X_units, y_units = (
+            self._pre_transform_training_data(
+                X,
+                y,
+                Xresampled,
+                variable_names,
+                complexity_of_variables,
+                X_units,
+                y_units,
+                random_state,
+            )
         )

         # Warn about large feature counts (still warn if feature count is large
@@ -1993,6 +2077,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             X,
             use_custom_variable_names,
             variable_names,
+            complexity_of_variables,
             weights,
             y,
             X_units,
@@ -2465,16 +2550,6 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         return with_preamble(table_string)


-def _suggest_keywords(cls, k: str) -> List[str]:
-    valid_keywords = [
-        param
-        for param in inspect.signature(cls.__init__).parameters
-        if param not in ["self", "kwargs"]
-    ]
-    suggestions = difflib.get_close_matches(k, valid_keywords, n=3)
-    return suggestions
-
-
 def idx_model_selection(equations: pd.DataFrame, model_selection: str):
     """Select an expression and return its index."""
     if model_selection == "accuracy":
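
In summary, the sr.py changes allow `complexity_of_variables` to be supplied either as a scalar at `__init__` or as a per-feature list at `fit`; supplying both raises a `ValueError`. A minimal usage sketch of the new keyword (the data here is illustrative, not from the PR):

import numpy as np
from pysr import PySRRegressor

X = np.random.randn(100, 2)
y = X[:, 0] + X[:, 1]

# Option 1: one global complexity for every variable, set at __init__:
model = PySRRegressor(binary_operators=["+"], complexity_of_variables=2)
model.fit(X, y)

# Option 2: per-variable complexities, passed to fit() instead
# (combining this with the __init__ keyword raises a ValueError):
model = PySRRegressor(binary_operators=["+"])
model.fit(X, y, complexity_of_variables=[2, 3])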
pysr/test/params.py CHANGED
@@ -1,6 +1,6 @@
 import inspect

-from .. import PySRRegressor
+from pysr import PySRRegressor

 DEFAULT_PARAMS = inspect.signature(PySRRegressor.__init__).parameters
 DEFAULT_NITERATIONS = DEFAULT_PARAMS["niterations"].default
pysr/test/test.py CHANGED
@@ -11,17 +11,18 @@ import pandas as pd
 import sympy
 from sklearn.utils.estimator_checks import check_estimator

-from .. import PySRRegressor, install, jl
-from ..export_latex import sympy2latex
-from ..feature_selection import _handle_feature_selection, run_feature_selection
-from ..julia_helpers import init_julia
-from ..sr import (
+from pysr import PySRRegressor, install, jl
+from pysr.export_latex import sympy2latex
+from pysr.feature_selection import _handle_feature_selection, run_feature_selection
+from pysr.julia_helpers import init_julia
+from pysr.sr import (
     _check_assertions,
     _process_constraints,
     _suggest_keywords,
     idx_model_selection,
 )
-from ..utils import _csv_filename_to_pkl_filename
+from pysr.utils import _csv_filename_to_pkl_filename
+
 from .params import (
     DEFAULT_NCYCLES,
     DEFAULT_NITERATIONS,
@@ -29,6 +30,11 @@ from .params import (
     DEFAULT_POPULATIONS,
 )

+# Disables local saving:
+os.environ["SYMBOLIC_REGRESSION_IS_TESTING"] = os.environ.get(
+    "SYMBOLIC_REGRESSION_IS_TESTING", "true"
+)
+

 class TestPipeline(unittest.TestCase):
     def setUp(self):
@@ -176,6 +182,63 @@ class TestPipeline(unittest.TestCase):
         self.assertLessEqual(mse1, 1e-4)
         self.assertLessEqual(mse2, 1e-4)

+    def test_custom_variable_complexity(self):
+        for outer in (True, False):
+            for case in (1, 2):
+                y = self.X[:, [0, 1]]
+                if case == 1:
+                    kwargs = dict(complexity_of_variables=[2, 3])
+                elif case == 2:
+                    kwargs = dict(complexity_of_variables=2)
+
+                if outer:
+                    outer_kwargs = kwargs
+                    inner_kwargs = dict()
+                else:
+                    outer_kwargs = dict()
+                    inner_kwargs = kwargs
+
+                model = PySRRegressor(
+                    binary_operators=["+"],
+                    verbosity=0,
+                    **self.default_test_kwargs,
+                    early_stop_condition=(
+                        f"stop_if_{case}(l, c) = l < 1e-8 && c <= {3 if case == 1 else 2}"
+                    ),
+                    **outer_kwargs,
+                )
+                model.fit(self.X[:, [0, 1]], y, **inner_kwargs)
+                self.assertLessEqual(model.get_best()[0]["loss"], 1e-8)
+                self.assertLessEqual(model.get_best()[1]["loss"], 1e-8)
+
+                self.assertEqual(model.get_best()[0]["complexity"], 2)
+                self.assertEqual(
+                    model.get_best()[1]["complexity"], 3 if case == 1 else 2
+                )
+
+    def test_error_message_custom_variable_complexity(self):
+        X = np.ones((10, 2))
+        y = np.ones((10,))
+        model = PySRRegressor()
+        with self.assertRaises(ValueError) as cm:
+            model.fit(X, y, complexity_of_variables=[1, 2, 3])
+
+        self.assertIn(
+            "number of elements in `complexity_of_variables`", str(cm.exception)
+        )
+
+    def test_error_message_both_variable_complexity(self):
+        X = np.ones((10, 2))
+        y = np.ones((10,))
+        model = PySRRegressor(complexity_of_variables=[1, 2])
+        with self.assertRaises(ValueError) as cm:
+            model.fit(X, y, complexity_of_variables=[1, 2, 3])
+
+        self.assertIn(
+            "You cannot set `complexity_of_variables` at both `fit` and `__init__`.",
+            str(cm.exception),
+        )
+
     def test_multioutput_weighted_with_callable_temp_equation(self):
         X = self.X.copy()
         y = X[:, [0, 1]] ** 2
@@ -313,7 +376,10 @@ class TestPipeline(unittest.TestCase):
                 "unused_feature": self.rstate.randn(500),
             }
         )
-        true_fn = lambda x: np.array(x["T"] + x["x"] ** 2 + 1.323837)
+
+        def true_fn(x):
+            return np.array(x["T"] + x["x"] ** 2 + 1.323837)
+
         y = true_fn(X)
         noise = self.rstate.randn(500) * 0.01
         y = y + noise
@@ -372,13 +438,12 @@ class TestPipeline(unittest.TestCase):

     def test_load_model(self):
         """See if we can load a ran model from the equation file."""
-        csv_file_data = """
-        Complexity,Loss,Equation
+        csv_file_data = """Complexity,Loss,Equation
         1,0.19951081,"1.9762075"
         3,0.12717344,"(f0 + 1.4724599)"
         4,0.104823045,"pow_abs(2.2683423, cos(f3))\""""
         # Strip the indents:
-        csv_file_data = "\n".join([l.strip() for l in csv_file_data.split("\n")])
+        csv_file_data = "\n".join([line.strip() for line in csv_file_data.split("\n")])

         for from_backup in [False, True]:
             rand_dir = Path(tempfile.mkdtemp())
@@ -430,7 +495,7 @@ class TestPipeline(unittest.TestCase):
             if os.path.exists(file_to_delete):
                 os.remove(file_to_delete)

-        pickle_file = rand_dir / "equations.pkl"
+        # pickle_file = rand_dir / "equations.pkl"
         model3 = PySRRegressor.from_file(
             model.equation_file_, extra_sympy_mappings={"sq": lambda x: x**2}
         )
@@ -1081,8 +1146,14 @@ class TestDimensionalConstraints(unittest.TestCase):
         """This just checks the number of units passed"""
         use_custom_variable_names = False
         variable_names = None
+        complexity_of_variables = 1
         weights = None
-        args = (use_custom_variable_names, variable_names, weights)
+        args = (
+            use_custom_variable_names,
+            variable_names,
+            complexity_of_variables,
+            weights,
+        )
         valid_units = [
             (np.ones((10, 2)), np.ones(10), ["m/s", "s"], "m"),
             (np.ones((10, 1)), np.ones(10), ["m/s"], None),
pysr/test/test_jax.py CHANGED
@@ -5,7 +5,7 @@ import numpy as np
 import pandas as pd
 import sympy

-from .. import PySRRegressor, sympy2jax
+from pysr import PySRRegressor, sympy2jax


 class TestJAX(unittest.TestCase):
@@ -89,7 +89,10 @@ class TestJAX(unittest.TestCase):
     def test_feature_selection_custom_operators(self):
         rstate = np.random.RandomState(0)
         X = pd.DataFrame({f"k{i}": rstate.randn(2000) for i in range(10, 21)})
-        cos_approx = lambda x: 1 - (x**2) / 2 + (x**4) / 24 + (x**6) / 720
+
+        def cos_approx(x):
+            return 1 - (x**2) / 2 + (x**4) / 24 + (x**6) / 720
+
         y = X["k15"] ** 2 + 2 * cos_approx(X["k20"])

         model = PySRRegressor(
pysr/test/test_startup.py CHANGED
@@ -9,8 +9,9 @@ from pathlib import Path

 import numpy as np

-from .. import PySRRegressor
-from ..julia_import import jl_version
+from pysr import PySRRegressor
+from pysr.julia_import import jl_version
+
 from .params import DEFAULT_NITERATIONS, DEFAULT_POPULATIONS

@@ -4,7 +4,7 @@ import numpy as np
4
  import pandas as pd
5
  import sympy
6
 
7
- from .. import PySRRegressor, sympy2torch
8
 
9
 
10
  class TestTorch(unittest.TestCase):
 
4
  import pandas as pd
5
  import sympy
6
 
7
+ from pysr import PySRRegressor, sympy2torch
8
 
9
 
10
  class TestTorch(unittest.TestCase):
pysr/utils.py CHANGED
@@ -1,3 +1,5 @@
+import difflib
+import inspect
 import os
 import re
 from pathlib import Path
@@ -61,3 +63,13 @@ def _subscriptify(i: int) -> str:
     For example, 123 -> "₁₂₃".
     """
     return "".join([chr(0x2080 + int(c)) for c in str(i)])
+
+
+def _suggest_keywords(cls, k: str) -> List[str]:
+    valid_keywords = [
+        param
+        for param in inspect.signature(cls.__init__).parameters
+        if param not in ["self", "kwargs"]
+    ]
+    suggestions = difflib.get_close_matches(k, valid_keywords, n=3)
+    return suggestions
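
The relocated `_suggest_keywords` helper is a thin wrapper around `difflib.get_close_matches`, returning up to three valid constructor keywords near a misspelled one. A brief illustration (the misspelled keyword below is a made-up example):

from pysr import PySRRegressor
from pysr.utils import _suggest_keywords

# Should suggest close valid keywords, e.g. ['complexity_of_variables', ...]:
print(_suggest_keywords(PySRRegressor, "complexity_of_variable"))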