tttc3 committed on
Commit
3e8d44d
1 Parent(s): 3821242

Add warm_start

Browse files
Files changed (1) hide show
  1. pysr/sr.py +35 -47
pysr/sr.py CHANGED
@@ -177,7 +177,7 @@ def best_callable(*args, **kwargs): # pragma: no cover
177
  VALID_OPTIMIZER_ALGORITHMS = ["NelderMead", "BFGS"]
178
 
179
 
180
- class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
181
  """
182
  High-performance symbolic regression.
183
 
@@ -431,6 +431,10 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
431
  Pass an int for reproducible results across multiple function calls.
432
  See :term:`Glossary <random_state>`.
433
 
 
 
 
 
434
  verbosity : int, default=1e9
435
  What verbosity level to use. 0 means minimal print statements.
436
 
@@ -633,6 +637,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
633
  fast_cycle=False,
634
  precision=32,
635
  random_state=None,
 
636
  verbosity=1e9,
637
  update_verbosity=None,
638
  progress=True,
@@ -717,6 +722,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
717
  self.fast_cycle = fast_cycle
718
  self.precision = precision
719
  self.random_state = random_state
 
720
  # Additional runtime parameters
721
  # - Runtime user interface
722
  self.verbosity = verbosity
@@ -914,8 +920,11 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
914
  if self.temp_equation_file:
915
  self.equation_file_ = self.tempdir_ / "hall_of_fame.csv"
916
  elif self.equation_file is None:
917
- date_time = datetime.now().strftime("%Y-%m-%d_%H%M%S.%f")[:-3]
918
- self.equation_file_ = "hall_of_fame_" + date_time + ".csv"
 
 
 
919
  else:
920
  self.equation_file_ = self.equation_file
921
 
@@ -1433,10 +1442,13 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
1433
  Fitted Estimator.
1434
  """
1435
  # Init attributes that are not specified in BaseEstimator
1436
- self.equations_ = None
1437
- self.nout_ = 1
1438
- self.selection_mask_ = None
1439
- self.raw_julia_state_ = None
 
 
 
1440
 
1441
  random_state = check_random_state(self.random_state) # For np random
1442
  seed = random_state.get_state()[1][0] # For julia random
@@ -1510,31 +1522,35 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
1510
  self.equation_file_ = checkpoint_file
1511
  self.equations_ = self.get_hof()
1512
 
1513
- def _decision_function(self, X, best_equation):
1514
  """
1515
- Decide what value to predict based on the 'best' equation found
1516
- from fitting.
 
 
1517
 
1518
  Parameters
1519
  ----------
1520
  X : {ndarray | pandas.DataFrame} of shape (n_samples, n_features)
1521
- Testing data for evaluating the model.
1522
 
1523
- best_equation : pd.Series
1524
- Selected best equation from `self.equations_`.
 
1525
 
1526
  Returns
1527
  -------
1528
- y_predicted : ndarray of shape (n_samples,) or (n_samples, nout_)
1529
- Values predicted by substituting `X` into the
1530
- :param`best_equation`.
1531
 
1532
  Raises
1533
  ------
1534
  ValueError
1535
  Raises if the `best_equation` cannot be evaluated.
1536
  """
1537
- check_is_fitted(self, attributes=["equations_", "feature_names_in_"])
 
1538
 
1539
  # When X is a NumPy array or a pandas dataframe with a RangeIndex,
1540
  # the self.feature_names_in_ generated during fit, for the same X,
@@ -1542,16 +1558,15 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
1542
  # To avoid this, convert X to a dataframe, apply the selection mask,
1543
  # and then set the column/feature_names of X to be equal to those
1544
  # generated during fit.
1545
- if isinstance(X, np.ndarray):
 
1546
  X = pd.DataFrame(X)
1547
-
1548
  if isinstance(X.columns, pd.RangeIndex):
1549
  if self.selection_mask_ is not None:
1550
  # RangeIndex enforces column order allowing columns to
1551
  # be correctly filtered with self.selection_mask_
1552
  X = X.iloc[:, self.selection_mask_]
1553
  X.columns = self.feature_names_in_
1554
-
1555
  # Without feature information, CallableEquation/lambda_format equations
1556
  # require that the column order of X matches that of the X used during
1557
  # the fitting process. _validate_data removes this feature information
@@ -1560,7 +1575,6 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
1560
  # reordered/reindexed to match those of the transformed (denoised and
1561
  # feature selected) X in fit.
1562
  X = X.reindex(columns=self.feature_names_in_)
1563
-
1564
  X = self._validate_data(X, reset=False)
1565
 
1566
  try:
@@ -1576,32 +1590,6 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
1576
  "e.g., `model.set_params(extra_sympy_mappings={'inv': lambda x: 1 / x})`."
1577
  ) from error
1578
 
1579
- def predict(self, X, index=None):
1580
- """
1581
- Predict y from input X using the equation chosen by `model_selection`.
1582
-
1583
- You may see what equation is used by printing this object. X should
1584
- have the same columns as the training data.
1585
-
1586
- Parameters
1587
- ----------
1588
- X : {ndarray | pandas.DataFrame} of shape (n_samples, n_features)
1589
- Training data.
1590
-
1591
- index : int, default=None
1592
- If you want to compute the output of an expression using a
1593
- particular row of `self.equations_`, you may specify the index here.
1594
-
1595
- Returns
1596
- -------
1597
- y_predicted : ndarray of shape (n_samples, nout_)
1598
- Values predicted by substituting `X` into the fitted symbolic
1599
- regression model.
1600
- """
1601
- self.refresh()
1602
- best_equation = self.get_best(index=index)
1603
- return self._decision_function(X, best_equation)
1604
-
1605
  def sympy(self, index=None):
1606
  """
1607
  Return sympy representation of the equation(s) chosen by `model_selection`.
 
177
  VALID_OPTIMIZER_ALGORITHMS = ["NelderMead", "BFGS"]
178
 
179
 
180
+ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
181
  """
182
  High-performance symbolic regression.
183
 
 
431
  Pass an int for reproducible results across multiple function calls.
432
  See :term:`Glossary <random_state>`.
433
 
434
+ warm_start : bool, default=False
435
+ Tells fit to continue from where the last call to fit finished.
436
+ If false, each call to fit will be fresh, overwriting previous results.
437
+
438
  verbosity : int, default=1e9
439
  What verbosity level to use. 0 means minimal print statements.
440
 
 
637
  fast_cycle=False,
638
  precision=32,
639
  random_state=None,
640
+ warm_start=False,
641
  verbosity=1e9,
642
  update_verbosity=None,
643
  progress=True,
 
722
  self.fast_cycle = fast_cycle
723
  self.precision = precision
724
  self.random_state = random_state
725
+ self.warm_start = warm_start
726
  # Additional runtime parameters
727
  # - Runtime user interface
728
  self.verbosity = verbosity
 
920
  if self.temp_equation_file:
921
  self.equation_file_ = self.tempdir_ / "hall_of_fame.csv"
922
  elif self.equation_file is None:
923
+ if self.warm_start and self.equation_file_:
924
+ pass
925
+ else:
926
+ date_time = datetime.now().strftime("%Y-%m-%d_%H%M%S.%f")[:-3]
927
+ self.equation_file_ = "hall_of_fame_" + date_time + ".csv"
928
  else:
929
  self.equation_file_ = self.equation_file
930
 
 
1442
  Fitted Estimator.
1443
  """
1444
  # Init attributes that are not specified in BaseEstimator
1445
+ if self.warm_start and hasattr(self, "raw_julia_state_"):
1446
+ pass
1447
+ else:
1448
+ self.equations_ = None
1449
+ self.nout_ = 1
1450
+ self.selection_mask_ = None
1451
+ self.raw_julia_state_ = None
1452
 
1453
  random_state = check_random_state(self.random_state) # For np random
1454
  seed = random_state.get_state()[1][0] # For julia random
 
1522
  self.equation_file_ = checkpoint_file
1523
  self.equations_ = self.get_hof()
1524
 
1525
+ def predict(self, X, index=None):
1526
  """
1527
+ Predict y from input X using the equation chosen by `model_selection`.
1528
+
1529
+ You may see what equation is used by printing this object. X should
1530
+ have the same columns as the training data.
1531
 
1532
  Parameters
1533
  ----------
1534
  X : {ndarray | pandas.DataFrame} of shape (n_samples, n_features)
1535
+ Training data.
1536
 
1537
+ index : int, default=None
1538
+ If you want to compute the output of an expression using a
1539
+ particular row of `self.equations_`, you may specify the index here.
1540
 
1541
  Returns
1542
  -------
1543
+ y_predicted : ndarray of shape (n_samples, nout_)
1544
+ Values predicted by substituting `X` into the fitted symbolic
1545
+ regression model.
1546
 
1547
  Raises
1548
  ------
1549
  ValueError
1550
  Raises if the `best_equation` cannot be evaluated.
1551
  """
1552
+ self.refresh()
1553
+ best_equation = self.get_best(index=index)
1554
 
1555
  # When X is a NumPy array or a pandas dataframe with a RangeIndex,
1556
  # the self.feature_names_in_ generated during fit, for the same X,
 
1558
  # To avoid this, convert X to a dataframe, apply the selection mask,
1559
  # and then set the column/feature_names of X to be equal to those
1560
  # generated during fit.
1561
+ if not isinstance(X, pd.DataFrame):
1562
+ X = check_array(X)
1563
  X = pd.DataFrame(X)
 
1564
  if isinstance(X.columns, pd.RangeIndex):
1565
  if self.selection_mask_ is not None:
1566
  # RangeIndex enforces column order allowing columns to
1567
  # be correctly filtered with self.selection_mask_
1568
  X = X.iloc[:, self.selection_mask_]
1569
  X.columns = self.feature_names_in_
 
1570
  # Without feature information, CallableEquation/lambda_format equations
1571
  # require that the column order of X matches that of the X used during
1572
  # the fitting process. _validate_data removes this feature information
 
1575
  # reordered/reindexed to match those of the transformed (denoised and
1576
  # feature selected) X in fit.
1577
  X = X.reindex(columns=self.feature_names_in_)
 
1578
  X = self._validate_data(X, reset=False)
1579
 
1580
  try:
 
1590
  "e.g., `model.set_params(extra_sympy_mappings={'inv': lambda x: 1 / x})`."
1591
  ) from error
1592
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1593
  def sympy(self, index=None):
1594
  """
1595
  Return sympy representation of the equation(s) chosen by `model_selection`.