"""Solvers for Ridge and LogisticRegression using SAG algorithm"""

import warnings

import numpy as np

from ..exceptions import ConvergenceWarning
from ..utils import check_array
from ..utils.extmath import row_norms
from ..utils.validation import _check_sample_weight
from ._base import make_dataset
from ._sag_fast import sag32, sag64


def get_auto_step_size(
    max_squared_sum, alpha_scaled, loss, fit_intercept, n_samples=None, is_saga=False
):
"""Compute automatic step size for SAG solver. |
|
|
|
The step size is set to 1 / (alpha_scaled + L + fit_intercept) where L is |
|
the max sum of squares for over all samples. |
|
|
|

    Parameters
    ----------
    max_squared_sum : float
        Maximum squared sum of X over samples.

    alpha_scaled : float
        Constant that multiplies the regularization term, scaled by
        1. / n_samples, the number of samples.

    loss : {'log', 'squared', 'multinomial'}
        The loss function used in SAG solver.

    fit_intercept : bool
        Specifies if a constant (a.k.a. bias or intercept) will be
        added to the decision function.

    n_samples : int, default=None
        Number of rows in X. Useful if is_saga=True.

    is_saga : bool, default=False
        Whether to return step size for the SAGA algorithm or the SAG
        algorithm.

    Returns
    -------
    step_size : float
        Step size used in SAG solver.

    References
    ----------
    Schmidt, M., Roux, N. L., & Bach, F. (2013).
    Minimizing finite sums with the stochastic average gradient
    https://hal.inria.fr/hal-00860051/document

    :arxiv:`Defazio, A., Bach F. & Lacoste-Julien S. (2014).
    "SAGA: A Fast Incremental Gradient Method With Support
    for Non-Strongly Convex Composite Objectives" <1407.0202>`
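
    Examples
    --------
    A couple of direct evaluations of the formula above (the values are
    arbitrary and the private import path is an assumption about the current
    module layout):

    >>> from sklearn.linear_model._sag import get_auto_step_size
    >>> get_auto_step_size(3.0, 1.0, "squared", fit_intercept=False)
    0.25
    >>> get_auto_step_size(4.0, 1.0, "log", fit_intercept=False)
    0.5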
    """
    if loss in ("log", "multinomial"):
        L = 0.25 * (max_squared_sum + int(fit_intercept)) + alpha_scaled
    elif loss == "squared":
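        # no 0.25 factor here: for the squared loss the per-sample gradient
        # Lipschitz constant is the squared row norm itself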
        L = max_squared_sum + int(fit_intercept) + alpha_scaled
    else:
        raise ValueError(
            "Unknown loss function for SAG solver, got %s instead of 'log', "
            "'squared' or 'multinomial'" % loss
        )
    if is_saga:
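        # SAGA uses a more conservative step than SAG: the (scaled) L2 term,
        # capped at L, is added to the denominator
        # (see Defazio et al., 2014, in the References)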
        mun = min(2 * n_samples * alpha_scaled, L)
        step = 1.0 / (2 * L + mun)
    else:
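        # plain SAG uses a step of 1 / L, where L bounds the per-sample
        # gradient Lipschitz constant (Schmidt et al., 2013, in the References)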
        step = 1.0 / L
    return step


def sag_solver(
    X,
    y,
    sample_weight=None,
    loss="log",
    alpha=1.0,
    beta=0.0,
    max_iter=1000,
    tol=0.001,
    verbose=0,
    random_state=None,
    check_input=True,
    max_squared_sum=None,
    warm_start_mem=None,
    is_saga=False,
):
    """SAG solver for Ridge and LogisticRegression.

    SAG stands for Stochastic Average Gradient: the gradient of the loss is
    estimated one sample at a time and the model is updated along the way with
    a constant learning rate.

    IMPORTANT NOTE: 'sag' solver converges faster on columns that are on the
    same scale. You can normalize the data by using
    sklearn.preprocessing.StandardScaler on your data before passing it to the
    fit method.

    This implementation works with data represented as dense numpy arrays or
    sparse scipy arrays of floating point values for the features. It will
    fit the data according to squared loss or log loss.

    The regularizer is a penalty added to the loss function that shrinks model
    parameters towards the zero vector using the squared Euclidean norm (L2).

    .. versionadded:: 0.17

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Training data.

    y : ndarray of shape (n_samples,)
        Target values. With loss='multinomial', y must be label encoded
        (see preprocessing.LabelEncoder). For loss='log' it must be in [0, 1].

    sample_weight : array-like of shape (n_samples,), default=None
        Weights applied to individual samples (1. for unweighted).

    loss : {'log', 'squared', 'multinomial'}, default='log'
        Loss function that will be optimized:
        - 'log' is the binary logistic loss, as used in LogisticRegression.
        - 'squared' is the squared loss, as used in Ridge.
        - 'multinomial' is the multinomial logistic loss, as used in
          LogisticRegression.

        .. versionadded:: 0.18
           *loss='multinomial'*

    alpha : float, default=1.
        L2 regularization term in the objective function
        ``(0.5 * alpha * || W ||_F^2)``.

    beta : float, default=0.
        L1 regularization term in the objective function
        ``(beta * || W ||_1)``. Only applied if ``is_saga`` is set to True.

    max_iter : int, default=1000
        The maximum number of passes over the training data if the stopping
        criterion is not reached.

    tol : float, default=0.001
        The stopping criterion for the weights. The iterations will stop when
        max(change in weights) / max(weights) < tol.

    verbose : int, default=0
        The verbosity level.

    random_state : int, RandomState instance or None, default=None
        Used when shuffling the data. Pass an int for reproducible output
        across multiple function calls.
        See :term:`Glossary <random_state>`.

    check_input : bool, default=True
        If False, the input arrays X and y will not be checked.

    max_squared_sum : float, default=None
        Maximum squared sum of X over samples. If None, it will be computed,
        going through all the samples. The value should be precomputed
        to speed up cross validation.

    warm_start_mem : dict, default=None
        The initialization parameters used for warm starting. Warm starting is
        currently used in LogisticRegression but not in Ridge.
        It contains:
        - 'coef': the weight vector, with the intercept in the last row
          if the intercept is fitted.
        - 'gradient_memory': the scalar gradient for all seen samples.
        - 'sum_gradient': the sum of gradient over all seen samples,
          for each feature.
        - 'intercept_sum_gradient': the sum of gradient over all seen
          samples, for the intercept.
        - 'seen': array of boolean describing the seen samples.
        - 'num_seen': the number of seen samples.
        See the warm-start example in the Examples section below.

    is_saga : bool, default=False
        Whether to use the SAGA algorithm or the SAG algorithm. SAGA behaves
        better in the first epochs, and allows for L1 regularization.

    Returns
    -------
    coef_ : ndarray of shape (n_features,)
        Weight vector.

    n_iter_ : int
        The number of full passes over all samples.

    warm_start_mem : dict
        Contains a 'coef' key with the fitted result, and possibly the
        fitted intercept at the end of the array. It also contains the other
        keys used for warm starting.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn import linear_model
    >>> n_samples, n_features = 10, 5
    >>> rng = np.random.RandomState(0)
    >>> X = rng.randn(n_samples, n_features)
    >>> y = rng.randn(n_samples)
    >>> clf = linear_model.Ridge(solver='sag')
    >>> clf.fit(X, y)
    Ridge(solver='sag')

    >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
    >>> y = np.array([1, 1, 2, 2])
    >>> clf = linear_model.LogisticRegression(solver='sag')
    >>> clf.fit(X, y)
    LogisticRegression(solver='sag')
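
    A direct call can also be sketched as below; ``sag_solver`` is a private
    helper, so the import path is an assumption about the current module
    layout. The returned ``warm_start_mem`` dict can be passed back in to
    warm start a subsequent call:

    >>> from sklearn.linear_model._sag import sag_solver
    >>> coef, n_iter_, warm = sag_solver(X, y, loss='squared')
    >>> coef.shape
    (2,)
    >>> coef2, n_iter2, warm2 = sag_solver(X, y, loss='squared',
    ...                                    warm_start_mem=warm)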

    References
    ----------
    Schmidt, M., Roux, N. L., & Bach, F. (2013).
    Minimizing finite sums with the stochastic average gradient
    https://hal.inria.fr/hal-00860051/document

    :arxiv:`Defazio, A., Bach F. & Lacoste-Julien S. (2014).
    "SAGA: A Fast Incremental Gradient Method With Support
    for Non-Strongly Convex Composite Objectives" <1407.0202>`

    See Also
    --------
    Ridge, SGDRegressor, ElasticNet, Lasso, SVR,
    LogisticRegression, SGDClassifier, LinearSVC, Perceptron
    """
    if warm_start_mem is None:
        warm_start_mem = {}

    if max_iter is None:
        max_iter = 1000

    if check_input:
        _dtype = [np.float64, np.float32]
        X = check_array(X, dtype=_dtype, accept_sparse="csr", order="C")
        y = check_array(y, dtype=_dtype, ensure_2d=False, order="C")

    n_samples, n_features = X.shape[0], X.shape[1]

    alpha_scaled = float(alpha) / n_samples
    beta_scaled = float(beta) / n_samples
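
    # with loss='multinomial', y is expected to be label encoded (integer
    # class indices), so the number of classes is read off the largest label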
    n_classes = int(y.max()) + 1 if loss == "multinomial" else 1

    sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)

    if "coef" in warm_start_mem.keys():
        coef_init = warm_start_mem["coef"]
    else:
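        # no warm start: start from all-zero coefficients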
        coef_init = np.zeros((n_features, n_classes), dtype=X.dtype, order="C")
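
    # a warm-start coef may carry the intercept as an extra trailing row
    # (see warm_start_mem above); detect that from its shape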
    fit_intercept = coef_init.shape[0] == (n_features + 1)
    if fit_intercept:
        intercept_init = coef_init[-1, :]
        coef_init = coef_init[:-1, :]
    else:
        intercept_init = np.zeros(n_classes, dtype=X.dtype)

    if "intercept_sum_gradient" in warm_start_mem.keys():
        intercept_sum_gradient = warm_start_mem["intercept_sum_gradient"]
    else:
        intercept_sum_gradient = np.zeros(n_classes, dtype=X.dtype)

    if "gradient_memory" in warm_start_mem.keys():
        gradient_memory_init = warm_start_mem["gradient_memory"]
    else:
        gradient_memory_init = np.zeros(
            (n_samples, n_classes), dtype=X.dtype, order="C"
        )
    if "sum_gradient" in warm_start_mem.keys():
        sum_gradient_init = warm_start_mem["sum_gradient"]
    else:
        sum_gradient_init = np.zeros((n_features, n_classes), dtype=X.dtype, order="C")

    if "seen" in warm_start_mem.keys():
        seen_init = warm_start_mem["seen"]
    else:
        seen_init = np.zeros(n_samples, dtype=np.int32, order="C")

    if "num_seen" in warm_start_mem.keys():
        num_seen_init = warm_start_mem["num_seen"]
    else:
        num_seen_init = 0

    dataset, intercept_decay = make_dataset(X, y, sample_weight, random_state)

    if max_squared_sum is None:
        max_squared_sum = row_norms(X, squared=True).max()
    step_size = get_auto_step_size(
        max_squared_sum,
        alpha_scaled,
        loss,
        fit_intercept,
        n_samples=n_samples,
        is_saga=is_saga,
    )
    if step_size * alpha_scaled == 1:
        raise ZeroDivisionError(
            "Current sag implementation does not handle "
            "the case step_size * alpha_scaled == 1"
        )
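
    # dispatch to the float32 or float64 routine matching X's dtype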
    sag = sag64 if X.dtype == np.float64 else sag32
    num_seen, n_iter_ = sag(
        dataset,
        coef_init,
        intercept_init,
        n_samples,
        n_features,
        n_classes,
        tol,
        max_iter,
        loss,
        step_size,
        alpha_scaled,
        beta_scaled,
        sum_gradient_init,
        gradient_memory_init,
        seen_init,
        num_seen_init,
        fit_intercept,
        intercept_sum_gradient,
        intercept_decay,
        is_saga,
        verbose,
    )

    if n_iter_ == max_iter:
        warnings.warn(
            "The max_iter was reached, which means the coef_ did not converge",
            ConvergenceWarning,
        )

    if fit_intercept:
        coef_init = np.vstack((coef_init, intercept_init))

    warm_start_mem = {
        "coef": coef_init,
        "sum_gradient": sum_gradient_init,
        "intercept_sum_gradient": intercept_sum_gradient,
        "gradient_memory": gradient_memory_init,
        "seen": seen_init,
        "num_seen": num_seen,
    }
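
    # coef_init has shape (n_features [+ 1], n_classes); for the multinomial
    # loss, return the coefficients transposed so that classes index the
    # first axis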
    if loss == "multinomial":
        coef_ = coef_init.T
    else:
        coef_ = coef_init[:, 0]

    return coef_, n_iter_, warm_start_mem