"""
Our own implementation of the Newton algorithm

Unlike the scipy.optimize version, this version of the Newton conjugate
gradient solver uses only one function call to retrieve the
func value, the gradient value and a callable for the Hessian matvec
product. If the function call is very expensive (e.g. for logistic
regression with a large design matrix), this approach gives very
significant speedups.
"""

import warnings

import numpy as np
import scipy

from ..exceptions import ConvergenceWarning
from .fixes import line_search_wolfe1, line_search_wolfe2


class _LineSearchError(RuntimeError):
    pass


def _line_search_wolfe12(
    f, fprime, xk, pk, gfk, old_fval, old_old_fval, verbose=0, **kwargs
):
    """
    Same as line_search_wolfe1, but fall back to line_search_wolfe2 if no
    suitable step length is found there, and raise an exception if neither
    line search finds a suitable step length.

    Raises
    ------
    _LineSearchError
        If no suitable step size is found.
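
    Examples
    --------
    An illustrative sketch (hypothetical 1-d quadratic and a descent
    direction; not part of the public API):

    >>> import numpy as np
    >>> f = lambda x: float(x @ x)
    >>> fprime = lambda x: 2 * x
    >>> xk, pk = np.array([1.0]), np.array([-1.0])  # pk is a descent direction
    >>> ret = _line_search_wolfe12(f, fprime, xk, pk, fprime(xk), f(xk), None)
    >>> ret[0] is not None  # a suitable step size was found
    True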
    """
    is_verbose = verbose >= 2
    eps = 16 * np.finfo(np.asarray(old_fval).dtype).eps
    if is_verbose:
        print("  Line Search")
        print(f"    eps=16 * finfo.eps={eps}")
        print("    try line search wolfe1")

    ret = line_search_wolfe1(f, fprime, xk, pk, gfk, old_fval, old_old_fval, **kwargs)

    if is_verbose:
        _not_ = "not " if ret[0] is None else ""
        print("    wolfe1 line search was " + _not_ + "successful")

    if ret[0] is None:
        # Deal with relative loss differences around machine precision.
        args = kwargs.get("args", tuple())
        fval = f(xk + pk, *args)
        tiny_loss = np.abs(old_fval * eps)
        loss_improvement = fval - old_fval
        check = np.abs(loss_improvement) <= tiny_loss
        if is_verbose:
            print(
                "    check loss |improvement| <= eps * |loss_old|:"
                f" {np.abs(loss_improvement)} <= {tiny_loss} {check}"
            )
        if check:
            # Check the sum of absolute gradients as an alternative condition.
            sum_abs_grad_old = scipy.linalg.norm(gfk, ord=1)
            grad = fprime(xk + pk, *args)
            sum_abs_grad = scipy.linalg.norm(grad, ord=1)
            check = sum_abs_grad < sum_abs_grad_old
            if is_verbose:
                print(
                    "    check sum(|gradient|) < sum(|gradient_old|): "
                    f"{sum_abs_grad} < {sum_abs_grad_old} {check}"
                )
            if check:
                ret = (
                    1.0,  # step size
                    ret[1] + 1,  # number of function evaluations
                    ret[2] + 1,  # number of gradient evaluations
                    fval,
                    old_fval,
                    grad,
                )

    if ret[0] is None:
        # The line search failed: try the other line search as a last resort.
        if is_verbose:
            print("    last resort: try line search wolfe2")
        ret = line_search_wolfe2(
            f, fprime, xk, pk, gfk, old_fval, old_old_fval, **kwargs
        )
        if is_verbose:
            _not_ = "not " if ret[0] is None else ""
            print("    wolfe2 line search was " + _not_ + "successful")

    if ret[0] is None:
        raise _LineSearchError()

    return ret


def _cg(fhess_p, fgrad, maxiter, tol, verbose=0):
    """
    Solve the linear system 'fhess_p . xsupi = -fgrad' iteratively with
    the conjugate gradient method, so that xsupi approximates the Newton
    search direction.

    Parameters
    ----------
    fhess_p : callable
        Function that takes a vector as a parameter and returns the
        matrix product of the Hessian and that vector.

    fgrad : ndarray of shape (n_features,) or (n_features + 1,)
        Gradient vector.

    maxiter : int
        Maximum number of CG iterations.

    tol : float
        Stopping criterion: iterate until ``np.sum(np.abs(residual)) <= tol``.

    Returns
    -------
    xsupi : ndarray of shape (n_features,) or (n_features + 1,)
        Estimated solution.
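
    Examples
    --------
    An illustrative sketch (a hypothetical symmetric positive definite
    matrix stands in for the Hessian; not part of the public API):

    >>> import numpy as np
    >>> A = np.array([[3.0, 1.0], [1.0, 2.0]])
    >>> g = np.array([1.0, 1.0])
    >>> x = _cg(lambda p: A @ p, g, maxiter=10, tol=1e-12)
    >>> bool(np.allclose(A @ x, -g))
    True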
    """
    eps = 16 * np.finfo(np.float64).eps
    xsupi = np.zeros(len(fgrad), dtype=fgrad.dtype)
    ri = np.copy(fgrad)
    psupi = -ri
    i = 0
    dri0 = np.dot(ri, ri)
    # We also keep track of |p_i|^2.
    psupi_norm2 = dri0
    is_verbose = verbose >= 2

    while i <= maxiter:
        if np.sum(np.abs(ri)) <= tol:
            if is_verbose:
                print(
                    f"  Inner CG solver iteration {i} stopped with\n"
                    f"    sum(|residuals|) <= tol: {np.sum(np.abs(ri))} <= {tol}"
                )
            break

        Ap = fhess_p(psupi)
        # Check the curvature along the current search direction.
        curv = np.dot(psupi, Ap)
        if 0 <= curv <= eps * psupi_norm2:
            # Curvature is numerically zero relative to ||p||^2: stop.
            if is_verbose:
                print(
                    f"  Inner CG solver iteration {i} stopped with\n"
                    f"    tiny_|p| = eps * ||p||^2, eps = {eps}, "
                    f"squared L2 norm ||p||^2 = {psupi_norm2}\n"
                    f"    curvature <= tiny_|p|: {curv} <= {eps * psupi_norm2}"
                )
            break
        elif curv < 0:
            if i > 0:
                if is_verbose:
                    print(
                        f"  Inner CG solver iteration {i} stopped with negative "
                        f"curvature, curvature = {curv}"
                    )
                break
            else:
                # Fall back to the steepest descent direction.
                xsupi += dri0 / curv * psupi
                if is_verbose:
                    print("  Inner CG solver iteration 0 fell back to steepest descent")
                break

        alphai = dri0 / curv
        xsupi += alphai * psupi
        ri += alphai * Ap
        dri1 = np.dot(ri, ri)
        betai = dri1 / dri0
        psupi = -ri + betai * psupi
        # We use |p_i|^2 = |r_i|^2 + beta_i^2 |p_{i-1}|^2 to update ||p||^2.
        psupi_norm2 = dri1 + betai**2 * psupi_norm2
        i = i + 1
        dri0 = dri1  # update np.dot(ri, ri) for the next iteration

    if is_verbose and i > maxiter:
        print(
            f"  Inner CG solver stopped reaching maxiter={i - 1} with "
            f"sum(|residuals|) = {np.sum(np.abs(ri))}"
        )
    return xsupi


def _newton_cg(
    grad_hess,
    func,
    grad,
    x0,
    args=(),
    tol=1e-4,
    maxiter=100,
    maxinner=200,
    line_search=True,
    warn=True,
    verbose=0,
):
    """
    Minimization of scalar function of one or more variables using the
    Newton-CG algorithm.

    Parameters
    ----------
    grad_hess : callable
        Should return the gradient and a callable returning the matvec product
        of the Hessian.

    func : callable
        Should return the value of the function.

    grad : callable
        Should return the gradient. This is used by the linesearch functions.

    x0 : array of float
        Initial guess.

    args : tuple, default=()
        Arguments passed to grad_hess, func and grad.

    tol : float, default=1e-4
        Stopping criterion. The iteration will stop when
        ``max{|g_i | i = 1, ..., n} <= tol``
        where ``g_i`` is the i-th component of the gradient.

    maxiter : int, default=100
        Maximum number of Newton iterations.

    maxinner : int, default=200
        Maximum number of CG iterations per Newton step.

    line_search : bool, default=True
        Whether to use a line search or not.

    warn : bool, default=True
        Whether to warn when the solver did not converge.

    verbose : int, default=0
        Verbosity level.

    Returns
    -------
    xk : ndarray of float
        Estimated minimum.

    k : int
        Number of Newton iterations performed.
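
    Examples
    --------
    An illustrative sketch (hypothetical quadratic objective
    ``f(x) = 0.5 * x @ A @ x - b @ x``, whose minimizer solves
    ``A @ x = b``; not part of the public API):

    >>> import numpy as np
    >>> A = np.array([[3.0, 1.0], [1.0, 2.0]])
    >>> b = np.array([1.0, 1.0])
    >>> def func(x):
    ...     return 0.5 * x @ A @ x - b @ x
    >>> def grad(x):
    ...     return A @ x - b
    >>> def grad_hess(x):
    ...     return grad(x), lambda p: A @ p
    >>> xk, n_iter = _newton_cg(grad_hess, func, grad, x0=np.zeros(2))
    >>> bool(np.allclose(A @ xk, b, atol=1e-3))
    True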
    """
    x0 = np.asarray(x0).flatten()
    xk = np.copy(x0)
    k = 0

    if line_search:
        old_fval = func(x0, *args)
        old_old_fval = None
    else:
        old_fval = 0

    is_verbose = verbose > 0

    # Outer loop: our Newton iteration.
    while k < maxiter:
        # Retrieve the gradient and the Hessian matvec callable in a single
        # call, then check convergence on the gradient.
        fgrad, fhess_p = grad_hess(xk, *args)

        absgrad = np.abs(fgrad)
        max_absgrad = np.max(absgrad)
        check = max_absgrad <= tol
        if is_verbose:
            print(f"Newton-CG iter = {k}")
            print("  Check Convergence")
            print(f"    max |gradient| <= tol: {max_absgrad} <= {tol} {check}")
        if check:
            break

        maggrad = np.sum(absgrad)
        eta = min([0.5, np.sqrt(maggrad)])
        termcond = eta * maggrad

        # Inner loop: solve the Newton update by conjugate gradient, to
        # avoid inverting the Hessian.
        xsupi = _cg(fhess_p, fgrad, maxiter=maxinner, tol=termcond, verbose=verbose)

        alphak = 1.0

        if line_search:
            try:
                alphak, fc, gc, old_fval, old_old_fval, gfkp1 = _line_search_wolfe12(
                    func,
                    grad,
                    xk,
                    xsupi,
                    fgrad,
                    old_fval,
                    old_old_fval,
                    verbose=verbose,
                    args=args,
                )
            except _LineSearchError:
                warnings.warn("Line Search failed")
                break

        xk += alphak * xsupi
        k += 1

    if warn and k >= maxiter:
        warnings.warn(
            (
                f"newton-cg failed to converge at loss = {old_fval}. Increase the"
                " number of iterations."
            ),
            ConvergenceWarning,
        )
    elif is_verbose:
        print(f"  Solver did converge at loss = {old_fval}.")
    return xk, k


def _check_optimize_result(solver, result, max_iter=None, extra_warning_msg=None):
    """Check the OptimizeResult for successful convergence.

    Parameters
    ----------
    solver : str
        Solver name. Currently only `lbfgs` is supported.

    result : OptimizeResult
        Result of the scipy.optimize.minimize function.

    max_iter : int, default=None
        Expected maximum number of iterations.

    extra_warning_msg : str, default=None
        Extra warning message.

    Returns
    -------
    n_iter : int
        Number of iterations.
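
    Examples
    --------
    An illustrative sketch (hypothetical 1-d objective minimized with
    scipy's L-BFGS-B; not part of the public API):

    >>> from scipy.optimize import minimize
    >>> result = minimize(
    ...     lambda x: float((x[0] - 2.0) ** 2), x0=[0.0], method="L-BFGS-B"
    ... )
    >>> _check_optimize_result("lbfgs", result, max_iter=100) <= 100
    True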
    """
    if solver == "lbfgs":
        if result.status != 0:
            result_message = result.message

            warning_msg = (
                "{} failed to converge (status={}):\n{}.\n\n"
                "Increase the number of iterations (max_iter) "
                "or scale the data as shown in:\n"
                "    https://scikit-learn.org/stable/modules/"
                "preprocessing.html"
            ).format(solver, result.status, result_message)
            if extra_warning_msg is not None:
                warning_msg += "\n" + extra_warning_msg
            warnings.warn(warning_msg, ConvergenceWarning, stacklevel=2)
        if max_iter is not None:
            # result.nit can exceed max_iter with some scipy versions; clamp it.
            n_iter_i = min(result.nit, max_iter)
        else:
            n_iter_i = result.nit
    else:
        raise NotImplementedError

    return n_iter_i