File size: 5,852 Bytes
d1ceb73 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 |
"""A basic kernel monitor with autorestarting.
This watches a kernel's state using KernelManager.is_alive and auto
restarts the kernel if it dies.
It is an incomplete base class, and must be subclassed.
"""
# Copyright (c) Jupyter Development Team.
# Distributed under the terms of the Modified BSD License.
from __future__ import annotations
import time
import typing as t
from traitlets import Bool, Dict, Float, Instance, Integer, default
from traitlets.config.configurable import LoggingConfigurable
class KernelRestarter(LoggingConfigurable):
    """Monitor and autorestart a kernel.

    This base class holds the restart state machine (see :meth:`poll`) and the
    callback registry.  It does not schedule anything itself: :meth:`start`
    and :meth:`stop` raise ``NotImplementedError`` and must be provided by a
    subclass.
    """

    # Manager of the watched kernel.  ``poll`` reads its ``shutting_down`` and
    # ``provisioner`` attributes and calls ``is_alive()`` / ``restart_kernel()``.
    kernel_manager = Instance("jupyter_client.KernelManager")

    debug = Bool(
        False,
        config=True,
        help=(
            "Whether to include every poll event in debugging output.\n"
            "Has to be set explicitly, because there will be *a lot* of output.\n"
        ),
    )

    # Not read anywhere in this class.
    # NOTE(review): presumably subclasses use it as the polling period - confirm.
    time_to_dead = Float(3.0, config=True, help="""Kernel heartbeat interval in seconds.""")

    stable_start_time = Float(
        10.0,
        config=True,
        help="""The time in seconds to consider the kernel to have completed a stable start up.""",
    )

    restart_limit = Integer(
        5,
        config=True,
        help="""The number of consecutive autorestarts before the kernel is presumed dead.""",
    )

    random_ports_until_alive = Bool(
        True,
        config=True,
        help="""Whether to choose new random ports when restarting before the kernel is alive.""",
    )

    # True from the moment ``poll`` issues a restart until the kernel has been
    # alive for the stable start time (or the restart limit is exceeded).
    _restarting = Bool(False)
    # Number of consecutive restart attempts in the current restart sequence.
    _restart_count = Integer(0)
    # True until the kernel has been alive for the stable start time once.
    _initial_startup = Bool(True)
    # ``time.time()`` of the most recent poll that found the kernel dead.
    _last_dead = Float()

    @default("_last_dead")
    def _default_last_dead(self) -> float:
        """Initialise ``_last_dead`` to the current time on first access."""
        return time.time()

    # Maps an event name ("restart" or "dead") to its list of callbacks.
    callbacks = Dict()

    def _callbacks_default(self) -> dict[str, list]:
        """Return a fresh registry with an empty list for each known event."""
        return {"restart": [], "dead": []}

    def start(self) -> None:
        """Start the polling of the kernel."""
        msg = "Must be implemented in a subclass"
        raise NotImplementedError(msg)

    def stop(self) -> None:
        """Stop the kernel polling."""
        msg = "Must be implemented in a subclass"
        raise NotImplementedError(msg)

    def add_callback(self, f: t.Callable[..., t.Any], event: str = "restart") -> None:
        """Register a callback to fire on a particular event.

        Callbacks are invoked with no arguments.

        Possible values for event:

          'restart' (default): kernel has died, and will be restarted.
          'dead': restart has failed, kernel will be left dead.

        Raises ``KeyError`` if ``event`` is not a key of ``self.callbacks``.
        """
        self.callbacks[event].append(f)

    def remove_callback(self, f: t.Callable[..., t.Any], event: str = "restart") -> None:
        """Unregister a callback to fire on a particular event.

        Removing a callback that is not registered is a silent no-op.

        Possible values for event:

          'restart' (default): kernel has died, and will be restarted.
          'dead': restart has failed, kernel will be left dead.

        Raises ``KeyError`` if ``event`` is not a key of ``self.callbacks``.
        """
        try:
            self.callbacks[event].remove(f)
        except ValueError:
            # Deliberate best-effort: ``f`` was not registered for this event.
            pass

    def _fire_callbacks(self, event: str) -> None:
        """Fire our callbacks for a particular event.

        Each callback is called with no arguments.  An exception raised by a
        callback is logged and does not prevent the remaining callbacks from
        running.
        """
        # Iterate over a snapshot: a callback may call ``remove_callback`` (for
        # example a one-shot handler unregistering itself), and mutating the
        # list while iterating it would silently skip the next callback.
        for callback in tuple(self.callbacks[event]):
            try:
                callback()
            except Exception:
                self.log.error(
                    "KernelRestarter: %s callback %r failed",
                    event,
                    callback,
                    exc_info=True,
                )

    def poll(self) -> None:
        """Check the kernel once and restart it if it is not alive.

        Does nothing while the kernel manager reports a shutdown in progress.

        If the kernel is dead, the consecutive restart counter is advanced (or
        reset to 1 when no restart was pending).  Once the counter exceeds
        ``restart_limit`` the "dead" callbacks are fired, the restart state is
        cleared and ``stop()`` is called; otherwise the "restart" callbacks are
        fired and the kernel manager is asked to restart the kernel.

        If the kernel is alive and has been so for at least the stable start
        time since it was last seen dead, the initial-startup and restarting
        flags are cleared.
        """
        if self.debug:
            self.log.debug("Polling kernel...")
        if self.kernel_manager.shutting_down:
            self.log.debug("Kernel shutdown in progress...")
            return
        now = time.time()
        if not self.kernel_manager.is_alive():
            self._last_dead = now
            if self._restarting:
                # A previous restart has not been confirmed stable yet.
                self._restart_count += 1
            else:
                self._restart_count = 1

            if self._restart_count > self.restart_limit:
                self.log.warning("KernelRestarter: restart failed")
                self._fire_callbacks("dead")
                self._restarting = False
                self._restart_count = 0
                self.stop()
            else:
                # New ports are only requested while the kernel has never
                # reached a stable start.
                newports = self.random_ports_until_alive and self._initial_startup
                self.log.info(
                    "KernelRestarter: restarting kernel (%i/%i), %s random ports",
                    self._restart_count,
                    self.restart_limit,
                    "new" if newports else "keep",
                )
                self._fire_callbacks("restart")
                self.kernel_manager.restart_kernel(now=True, newports=newports)
                self._restarting = True
        else:
            # Since `is_alive` only tests that the kernel process is alive, it does not
            # indicate that the kernel has successfully completed startup. To solve this
            # correctly, we would need to wait for a kernel info reply, but it is not
            # necessarily appropriate to start a kernel client + channels in the
            # restarter. Therefore, we use "has been alive continuously for X time" as a
            # heuristic for a stable start up.
            # See https://github.com/jupyter/jupyter_client/pull/717 for details.
            stable_start_time = self.stable_start_time
            if self.kernel_manager.provisioner:
                # The provisioner may substitute its own value for the
                # configured one, which is passed to it as a recommendation.
                stable_start_time = self.kernel_manager.provisioner.get_stable_start_time(
                    recommended=stable_start_time
                )
            if self._initial_startup and now - self._last_dead >= stable_start_time:
                self._initial_startup = False
            if self._restarting and now - self._last_dead >= stable_start_time:
                self.log.debug("KernelRestarter: restart apparently succeeded")
                self._restarting = False
|