|
|
|
|
|
|
|
import contextlib |
|
import logging |
|
from pathlib import Path |
|
import socket |
|
from subprocess import Popen |
|
import sys |
|
import time |
|
from typing import Optional, Tuple |
|
|
|
import colorama |
|
|
|
import nni_node |
|
import nni.runtime.protocol |
|
|
|
from .config import ExperimentConfig |
|
from .pipe import Pipe |
|
from . import rest |
|
from ..tools.nnictl.config_utils import Experiments, Config |
|
from ..tools.nnictl.nnictl_utils import update_experiment |
|
|
|
_logger = logging.getLogger('nni.experiment') |
|
|
|
|
|
def start_experiment(exp_id: str, config: ExperimentConfig, port: int, debug: bool, mode: str = 'new') -> Popen:
    """
    Launch the REST server for an experiment and submit the experiment config to it.

    Steps, in order: validate ``config``, check that ``port`` is idle (and
    ``port + 1`` for hybrid / listed remote platforms, unless ``mode`` is
    ``'view'``), start the REST server, wait until it answers, record the
    experiment, then POST the config to ``/experiment``.

    Returns the ``Popen`` handle of the REST server process.

    If anything fails after the port checks, an error is logged, the server
    process (if it was started) is killed on a best-effort basis, and the
    original exception is re-raised.
    """
    rest_proc = None

    config.validate(initialized_tuner=False)
    _ensure_port_idle(port)

    if mode != 'view':
        training_service = config.training_service
        if isinstance(training_service, list):
            _ensure_port_idle(port + 1, 'Hybrid training service requires an additional port')
        elif training_service.platform in ('remote', 'openpai', 'kubeflow', 'frameworkcontroller', 'adl'):
            _ensure_port_idle(port + 1, f'{training_service.platform} requires an additional port')

    try:
        colored_id = colorama.Fore.CYAN + exp_id + colorama.Style.RESET_ALL
        _logger.info('Creating experiment, Experiment ID: %s', colored_id)
        start_time, rest_proc = _start_rest_server(config, port, debug, exp_id, mode=mode)

        _logger.info('Starting web server...')
        _check_rest_server(port)

        if isinstance(config.training_service, list):
            platform = 'hybrid'
        else:
            platform = config.training_service.platform
        _save_experiment_information(
            exp_id, port, start_time, platform,
            config.experiment_name, rest_proc.pid, str(config.experiment_working_directory)
        )

        _logger.info('Setting up...')
        rest.post(port, '/experiment', config.json())

    except Exception as err:
        _logger.error('Create experiment failed')
        if rest_proc is not None:
            # Best-effort cleanup: a failure to kill must not mask the real error.
            try:
                rest_proc.kill()
            except Exception:
                pass
        raise err

    return rest_proc
|
|
|
def start_experiment_retiarii(exp_id: str, config: ExperimentConfig, port: int, debug: bool) -> Tuple[Popen, Pipe]:
    """
    Launch the REST server for a Retiarii experiment, connected through an IPC pipe.

    Steps, in order: validate ``config``, check that ``port`` is idle (and
    ``port + 1`` for hybrid / listed remote platforms), create the pipe, start
    the REST server with the pipe path, connect the pipe and install the
    resulting file object as both ``nni.runtime.protocol._in_file`` and
    ``_out_file``, wait until the server answers, record the experiment, then
    POST the config to ``/experiment``.

    Returns a ``(proc, pipe)`` tuple: the REST server ``Popen`` handle and the
    ``Pipe`` created for this experiment.

    If anything fails after the port checks, an error is logged, the server
    process and the pipe (whichever were created) are cleaned up on a
    best-effort basis, and the original exception is re-raised.
    """
    pipe = None
    proc = None

    config.validate(initialized_tuner=True)
    _ensure_port_idle(port)
    if isinstance(config.training_service, list):
        _ensure_port_idle(port + 1, 'Hybrid training service requires an additional port')
    elif config.training_service.platform in ['remote', 'openpai', 'kubeflow', 'frameworkcontroller', 'adl']:
        _ensure_port_idle(port + 1, f'{config.training_service.platform} requires an additional port')

    try:
        _logger.info('Creating experiment, Experiment ID: %s', colorama.Fore.CYAN + exp_id + colorama.Style.RESET_ALL)
        pipe = Pipe(exp_id)
        start_time, proc = _start_rest_server(config, port, debug, exp_id, pipe.path)
        _logger.info('Connecting IPC pipe...')
        pipe_file = pipe.connect()
        # The same file object serves both directions of the dispatcher protocol.
        nni.runtime.protocol._in_file = pipe_file
        nni.runtime.protocol._out_file = pipe_file
        _logger.info('Starting web server...')
        _check_rest_server(port)
        platform = 'hybrid' if isinstance(config.training_service, list) else config.training_service.platform
        # logDir is declared as str; convert explicitly, matching start_experiment().
        _save_experiment_information(exp_id, port, start_time, platform,
                                     config.experiment_name, proc.pid, str(config.experiment_working_directory))
        _logger.info('Setting up...')
        rest.post(port, '/experiment', config.json())
        return proc, pipe

    except Exception:
        _logger.error('Create experiment failed')
        # Best-effort cleanup: failures here must not mask the real error.
        if proc is not None:
            with contextlib.suppress(Exception):
                proc.kill()
        if pipe is not None:
            with contextlib.suppress(Exception):
                pipe.close()
        raise
|
|
|
def _ensure_port_idle(port: int, message: Optional[str] = None) -> None: |
|
sock = socket.socket() |
|
if sock.connect_ex(('localhost', port)) == 0: |
|
sock.close() |
|
message = f'(message)' if message else '' |
|
raise RuntimeError(f'Port {port} is not idle {message}') |
|
|
|
|
|
def _start_rest_server(config: ExperimentConfig, port: int, debug: bool, experiment_id: str, pipe_path: str = None,
                       mode: str = 'new') -> Tuple[int, Popen]:
    """
    Spawn the Node.js process that runs ``main.js`` from the ``nni_node`` package.

    Command-line options are built from the arguments: the port, the training
    service name (``'hybrid'`` for a list of services, ``'pai'`` for
    ``'openpai'``), the experiment id, the start mode, the log directory and
    the log level. ``pipe_path``, when given, is passed as ``--dispatcher_pipe``.
    In ``'view'`` mode the server is started with start mode ``'resume'`` and
    ``--readonly true``.

    Returns ``(start_time, proc)`` where ``start_time`` is the wall-clock time
    in milliseconds taken right after the process was spawned.
    """
    training_service = config.training_service
    if isinstance(training_service, list):
        platform = 'hybrid'
    elif training_service.platform == 'openpai':
        platform = 'pai'
    else:
        platform = training_service.platform

    view_only = mode == 'view'

    # Insertion order matters: it is the order of the options on the command line.
    options = {
        'port': port,
        'mode': platform,
        'experiment_id': experiment_id,
        'start_mode': 'resume' if view_only else mode,
        'log_dir': config.experiment_working_directory,
        'log_level': 'debug' if debug else 'info'
    }
    if pipe_path is not None:
        options['dispatcher_pipe'] = pipe_path
    if view_only:
        options['readonly'] = 'true'

    node_dir = Path(nni_node.__path__[0])
    node_binary = 'node.exe' if sys.platform == 'win32' else 'node'
    cmd = [str(node_dir / node_binary), '--max-old-space-size=4096', str(node_dir / 'main.js')]
    for key, value in options.items():
        cmd += ['--' + key, str(value)]

    popen_kwargs = {'cwd': node_dir}
    if sys.platform == 'win32':
        from subprocess import CREATE_NEW_PROCESS_GROUP
        popen_kwargs['creationflags'] = CREATE_NEW_PROCESS_GROUP
    elif pipe_path is None:
        # Without a dispatcher pipe, the child calls setpgrp() before exec.
        import os
        popen_kwargs['preexec_fn'] = os.setpgrp

    proc = Popen(cmd, **popen_kwargs)
    return int(time.time() * 1000), proc
|
|
|
|
|
def _check_rest_server(port: int, retry: int = 3) -> None:
    """
    Wait until the REST server on ``port`` answers ``/check-status``.

    Makes up to ``retry`` attempts, ignoring any exception and sleeping one
    second after each failed attempt. A warning is logged for failed attempts
    other than the first. If every attempt fails, one final request is made
    whose exception, if any, propagates to the caller.
    """
    for attempt in range(retry):
        try:
            rest.get(port, '/check-status')
        except Exception:
            # Deliberately swallowed: the server may simply not be up yet.
            pass
        else:
            return

        if attempt > 0:
            _logger.warning('Timeout, retry...')
        time.sleep(1)

    # Last try, unguarded, so the caller sees the real failure.
    rest.get(port, '/check-status')
|
|
|
|
|
def _save_experiment_information(experiment_id: str, port: int, start_time: int, platform: str, name: str, pid: int, logDir: str) -> None:
    """
    Record an experiment through ``Experiments.add_experiment``.

    All arguments are forwarded unchanged; ``pid`` and ``logDir`` are passed
    as keyword arguments.
    """
    Experiments().add_experiment(
        experiment_id,
        port,
        start_time,
        platform,
        name,
        pid=pid,
        logDir=logDir,
    )
|
|
|
|
|
def get_stopped_experiment_config(exp_id: str, mode: str) -> Optional[ExperimentConfig]:
    """
    Load the saved configuration of a stopped experiment.

    Parameters
    ----------
    exp_id
        ID of the experiment to look up among the recorded experiments.
    mode
        Name of the requested action; only used in the error message
        (``'Only stopped experiments can be <mode>ed!'``).

    Returns
    -------
    Optional[ExperimentConfig]
        The experiment's config rebuilt from its stored form, or ``None``
        (after logging an error) when the ID is unknown or the experiment's
        status is not ``'STOPPED'``.
    """
    # Called before reading the records, so the status check below sees its result.
    update_experiment()
    experiments_dict = Experiments().get_all_experiments()
    experiment_metadata = experiments_dict.get(exp_id)
    if experiment_metadata is None:
        _logger.error('Id %s not exist!', exp_id)
        return None
    if experiment_metadata['status'] != 'STOPPED':
        _logger.error('Only stopped experiments can be %sed!', mode)
        return None
    experiment_config = Config(exp_id, experiment_metadata['logDir']).get_config()
    return ExperimentConfig(**experiment_config)
|
|