LINC-BIT's picture
Upload 1912 files
b84549f verified
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import contextlib
import logging
from pathlib import Path
import socket
from subprocess import Popen
import sys
import time
from typing import Optional, Tuple
import colorama
import nni_node # pylint: disable=import-error
import nni.runtime.protocol
from .config import ExperimentConfig
from .pipe import Pipe
from . import rest
from ..tools.nnictl.config_utils import Experiments, Config
from ..tools.nnictl.nnictl_utils import update_experiment
_logger = logging.getLogger('nni.experiment')
def start_experiment(exp_id: str, config: ExperimentConfig, port: int, debug: bool, mode: str = 'new') -> Popen:
    """
    Start the REST server process for an experiment and post the config to it.

    Parameters
    ----------
    exp_id
        Experiment ID, used for logging, launching and bookkeeping.
    config
        Experiment configuration; validated with ``initialized_tuner=False``.
    port
        Port of the REST server. Must be idle.
    debug
        Forwarded to ``_start_rest_server`` to select the log level.
    mode
        Start mode forwarded to ``_start_rest_server``. In ``'view'`` mode
        the additional-port check is skipped.

    Returns
    -------
    The ``Popen`` object of the launched REST server process.

    Raises
    ------
    RuntimeError
        If a required port is already in use.
    Exception
        Any error raised during start-up is re-raised after the launched
        process (if any) has been killed on a best-effort basis.
    """
    proc = None

    config.validate(initialized_tuner=False)
    _ensure_port_idle(port)

    if mode != 'view':
        # Some training services need ``port + 1`` as well; work out why (if at all).
        training_service = config.training_service
        if isinstance(training_service, list):  # hybrid training service
            extra_port_reason = 'Hybrid training service requires an additional port'
        elif training_service.platform in ('remote', 'openpai', 'kubeflow', 'frameworkcontroller', 'adl'):
            extra_port_reason = f'{training_service.platform} requires an additional port'
        else:
            extra_port_reason = None
        if extra_port_reason is not None:
            _ensure_port_idle(port + 1, extra_port_reason)

    try:
        colored_id = colorama.Fore.CYAN + exp_id + colorama.Style.RESET_ALL
        _logger.info('Creating experiment, Experiment ID: %s', colored_id)
        start_time, proc = _start_rest_server(config, port, debug, exp_id, mode=mode)

        _logger.info('Starting web server...')
        _check_rest_server(port)

        if isinstance(config.training_service, list):
            platform = 'hybrid'
        else:
            platform = config.training_service.platform
        _save_experiment_information(
            exp_id,
            port,
            start_time,
            platform,
            config.experiment_name,
            proc.pid,
            str(config.experiment_working_directory),
        )

        _logger.info('Setting up...')
        rest.post(port, '/experiment', config.json())
        return proc

    except Exception as err:
        _logger.error('Create experiment failed')
        # Best-effort cleanup: never let a failing kill() mask the real error.
        if proc is not None:
            with contextlib.suppress(Exception):
                proc.kill()
        raise err
def start_experiment_retiarii(exp_id: str, config: ExperimentConfig, port: int, debug: bool) -> Tuple[Popen, Pipe]:
    """
    Start the REST server for a Retiarii experiment and connect its IPC pipe.

    Besides launching the REST server, this creates a ``Pipe`` for the
    experiment, connects it, and installs the connected file object as both
    ``nni.runtime.protocol._in_file`` and ``nni.runtime.protocol._out_file``.

    Parameters
    ----------
    exp_id
        Experiment ID, used for logging, the pipe, launching and bookkeeping.
    config
        Experiment configuration; validated with ``initialized_tuner=True``.
    port
        Port of the REST server. Must be idle.
    debug
        Forwarded to ``_start_rest_server`` to select the log level.

    Returns
    -------
    A ``(proc, pipe)`` tuple: the ``Popen`` object of the REST server process
    and the ``Pipe`` created for it.

    Raises
    ------
    RuntimeError
        If a required port is already in use.
    Exception
        Any error raised during start-up is re-raised after the launched
        process has been killed and the pipe closed, both best-effort.
    """
    pipe = None
    proc = None

    config.validate(initialized_tuner=True)
    _ensure_port_idle(port)
    if isinstance(config.training_service, list):  # hybrid training service
        _ensure_port_idle(port + 1, 'Hybrid training service requires an additional port')
    elif config.training_service.platform in ['remote', 'openpai', 'kubeflow', 'frameworkcontroller', 'adl']:
        _ensure_port_idle(port + 1, f'{config.training_service.platform} requires an additional port')

    try:
        _logger.info('Creating experiment, Experiment ID: %s', colorama.Fore.CYAN + exp_id + colorama.Style.RESET_ALL)
        pipe = Pipe(exp_id)
        start_time, proc = _start_rest_server(config, port, debug, exp_id, pipe.path)
        _logger.info('Connecting IPC pipe...')
        pipe_file = pipe.connect()
        # The same connected file object serves as both input and output.
        nni.runtime.protocol._in_file = pipe_file
        nni.runtime.protocol._out_file = pipe_file
        _logger.info('Starting web server...')
        _check_rest_server(port)
        platform = 'hybrid' if isinstance(config.training_service, list) else config.training_service.platform
        # Stringify the working directory, matching start_experiment and the
        # ``logDir: str`` annotation of _save_experiment_information.
        _save_experiment_information(exp_id, port, start_time, platform,
                                     config.experiment_name, proc.pid, str(config.experiment_working_directory))
        _logger.info('Setting up...')
        rest.post(port, '/experiment', config.json())
        return proc, pipe

    except Exception:
        _logger.error('Create experiment failed')
        # Best-effort cleanup: a failure while cleaning up must not hide the
        # original error, which is re-raised unchanged below.
        if proc is not None:
            with contextlib.suppress(Exception):
                proc.kill()
        if pipe is not None:
            with contextlib.suppress(Exception):
                pipe.close()
        raise
def _ensure_port_idle(port: int, message: Optional[str] = None) -> None:
    """
    Raise if something is already listening on ``localhost:port``.

    Parameters
    ----------
    port
        TCP port to probe.
    message
        Optional explanation appended (in parentheses) to the error text,
        e.g. why this particular port is needed.

    Raises
    ------
    RuntimeError
        If a connection to ``localhost:port`` succeeds, i.e. the port is in use.
    """
    # The context manager closes the probe socket on every path, including
    # the "port is idle" path where nothing is raised.
    with socket.socket() as sock:
        in_use = sock.connect_ex(('localhost', port)) == 0
    if in_use:
        detail = f'({message})' if message else ''
        raise RuntimeError(f'Port {port} is not idle {detail}')
def _start_rest_server(config: ExperimentConfig, port: int, debug: bool, experiment_id: str, pipe_path: str = None,
                       mode: str = 'new') -> Tuple[int, Popen]:
    """
    Spawn the Node.js process bundled in ``nni_node`` with the experiment's options.

    Parameters
    ----------
    config
        Experiment configuration; supplies the training service and the
        working directory (passed as ``--log_dir``).
    port
        Passed as ``--port``.
    debug
        Selects ``--log_level debug`` when true, ``info`` otherwise.
    experiment_id
        Passed as ``--experiment_id``.
    pipe_path
        When given, passed as ``--dispatcher_pipe``.
    mode
        Passed as ``--start_mode``; ``'view'`` is translated to
        ``--start_mode resume`` plus ``--readonly true``.

    Returns
    -------
    A tuple of the launch timestamp in milliseconds (taken right after the
    process is spawned) and the ``Popen`` object.
    """
    training_service = config.training_service
    if isinstance(training_service, list):
        platform = 'hybrid'
    elif training_service.platform == 'openpai':
        # The command line uses 'pai' for the 'openpai' platform.
        platform = 'pai'
    else:
        platform = training_service.platform

    viewing = mode == 'view'
    # Insertion order determines the order of the command line arguments.
    cli_options = {
        'port': port,
        'mode': platform,
        'experiment_id': experiment_id,
        'start_mode': 'resume' if viewing else mode,
        'log_dir': config.experiment_working_directory,
        'log_level': 'debug' if debug else 'info',
    }
    if pipe_path is not None:
        cli_options['dispatcher_pipe'] = pipe_path
    if viewing:
        cli_options['readonly'] = 'true'

    node_dir = Path(nni_node.__path__[0])
    node_binary = 'node.exe' if sys.platform == 'win32' else 'node'
    command = [str(node_dir / node_binary), '--max-old-space-size=4096', str(node_dir / 'main.js')]
    for key, value in cli_options.items():
        command += ['--' + key, str(value)]

    if sys.platform == 'win32':
        # Only importable on Windows, hence the local import.
        from subprocess import CREATE_NEW_PROCESS_GROUP
        popen_options = {'creationflags': CREATE_NEW_PROCESS_GROUP}
    elif pipe_path is None:
        # Without a dispatcher pipe the child is moved into its own process group.
        import os
        popen_options = {'preexec_fn': os.setpgrp}
    else:
        popen_options = {}

    proc = Popen(command, cwd=node_dir, **popen_options)
    return int(time.time() * 1000), proc
def _check_rest_server(port: int, retry: int = 3) -> None:
    """
    Poll ``/check-status`` on the REST server until it answers.

    Up to ``retry`` attempts are made with errors swallowed, sleeping one
    second after each failure. If all of them fail, one final request is
    made whose exception (if any) propagates to the caller.

    Parameters
    ----------
    port
        Port of the REST server.
    retry
        Number of error-tolerant attempts before the final, unguarded one.
    """
    for attempt in range(retry):
        try:
            rest.get(port, '/check-status')
        except Exception:
            pass
        else:
            return
        # The very first failure is not reported; later ones are.
        if attempt > 0:
            _logger.warning('Timeout, retry...')
        time.sleep(1)
    # Last chance: let any error surface.
    rest.get(port, '/check-status')
def _save_experiment_information(experiment_id: str, port: int, start_time: int, platform: str, name: str, pid: int, logDir: str) -> None:
    """
    Register an experiment through ``Experiments.add_experiment``.

    All arguments are forwarded unchanged; ``pid`` and ``logDir`` are passed
    as keyword arguments.
    """
    Experiments().add_experiment(
        experiment_id,
        port,
        start_time,
        platform,
        name,
        pid=pid,
        logDir=logDir,
    )
def get_stopped_experiment_config(exp_id: str, mode: str) -> Optional[ExperimentConfig]:
    """
    Load the configuration of a stopped experiment.

    Parameters
    ----------
    exp_id
        ID of the experiment to look up.
    mode
        Name of the requested action; only used to build the error message
        (``'Only stopped experiments can be <mode>ed!'``).

    Returns
    -------
    The ``ExperimentConfig`` rebuilt from the stored config, or ``None`` (after
    logging an error) when the ID is unknown or the experiment's status is not
    ``'STOPPED'``.
    """
    update_experiment()
    experiment_metadata = Experiments().get_all_experiments().get(exp_id)
    if experiment_metadata is None:
        _logger.error('Id %s not exist!', exp_id)
        return None
    if experiment_metadata['status'] != 'STOPPED':
        _logger.error('Only stopped experiments can be %sed!', mode)
        return None
    experiment_config = Config(exp_id, experiment_metadata['logDir']).get_config()
    return ExperimentConfig(**experiment_config)