File size: 6,946 Bytes
b84549f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import contextlib
import logging
from pathlib import Path
import socket
from subprocess import Popen
import sys
import time
from typing import Optional, Tuple

import colorama

import nni_node  # pylint: disable=import-error
import nni.runtime.protocol

from .config import ExperimentConfig
from .pipe import Pipe
from . import rest
from ..tools.nnictl.config_utils import Experiments, Config
from ..tools.nnictl.nnictl_utils import update_experiment

_logger = logging.getLogger('nni.experiment')


def start_experiment(exp_id: str, config: ExperimentConfig, port: int, debug: bool, mode: str = 'new') -> Popen:
    proc = None

    config.validate(initialized_tuner=False)
    _ensure_port_idle(port)

    if mode != 'view':
        if isinstance(config.training_service, list): # hybrid training service
            _ensure_port_idle(port + 1, 'Hybrid training service requires an additional port')
        elif config.training_service.platform in ['remote', 'openpai', 'kubeflow', 'frameworkcontroller', 'adl']:
            _ensure_port_idle(port + 1, f'{config.training_service.platform} requires an additional port')

    try:
        _logger.info('Creating experiment, Experiment ID: %s', colorama.Fore.CYAN + exp_id + colorama.Style.RESET_ALL)
        start_time, proc = _start_rest_server(config, port, debug, exp_id, mode=mode)
        _logger.info('Starting web server...')
        _check_rest_server(port)
        platform = 'hybrid' if isinstance(config.training_service, list) else config.training_service.platform
        _save_experiment_information(exp_id, port, start_time, platform,
                                     config.experiment_name, proc.pid, str(config.experiment_working_directory))
        _logger.info('Setting up...')
        rest.post(port, '/experiment', config.json())
        return proc

    except Exception as e:
        _logger.error('Create experiment failed')
        if proc is not None:
            with contextlib.suppress(Exception):
                proc.kill()
        raise e

def start_experiment_retiarii(exp_id: str, config: ExperimentConfig, port: int, debug: bool) -> Popen:
    pipe = None
    proc = None

    config.validate(initialized_tuner=True)
    _ensure_port_idle(port)
    if isinstance(config.training_service, list): # hybrid training service
        _ensure_port_idle(port + 1, 'Hybrid training service requires an additional port')
    elif config.training_service.platform in ['remote', 'openpai', 'kubeflow', 'frameworkcontroller', 'adl']:
        _ensure_port_idle(port + 1, f'{config.training_service.platform} requires an additional port')

    try:
        _logger.info('Creating experiment, Experiment ID: %s', colorama.Fore.CYAN + exp_id + colorama.Style.RESET_ALL)
        pipe = Pipe(exp_id)
        start_time, proc = _start_rest_server(config, port, debug, exp_id, pipe.path)
        _logger.info('Connecting IPC pipe...')
        pipe_file = pipe.connect()
        nni.runtime.protocol._in_file = pipe_file
        nni.runtime.protocol._out_file = pipe_file
        _logger.info('Starting web server...')
        _check_rest_server(port)
        platform = 'hybrid' if isinstance(config.training_service, list) else config.training_service.platform
        _save_experiment_information(exp_id, port, start_time, platform,
                                     config.experiment_name, proc.pid, config.experiment_working_directory)
        _logger.info('Setting up...')
        rest.post(port, '/experiment', config.json())
        return proc, pipe

    except Exception as e:
        _logger.error('Create experiment failed')
        if proc is not None:
            with contextlib.suppress(Exception):
                proc.kill()
        if pipe is not None:
            with contextlib.suppress(Exception):
                pipe.close()
        raise e

def _ensure_port_idle(port: int, message: Optional[str] = None) -> None:
    sock = socket.socket()
    if sock.connect_ex(('localhost', port)) == 0:
        sock.close()
        message = f'(message)' if message else ''
        raise RuntimeError(f'Port {port} is not idle {message}')


def _start_rest_server(config: ExperimentConfig, port: int, debug: bool, experiment_id: str, pipe_path: str = None,
                       mode: str = 'new') -> Tuple[int, Popen]:
    if isinstance(config.training_service, list):
        ts = 'hybrid'
    else:
        ts = config.training_service.platform
        if ts == 'openpai':
            ts = 'pai'

    args = {
        'port': port,
        'mode': ts,
        'experiment_id': experiment_id,
        'start_mode': mode,
        'log_dir': config.experiment_working_directory,
        'log_level': 'debug' if debug else 'info'
    }
    if pipe_path is not None:
        args['dispatcher_pipe'] = pipe_path

    if mode == 'view':
        args['start_mode'] = 'resume'
        args['readonly'] = 'true'

    node_dir = Path(nni_node.__path__[0])
    node = str(node_dir / ('node.exe' if sys.platform == 'win32' else 'node'))
    main_js = str(node_dir / 'main.js')
    cmd = [node, '--max-old-space-size=4096', main_js]
    for arg_key, arg_value in args.items():
        cmd.append('--' + arg_key)
        cmd.append(str(arg_value))

    if sys.platform == 'win32':
        from subprocess import CREATE_NEW_PROCESS_GROUP
        proc = Popen(cmd, cwd=node_dir, creationflags=CREATE_NEW_PROCESS_GROUP)
    else:
        if pipe_path is None:
            import os
            proc = Popen(cmd, cwd=node_dir, preexec_fn=os.setpgrp)
        else:
            proc = Popen(cmd, cwd=node_dir)
    return int(time.time() * 1000), proc


def _check_rest_server(port: int, retry: int = 3) -> None:
    for i in range(retry):
        with contextlib.suppress(Exception):
            rest.get(port, '/check-status')
            return
        if i > 0:
            _logger.warning('Timeout, retry...')
        time.sleep(1)
    rest.get(port, '/check-status')


def _save_experiment_information(experiment_id: str, port: int, start_time: int, platform: str, name: str, pid: int, logDir: str) -> None:
    experiments_config = Experiments()
    experiments_config.add_experiment(experiment_id, port, start_time, platform, name, pid=pid, logDir=logDir)


def get_stopped_experiment_config(exp_id: str, mode: str) -> None:
    update_experiment()
    experiments_config = Experiments()
    experiments_dict = experiments_config.get_all_experiments()
    experiment_metadata = experiments_dict.get(exp_id)
    if experiment_metadata is None:
        _logger.error('Id %s not exist!', exp_id)
        return
    if experiment_metadata['status'] != 'STOPPED':
        _logger.error('Only stopped experiments can be %sed!', mode)
        return
    experiment_config = Config(exp_id, experiment_metadata['logDir']).get_config()
    config = ExperimentConfig(**experiment_config)
    return config