|
|
|
|
|
|
|
import copy |
|
import logging |
|
|
|
from .common import ExperimentConfig, AlgorithmConfig, CustomAlgorithmConfig |
|
from .remote import RemoteMachineConfig |
|
from .kubeflow import KubeflowRoleConfig, KubeflowNfsConfig, KubeflowAzureStorageConfig |
|
from .frameworkcontroller import FrameworkControllerRoleConfig |
|
from .shared_storage import NfsConfig, AzureBlobConfig |
|
from . import util |
|
|
|
_logger = logging.getLogger(__name__) |
|
|
|
def _to_v2_k8s_storage(k8s_config, ts):
    """Pop the storage fields of a kubeflow/frameworkcontroller section into ``ts.storage``.

    When ``storage`` is not given, NFS is chosen if an ``nfs`` entry exists,
    otherwise Azure storage. ``ts.storage`` is left untouched for any other
    storage name.
    """
    storage_name = k8s_config.pop('storage', None)
    if storage_name is None:
        storage_name = 'nfs' if 'nfs' in k8s_config else 'azureStorage'
    if storage_name == 'nfs':
        nfs = k8s_config.pop('nfs')
        ts.storage = KubeflowNfsConfig(server=nfs['server'], path=nfs['path'])
    if storage_name == 'azureStorage':
        key_vault = k8s_config.pop('keyVault')
        azure_storage = k8s_config.pop('azureStorage')
        ts.storage = KubeflowAzureStorageConfig(
            azure_account=azure_storage['accountName'],
            azure_share=azure_storage['azureShare'],
            key_vault=key_vault['vaultName'],
            key_vault_secret=key_vault['name']
        )


def _to_v2_local(v1, ts):
    """Convert the optional ``localConfig`` section."""
    local_config = v1.pop('localConfig', {})
    _move_field(local_config, ts, 'gpuIndices', 'gpu_indices')
    _move_field(local_config, ts, 'maxTrialNumPerGpu', 'max_trial_number_per_gpu')
    _move_field(local_config, ts, 'useActiveGpu', 'use_active_gpu')
    assert not local_config, local_config


def _to_v2_remote(v1, ts):
    """Convert the optional ``remoteConfig`` section and the required ``machineList``."""
    remote_config = v1.pop('remoteConfig', {})
    _move_field(remote_config, ts, 'reuse', 'reuse_mode')
    assert not remote_config, remote_config

    ts.machine_list = []
    for v1_machine in v1.pop('machineList'):
        v2_machine = RemoteMachineConfig()
        ts.machine_list.append(v2_machine)
        _move_field(v1_machine, v2_machine, 'ip', 'host')
        _move_field(v1_machine, v2_machine, 'port', 'port')
        _move_field(v1_machine, v2_machine, 'username', 'user')
        _move_field(v1_machine, v2_machine, 'sshKeyPath', 'ssh_key_file')
        _move_field(v1_machine, v2_machine, 'passphrase', 'ssh_passphrase')
        _move_field(v1_machine, v2_machine, 'gpuIndices', 'gpu_indices')
        _move_field(v1_machine, v2_machine, 'maxTrialNumPerGpu', 'max_trial_number_per_gpu')
        _move_field(v1_machine, v2_machine, 'useActiveGpu', 'use_active_gpu')
        _move_field(v1_machine, v2_machine, 'pythonPath', 'python_path')
        _move_field(v1_machine, v2_machine, 'passwd', 'password')
        assert not v1_machine, v1_machine


def _to_v2_openpai(v1, v1_trial, v2, ts):
    """Convert the OpenPAI-specific ``trial`` fields and the required ``paiConfig`` section."""
    _move_field(v1_trial, ts, 'nniManagerNFSMountPath', 'local_storage_mount_point')
    _move_field(v1_trial, ts, 'containerNFSMountPath', 'container_storage_mount_point')
    _move_field(v1_trial, ts, 'cpuNum', 'trial_cpu_number')
    if 'memoryMB' in v1_trial:
        ts.trial_memory_size = str(v1_trial.pop('memoryMB')) + 'mb'
    _move_field(v1_trial, ts, 'image', 'docker_image')
    _deprecate(v1_trial, v2, 'virtualCluster')
    _move_field(v1_trial, ts, 'paiStorageConfigName', 'storage_config_name')
    # v2 attributes are snake_case everywhere else in this converter; the
    # previous camelCase target only created a stray attribute.
    _move_field(v1_trial, ts, 'paiConfigPath', 'openpai_config_file')

    pai_config = v1.pop('paiConfig')
    _move_field(pai_config, ts, 'userName', 'username')
    _deprecate(pai_config, v2, 'password')
    _move_field(pai_config, ts, 'token', 'token')
    _move_field(pai_config, ts, 'host', 'host')
    _move_field(pai_config, ts, 'reuse', 'reuse_mode')
    # NOTE(review): trial.gpuNum is stored on the experiment config (v2) while
    # this one goes to the training service -- confirm which object owns
    # trial_gpu_number.
    _move_field(pai_config, ts, 'gpuNum', 'trial_gpu_number')
    _move_field(pai_config, ts, 'cpuNum', 'trial_cpu_number')
    if 'memoryMB' in pai_config:
        ts.trial_memory_size = str(pai_config.pop('memoryMB')) + 'mb'
    _deprecate(pai_config, v2, 'maxTrialNumPerGpu')
    _deprecate(pai_config, v2, 'useActiveGpu')
    assert not pai_config, pai_config


def _to_v2_aml(v1, v1_trial, v2, ts):
    """Convert the AML-specific ``trial`` field and the optional ``amlConfig`` section."""
    _move_field(v1_trial, ts, 'image', 'docker_image')

    aml_config = v1.pop('amlConfig', {})
    _move_field(aml_config, ts, 'subscriptionId', 'subscription_id')
    _move_field(aml_config, ts, 'resourceGroup', 'resource_group')
    _move_field(aml_config, ts, 'workspaceName', 'workspace_name')
    _move_field(aml_config, ts, 'computeTarget', 'compute_target')
    _move_field(aml_config, ts, 'maxTrialNumPerGpu', 'max_trial_number_per_gpu')
    _deprecate(aml_config, v2, 'useActiveGpu')
    assert not aml_config, aml_config


def _to_v2_kubeflow(v1, v1_trial, v2, ts):
    """Convert the required ``kubeflowConfig`` section and the per-role ``trial`` entries."""
    kf_config = v1.pop('kubeflowConfig')
    _move_field(kf_config, ts, 'operator', 'operator')
    # The non-worker role is called 'master' for pytorch-operator, 'ps' otherwise.
    ps_name = 'ps' if ts.operator != 'pytorch-operator' else 'master'
    _move_field(kf_config, ts, 'apiVersion', 'api_version')
    _to_v2_k8s_storage(kf_config, ts)
    _deprecate(kf_config, v2, 'uploadRetryCount')
    assert not kf_config, kf_config

    _drop_field(v1_trial, 'nasMode')
    for role_name in [ps_name, 'worker']:
        if role_name not in v1_trial:
            continue
        v1_role = v1_trial.pop(role_name)
        v2_role = KubeflowRoleConfig()
        if role_name == 'worker':
            ts.worker = v2_role
        else:
            ts.parameter_server = v2_role

        _move_field(v1_role, v2_role, 'replicas', 'replicas')
        _move_field(v1_role, v2_role, 'command', 'command')
        # Every other v1 key handled here is camelCase (the frameworkcontroller
        # roles use gpuNum/cpuNum); the snake_case spellings are still accepted
        # and the camelCase value wins when both are present.
        _move_field(v1_role, v2_role, 'gpu_num', 'gpu_number')
        _move_field(v1_role, v2_role, 'gpuNum', 'gpu_number')
        _move_field(v1_role, v2_role, 'cpu_num', 'cpu_number')
        _move_field(v1_role, v2_role, 'cpuNum', 'cpu_number')
        v2_role.memory_size = str(v1_role.pop('memoryMB')) + 'mb'
        _move_field(v1_role, v2_role, 'image', 'docker_image')
        _deprecate(v1_role, v2, 'privateRegistryAuthPath')
        assert not v1_role, v1_role


def _to_v2_frameworkcontroller(v1, v1_trial, v2, ts):
    """Convert the frameworkcontroller section and the ``trial.taskRoles`` list."""
    # All sibling platform sections are named '<platform>Config'; accept that
    # spelling as well as the bare name this converter used to look up.
    if 'frameworkcontrollerConfig' in v1:
        fc_config = v1.pop('frameworkcontrollerConfig')
    else:
        fc_config = v1.pop('frameworkcontroller')
    _deprecate(fc_config, v2, 'serviceAccountName')
    _to_v2_k8s_storage(fc_config, ts)
    _deprecate(fc_config, v2, 'uploadRetryCount')
    assert not fc_config, fc_config

    _drop_field(v1_trial, 'nasMode')
    ts.task_roles = []
    for v1_role in v1_trial.pop('taskRoles', []):
        v2_role = FrameworkControllerRoleConfig()
        ts.task_roles.append(v2_role)

        _move_field(v1_role, v2_role, 'name', 'name')
        _move_field(v1_role, v2_role, 'taskNum', 'task_number')
        policy = v1_role.pop('frameworkControllerCompletionPolicy', {})
        _move_field(policy, v2_role, 'minFailedTaskCount', 'attempt_completion_min_failed_tasks')
        _move_field(policy, v2_role, 'minSucceededTaskCount', 'attempt_completion_min_succeeded_tasks')
        _move_field(v1_role, v2_role, 'command', 'command')
        _move_field(v1_role, v2_role, 'gpuNum', 'gpu_number')
        _move_field(v1_role, v2_role, 'cpuNum', 'cpu_number')
        v2_role.memory_size = str(v1_role.pop('memoryMB')) + 'mb'
        _move_field(v1_role, v2_role, 'image', 'docker_image')
        _deprecate(v1_role, v2, 'privateRegistryAuthPath')
        assert not v1_role, v1_role


def to_v2(v1) -> ExperimentConfig:
    """Convert a v1 experiment configuration dict into a v2 ``ExperimentConfig``.

    The input is deep-copied, then every recognised field is popped from the
    copy as it is converted. Whatever remains at the end is an unrecognised
    field and fails the final assertions.

    Parameters
    ----------
    v1 : dict
        Legacy configuration. Must contain ``trainingServicePlatform``; it is
        not modified.

    Returns
    -------
    The result of ``ExperimentConfig.canonical()`` on the converted config.

    Raises
    ------
    AssertionError
        Unknown platform, ``multiPhase`` enabled, or unrecognised fields left
        over in any section. (These checks are ``assert`` statements, so they
        are skipped under ``python -O``.)
    KeyError
        A section that is required for the platform is missing
        (e.g. ``machineList``, ``paiConfig``, ``kubeflowConfig``).
    ValueError
        ``sharedStorage.storageType`` is neither ``NFS`` nor ``AzureBlob``.
    """
    v1 = copy.deepcopy(v1)
    platform = v1.pop('trainingServicePlatform')
    # kubeflow and frameworkcontroller have converters below, so they must be
    # admitted here; previously the assertion made those branches unreachable.
    assert platform in ['local', 'remote', 'openpai', 'aml', 'kubeflow', 'frameworkcontroller']
    v2 = ExperimentConfig(platform)

    _drop_field(v1, 'authorName')
    _move_field(v1, v2, 'experimentName', 'experiment_name')
    _drop_field(v1, 'description')
    _move_field(v1, v2, 'trialConcurrency', 'trial_concurrency')
    _move_field(v1, v2, 'maxExecDuration', 'max_experiment_duration')
    # A bare number is given an explicit seconds suffix.
    if isinstance(v2.max_experiment_duration, (int, float)):
        v2.max_experiment_duration = str(v2.max_experiment_duration) + 's'
    _move_field(v1, v2, 'maxTrialNum', 'max_trial_number')
    _move_field(v1, v2, 'searchSpacePath', 'search_space_file')
    # Pop outside the assert so the field is consumed even when asserts are disabled.
    multi_phase = v1.pop('multiPhase', None)
    assert not multi_phase, 'Multi-phase is no longer supported'
    _deprecate(v1, v2, 'multiThread')
    _move_field(v1, v2, 'nniManagerIp', 'nni_manager_ip')
    _move_field(v1, v2, 'logDir', 'experiment_working_directory')
    _move_field(v1, v2, 'debug', 'debug')
    _deprecate(v1, v2, 'versionCheck')
    _move_field(v1, v2, 'logLevel', 'log_level')
    _deprecate(v1, v2, 'logCollection')
    v1.pop('useAnnotation', None)

    # Always bind v1_trial: the platform converters and the final leftover
    # check read it even when the config has no (or a null) trial section.
    v1_trial = v1.pop('trial', None) or {}
    _move_field(v1_trial, v2, 'command', 'trial_command')
    _move_field(v1_trial, v2, 'codeDir', 'trial_code_directory')
    _move_field(v1_trial, v2, 'gpuNum', 'trial_gpu_number')

    for algo_type in ['tuner', 'assessor', 'advisor']:
        if algo_type in v1:
            convert_algo(algo_type, v1, v2)

    ts = v2.training_service

    if platform == 'local':
        _to_v2_local(v1, ts)
    elif platform == 'remote':
        _to_v2_remote(v1, ts)
    elif platform == 'openpai':
        _to_v2_openpai(v1, v1_trial, v2, ts)
    elif platform == 'aml':
        _to_v2_aml(v1, v1_trial, v2, ts)
    elif platform == 'kubeflow':
        _to_v2_kubeflow(v1, v1_trial, v2, ts)
    elif platform == 'frameworkcontroller':
        _to_v2_frameworkcontroller(v1, v1_trial, v2, ts)

    v1_storage = v1.pop('sharedStorage', None)
    if v1_storage:
        type_ = v1_storage.pop('storageType')
        # The remaining keys are forwarded unchanged as constructor arguments.
        if type_ == 'NFS':
            v2.shared_storage = NfsConfig(**v1_storage)
        elif type_ == 'AzureBlob':
            v2.shared_storage = AzureBlobConfig(**v1_storage)
        else:
            raise ValueError(f'bad storage type: {type_}')

    # Anything still present was not recognised by any converter above.
    assert not v1_trial, v1_trial
    assert not v1, v1
    return v2.canonical()
|
|
|
def _move_field(v1, v2, v1_key, v2_key):
    """Transfer one entry from the mapping ``v1`` to an attribute of ``v2``.

    If ``v1_key`` is present it is always removed from ``v1``; the attribute
    ``v2_key`` is assigned only when the removed value is not ``None``.
    Nothing happens when the key is absent.
    """
    if v1_key not in v1:
        return
    legacy_value = v1.pop(v1_key)
    if legacy_value is None:
        # Explicit nulls are consumed but never copied over.
        return
    setattr(v2, v2_key, legacy_value)
|
|
|
def _drop_field(v1, key):
    """Remove an unsupported field from ``v1`` and warn that it was ignored.

    Does nothing when ``key`` is absent. The warning goes through this
    module's ``_logger`` rather than the root-level ``logging.warning``,
    which would implicitly configure the root logger when it has no handlers.
    """
    if key in v1:
        # Lazy %-formatting; the rendered message is unchanged.
        _logger.warning('Configuration field %s is no longer supported and has been ignored', key)
        v1.pop(key)
|
|
|
|
|
def _deprecate(v1, v2, key):
    """Stash a deprecated field of ``v1`` in ``v2._deprecated``.

    When ``key`` is present it is removed from ``v1`` and recorded under the
    same name in the ``v2._deprecated`` dict, which is created on first use
    (it is expected to start out as ``None``). Absent keys are ignored.
    """
    if key not in v1:
        return
    if v2._deprecated is None:
        v2._deprecated = {}
    v2._deprecated[key] = v1.pop(key)
|
|
|
def convert_algo(algo_type, v1, v2):
    """Convert the ``algo_type`` section of ``v1`` and attach it to ``v2``.

    ``algo_type`` is the section name (``to_v2`` passes 'tuner', 'assessor'
    and 'advisor'). The section is removed from ``v1``. A section carrying
    ``builtin<AlgoType>Name`` becomes an ``AlgorithmConfig``; otherwise
    ``codeDir``, ``classFileName`` and ``className`` are required and a
    ``CustomAlgorithmConfig`` is built. The result is stored on ``v2`` under
    the attribute named ``algo_type`` and returned.

    Returns ``None`` without touching anything when ``v1`` has no such
    section. Raises ``AssertionError`` if the class file name does not end
    in '.py' or if unrecognised fields remain in the section.
    """
    if algo_type not in v1:
        return None
    legacy = v1.pop(algo_type)

    builtin_key = 'builtin{}Name'.format(algo_type.title())
    name = legacy.pop(builtin_key, None)
    args = legacy.pop('classArgs', None)

    if name is None:
        # Custom algorithm: locate it by code directory and '<module>.<class>'.
        directory = util.canonical_path(legacy.pop('codeDir'))
        file_name = legacy.pop('classFileName')
        assert file_name.endswith('.py')
        module_name = file_name[:-3]
        qualified_name = module_name + '.' + legacy.pop('className')
        converted = CustomAlgorithmConfig(
            class_name=qualified_name,
            code_directory=directory,
            class_args=args
        )
    else:
        converted = AlgorithmConfig(name=name, class_args=args)

    setattr(v2, algo_type, converted)
    _deprecate(legacy, v2, 'includeIntermediateResults')
    # gpuIndices is stored as tuner_gpu_indices whatever the algo_type is.
    _move_field(legacy, v2, 'gpuIndices', 'tuner_gpu_indices')
    assert not legacy, legacy
    return converted
|
|