add support for multi-GPU training and evaluation
Browse files- README.md +13 -0
- configs/metadata.json +2 -1
- configs/multi_gpu_evaluate.json +28 -0
- configs/multi_gpu_train.json +40 -0
- docs/README.md +13 -0
README.md
CHANGED
@@ -58,12 +58,25 @@ For more detailed usage instructions, visit the [MONAI Bundle Configuration Page]
|
|
58 |
python -m monai.bundle run training --meta_file configs/metadata.json --config_file configs/train.json --logging_file configs/logging.conf
|
59 |
```
|
60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
#### Override the `train` config to execute evaluation with the trained model:
|
62 |
|
63 |
```
|
64 |
python -m monai.bundle run evaluating --meta_file configs/metadata.json --config_file "['configs/train.json','configs/evaluate.json']" --logging_file configs/logging.conf
|
65 |
```
|
66 |
|
|
|
|
|
|
|
|
|
|
|
67 |
#### Execute inference:
|
68 |
|
69 |
```
|
|
|
58 |
python -m monai.bundle run training --meta_file configs/metadata.json --config_file configs/train.json --logging_file configs/logging.conf
|
59 |
```
|
60 |
|
61 |
+
#### Override the `train` config to execute multi-GPU training:
|
62 |
+
|
63 |
+
```
|
64 |
+
torchrun --standalone --nnodes=1 --nproc_per_node=2 -m monai.bundle run training --meta_file configs/metadata.json --config_file "['configs/train.json','configs/multi_gpu_train.json']" --logging_file configs/logging.conf
|
65 |
+
```
|
66 |
+
|
67 |
+
Please note that the distributed-training options depend on the actual running environment; users may therefore need to remove `--standalone`, modify `--nnodes`, or make other necessary changes according to the machine used. For more details, please refer to [PyTorch's official tutorial](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html).
|
68 |
+
|
69 |
#### Override the `train` config to execute evaluation with the trained model:
|
70 |
|
71 |
```
|
72 |
python -m monai.bundle run evaluating --meta_file configs/metadata.json --config_file "['configs/train.json','configs/evaluate.json']" --logging_file configs/logging.conf
|
73 |
```
|
74 |
|
75 |
+
#### Override the `train` config and `evaluate` config to execute multi-GPU evaluation:
|
76 |
+
|
77 |
+
```
|
78 |
+
torchrun --standalone --nnodes=1 --nproc_per_node=2 -m monai.bundle run evaluating --meta_file configs/metadata.json --config_file "['configs/train.json','configs/evaluate.json','configs/multi_gpu_evaluate.json']" --logging_file configs/logging.conf
|
79 |
+
```
|
80 |
#### Execute inference:
|
81 |
|
82 |
```
|
configs/metadata.json
CHANGED
@@ -1,7 +1,8 @@
|
|
1 |
{
|
2 |
"schema": "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/meta_schema_20220324.json",
|
3 |
-
"version": "0.3.2",
|
4 |
"changelog": {
|
|
|
5 |
"0.3.2": "restructure readme to match updated template",
|
6 |
"0.3.1": "add figures of workflow and metrics, add invert transform",
|
7 |
"0.3.0": "update dataset processing",
|
|
|
1 |
{
|
2 |
"schema": "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/meta_schema_20220324.json",
|
3 |
+
"version": "0.4.0",
|
4 |
"changelog": {
|
5 |
+
"0.4.0": "add support for multi-GPU training and evaluation",
|
6 |
"0.3.2": "restructure readme to match updated template",
|
7 |
"0.3.1": "add figures of workflow and metrics, add invert transform",
|
8 |
"0.3.0": "update dataset processing",
|
configs/multi_gpu_evaluate.json
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"device": "$torch.device(f'cuda:{dist.get_rank()}')",
|
3 |
+
"network": {
|
4 |
+
"_target_": "torch.nn.parallel.DistributedDataParallel",
|
5 |
+
"module": "$@network_def.to(@device)",
|
6 |
+
"device_ids": [
|
7 |
+
"@device"
|
8 |
+
]
|
9 |
+
},
|
10 |
+
"validate#sampler": {
|
11 |
+
"_target_": "DistributedSampler",
|
12 |
+
"dataset": "@validate#dataset",
|
13 |
+
"even_divisible": false,
|
14 |
+
"shuffle": false
|
15 |
+
},
|
16 |
+
"validate#dataloader#sampler": "@validate#sampler",
|
17 |
+
"validate#handlers#1#_disabled_": "$dist.get_rank() > 0",
|
18 |
+
"evaluating": [
|
19 |
+
"$import torch.distributed as dist",
|
20 |
+
"$dist.init_process_group(backend='nccl')",
|
21 |
+
"$torch.cuda.set_device(@device)",
|
22 |
+
"$setattr(torch.backends.cudnn, 'benchmark', True)",
|
23 |
+
"$import logging",
|
24 |
+
"$@validate#evaluator.logger.setLevel(logging.WARNING if dist.get_rank() > 0 else logging.INFO)",
|
25 |
+
"$@validate#evaluator.run()",
|
26 |
+
"$dist.destroy_process_group()"
|
27 |
+
]
|
28 |
+
}
|
configs/multi_gpu_train.json
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"device": "$torch.device(f'cuda:{dist.get_rank()}')",
|
3 |
+
"network": {
|
4 |
+
"_target_": "torch.nn.parallel.DistributedDataParallel",
|
5 |
+
"module": "$@network_def.to(@device)",
|
6 |
+
"device_ids": [
|
7 |
+
"@device"
|
8 |
+
],
|
9 |
+
"find_unused_parameters": true
|
10 |
+
},
|
11 |
+
"train#sampler": {
|
12 |
+
"_target_": "DistributedSampler",
|
13 |
+
"dataset": "@train#dataset",
|
14 |
+
"even_divisible": true,
|
15 |
+
"shuffle": true
|
16 |
+
},
|
17 |
+
"train#dataloader#sampler": "@train#sampler",
|
18 |
+
"train#dataloader#shuffle": false,
|
19 |
+
"train#trainer#train_handlers": "$@train#handlers[: -2 if dist.get_rank() > 0 else None]",
|
20 |
+
"validate#sampler": {
|
21 |
+
"_target_": "DistributedSampler",
|
22 |
+
"dataset": "@validate#dataset",
|
23 |
+
"even_divisible": false,
|
24 |
+
"shuffle": false
|
25 |
+
},
|
26 |
+
"validate#dataloader#sampler": "@validate#sampler",
|
27 |
+
"validate#evaluator#val_handlers": "$None if dist.get_rank() > 0 else @validate#handlers",
|
28 |
+
"training": [
|
29 |
+
"$import torch.distributed as dist",
|
30 |
+
"$dist.init_process_group(backend='nccl')",
|
31 |
+
"$torch.cuda.set_device(@device)",
|
32 |
+
"$monai.utils.set_determinism(seed=123)",
|
33 |
+
"$setattr(torch.backends.cudnn, 'benchmark', True)",
|
34 |
+
"$import logging",
|
35 |
+
"$@train#trainer.logger.setLevel(logging.WARNING if dist.get_rank() > 0 else logging.INFO)",
|
36 |
+
"$@validate#evaluator.logger.setLevel(logging.WARNING if dist.get_rank() > 0 else logging.INFO)",
|
37 |
+
"$@train#trainer.run()",
|
38 |
+
"$dist.destroy_process_group()"
|
39 |
+
]
|
40 |
+
}
|
docs/README.md
CHANGED
@@ -51,12 +51,25 @@ For more detailed usage instructions, visit the [MONAI Bundle Configuration Page]
|
|
51 |
python -m monai.bundle run training --meta_file configs/metadata.json --config_file configs/train.json --logging_file configs/logging.conf
|
52 |
```
|
53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
#### Override the `train` config to execute evaluation with the trained model:
|
55 |
|
56 |
```
|
57 |
python -m monai.bundle run evaluating --meta_file configs/metadata.json --config_file "['configs/train.json','configs/evaluate.json']" --logging_file configs/logging.conf
|
58 |
```
|
59 |
|
|
|
|
|
|
|
|
|
|
|
60 |
#### Execute inference:
|
61 |
|
62 |
```
|
|
|
51 |
python -m monai.bundle run training --meta_file configs/metadata.json --config_file configs/train.json --logging_file configs/logging.conf
|
52 |
```
|
53 |
|
54 |
+
#### Override the `train` config to execute multi-GPU training:
|
55 |
+
|
56 |
+
```
|
57 |
+
torchrun --standalone --nnodes=1 --nproc_per_node=2 -m monai.bundle run training --meta_file configs/metadata.json --config_file "['configs/train.json','configs/multi_gpu_train.json']" --logging_file configs/logging.conf
|
58 |
+
```
|
59 |
+
|
60 |
+
Please note that the distributed-training options depend on the actual running environment; users may therefore need to remove `--standalone`, modify `--nnodes`, or make other necessary changes according to the machine used. For more details, please refer to [PyTorch's official tutorial](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html).
|
61 |
+
|
62 |
#### Override the `train` config to execute evaluation with the trained model:
|
63 |
|
64 |
```
|
65 |
python -m monai.bundle run evaluating --meta_file configs/metadata.json --config_file "['configs/train.json','configs/evaluate.json']" --logging_file configs/logging.conf
|
66 |
```
|
67 |
|
68 |
+
#### Override the `train` config and `evaluate` config to execute multi-GPU evaluation:
|
69 |
+
|
70 |
+
```
|
71 |
+
torchrun --standalone --nnodes=1 --nproc_per_node=2 -m monai.bundle run evaluating --meta_file configs/metadata.json --config_file "['configs/train.json','configs/evaluate.json','configs/multi_gpu_evaluate.json']" --logging_file configs/logging.conf
|
72 |
+
```
|
73 |
#### Execute inference:
|
74 |
|
75 |
```
|