monai
medical
katielink committed on
Commit
378b33d
·
1 Parent(s): 4c6d8ac

add support for multi-GPU training and evaluation

Browse files
README.md CHANGED
@@ -58,12 +58,25 @@ For more detailed usage instructions, visit the [MONAI Bundle Configuration Page]
58
  python -m monai.bundle run training --meta_file configs/metadata.json --config_file configs/train.json --logging_file configs/logging.conf
59
  ```
60
 
 
 
 
 
 
 
 
 
61
  #### Override the `train` config to execute evaluation with the trained model:
62
 
63
  ```
64
  python -m monai.bundle run evaluating --meta_file configs/metadata.json --config_file "['configs/train.json','configs/evaluate.json']" --logging_file configs/logging.conf
65
  ```
66
 
 
 
 
 
 
67
  #### Execute inference:
68
 
69
  ```
 
58
  python -m monai.bundle run training --meta_file configs/metadata.json --config_file configs/train.json --logging_file configs/logging.conf
59
  ```
60
 
61
+ #### Override the `train` config to execute multi-GPU training:
62
+
63
+ ```
64
+ torchrun --standalone --nnodes=1 --nproc_per_node=2 -m monai.bundle run training --meta_file configs/metadata.json --config_file "['configs/train.json','configs/multi_gpu_train.json']" --logging_file configs/logging.conf
65
+ ```
66
+
67
+ Please note that the distributed training-related options depend on the actual running environment; thus, users may need to remove `--standalone`, modify `--nnodes`, or make other necessary changes according to the machine used. For more details, please refer to [PyTorch's official tutorial](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html).
68
+
69
  #### Override the `train` config to execute evaluation with the trained model:
70
 
71
  ```
72
  python -m monai.bundle run evaluating --meta_file configs/metadata.json --config_file "['configs/train.json','configs/evaluate.json']" --logging_file configs/logging.conf
73
  ```
74
 
75
+ #### Override the `train` config and `evaluate` config to execute multi-GPU evaluation:
76
+
77
+ ```
78
+ torchrun --standalone --nnodes=1 --nproc_per_node=2 -m monai.bundle run evaluating --meta_file configs/metadata.json --config_file "['configs/train.json','configs/evaluate.json','configs/multi_gpu_evaluate.json']" --logging_file configs/logging.conf
79
+ ```
80
  #### Execute inference:
81
 
82
  ```
configs/metadata.json CHANGED
@@ -1,7 +1,8 @@
1
  {
2
  "schema": "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/meta_schema_20220324.json",
3
- "version": "0.3.2",
4
  "changelog": {
 
5
  "0.3.2": "restructure readme to match updated template",
6
  "0.3.1": "add figures of workflow and metrics, add invert transform",
7
  "0.3.0": "update dataset processing",
 
1
  {
2
  "schema": "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/meta_schema_20220324.json",
3
+ "version": "0.4.0",
4
  "changelog": {
5
+ "0.4.0": "add support for multi-GPU training and evaluation",
6
  "0.3.2": "restructure readme to match updated template",
7
  "0.3.1": "add figures of workflow and metrics, add invert transform",
8
  "0.3.0": "update dataset processing",
configs/multi_gpu_evaluate.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "device": "$torch.device(f'cuda:{dist.get_rank()}')",
3
+ "network": {
4
+ "_target_": "torch.nn.parallel.DistributedDataParallel",
5
+ "module": "$@network_def.to(@device)",
6
+ "device_ids": [
7
+ "@device"
8
+ ]
9
+ },
10
+ "validate#sampler": {
11
+ "_target_": "DistributedSampler",
12
+ "dataset": "@validate#dataset",
13
+ "even_divisible": false,
14
+ "shuffle": false
15
+ },
16
+ "validate#dataloader#sampler": "@validate#sampler",
17
+ "validate#handlers#1#_disabled_": "$dist.get_rank() > 0",
18
+ "evaluating": [
19
+ "$import torch.distributed as dist",
20
+ "$dist.init_process_group(backend='nccl')",
21
+ "$torch.cuda.set_device(@device)",
22
+ "$setattr(torch.backends.cudnn, 'benchmark', True)",
23
+ "$import logging",
24
+ "$@validate#evaluator.logger.setLevel(logging.WARNING if dist.get_rank() > 0 else logging.INFO)",
25
+ "$@validate#evaluator.run()",
26
+ "$dist.destroy_process_group()"
27
+ ]
28
+ }
configs/multi_gpu_train.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "device": "$torch.device(f'cuda:{dist.get_rank()}')",
3
+ "network": {
4
+ "_target_": "torch.nn.parallel.DistributedDataParallel",
5
+ "module": "$@network_def.to(@device)",
6
+ "device_ids": [
7
+ "@device"
8
+ ],
9
+ "find_unused_parameters": true
10
+ },
11
+ "train#sampler": {
12
+ "_target_": "DistributedSampler",
13
+ "dataset": "@train#dataset",
14
+ "even_divisible": true,
15
+ "shuffle": true
16
+ },
17
+ "train#dataloader#sampler": "@train#sampler",
18
+ "train#dataloader#shuffle": false,
19
+ "train#trainer#train_handlers": "$@train#handlers[: -2 if dist.get_rank() > 0 else None]",
20
+ "validate#sampler": {
21
+ "_target_": "DistributedSampler",
22
+ "dataset": "@validate#dataset",
23
+ "even_divisible": false,
24
+ "shuffle": false
25
+ },
26
+ "validate#dataloader#sampler": "@validate#sampler",
27
+ "validate#evaluator#val_handlers": "$None if dist.get_rank() > 0 else @validate#handlers",
28
+ "training": [
29
+ "$import torch.distributed as dist",
30
+ "$dist.init_process_group(backend='nccl')",
31
+ "$torch.cuda.set_device(@device)",
32
+ "$monai.utils.set_determinism(seed=123)",
33
+ "$setattr(torch.backends.cudnn, 'benchmark', True)",
34
+ "$import logging",
35
+ "$@train#trainer.logger.setLevel(logging.WARNING if dist.get_rank() > 0 else logging.INFO)",
36
+ "$@validate#evaluator.logger.setLevel(logging.WARNING if dist.get_rank() > 0 else logging.INFO)",
37
+ "$@train#trainer.run()",
38
+ "$dist.destroy_process_group()"
39
+ ]
40
+ }
docs/README.md CHANGED
@@ -51,12 +51,25 @@ For more detailed usage instructions, visit the [MONAI Bundle Configuration Page]
51
  python -m monai.bundle run training --meta_file configs/metadata.json --config_file configs/train.json --logging_file configs/logging.conf
52
  ```
53
 
 
 
 
 
 
 
 
 
54
  #### Override the `train` config to execute evaluation with the trained model:
55
 
56
  ```
57
  python -m monai.bundle run evaluating --meta_file configs/metadata.json --config_file "['configs/train.json','configs/evaluate.json']" --logging_file configs/logging.conf
58
  ```
59
 
 
 
 
 
 
60
  #### Execute inference:
61
 
62
  ```
 
51
  python -m monai.bundle run training --meta_file configs/metadata.json --config_file configs/train.json --logging_file configs/logging.conf
52
  ```
53
 
54
+ #### Override the `train` config to execute multi-GPU training:
55
+
56
+ ```
57
+ torchrun --standalone --nnodes=1 --nproc_per_node=2 -m monai.bundle run training --meta_file configs/metadata.json --config_file "['configs/train.json','configs/multi_gpu_train.json']" --logging_file configs/logging.conf
58
+ ```
59
+
60
+ Please note that the distributed training-related options depend on the actual running environment; thus, users may need to remove `--standalone`, modify `--nnodes`, or make other necessary changes according to the machine used. For more details, please refer to [PyTorch's official tutorial](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html).
61
+
62
  #### Override the `train` config to execute evaluation with the trained model:
63
 
64
  ```
65
  python -m monai.bundle run evaluating --meta_file configs/metadata.json --config_file "['configs/train.json','configs/evaluate.json']" --logging_file configs/logging.conf
66
  ```
67
 
68
+ #### Override the `train` config and `evaluate` config to execute multi-GPU evaluation:
69
+
70
+ ```
71
+ torchrun --standalone --nnodes=1 --nproc_per_node=2 -m monai.bundle run evaluating --meta_file configs/metadata.json --config_file "['configs/train.json','configs/evaluate.json','configs/multi_gpu_evaluate.json']" --logging_file configs/logging.conf
72
+ ```
73
  #### Execute inference:
74
 
75
  ```