monai
medical
katielink committed on
Commit
9754d0b
·
1 Parent(s): b27ec38

fix the wrong GPU index issue of multi-node

Browse files
configs/inference.json CHANGED
@@ -9,7 +9,6 @@
9
  "test_json": "$@bundle_root+'/label/test_samples.json'",
10
  "test_fp": "$open(@test_json,'r', encoding='utf8')",
11
  "test_dict": "$json.load(@test_fp)",
12
- "test_close": "$@test_fp.close()",
13
  "device": "$torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')",
14
  "network_def": {
15
  "_target_": "SEResNet50",
@@ -110,5 +109,8 @@
110
  ],
111
  "run": [
112
 
 
 
113
  ]
114
  }
 
9
  "test_json": "$@bundle_root+'/label/test_samples.json'",
10
  "test_fp": "$open(@test_json,'r', encoding='utf8')",
11
  "test_dict": "$json.load(@test_fp)",
 
12
  "device": "$torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')",
13
  "network_def": {
14
  "_target_": "SEResNet50",
 
109
  ],
110
  "run": [
111
112
+ ],
113
+ "finalize": [
114
+ "$@test_fp.close()"
115
  ]
116
  }
configs/metadata.json CHANGED
@@ -1,7 +1,8 @@
1
  {
2
  "schema": "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/meta_schema_20220324.json",
3
- "version": "0.4.3",
4
  "changelog": {
 
5
  "0.4.3": "add dataset dir example",
6
  "0.4.2": "update ONNX-TensorRT descriptions",
7
  "0.4.1": "update the model weights with the deterministic training",
@@ -22,7 +23,7 @@
22
  "0.1.0": "complete the first version model package",
23
  "0.0.1": "initialize the model package structure"
24
  },
25
- "monai_version": "1.2.0rc6",
26
  "pytorch_version": "1.13.1",
27
  "numpy_version": "1.22.2",
28
  "optional_packages_version": {
 
1
  {
2
  "schema": "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/meta_schema_20220324.json",
3
+ "version": "0.4.4",
4
  "changelog": {
5
+ "0.4.4": "fix the wrong GPU index issue of multi-node",
6
  "0.4.3": "add dataset dir example",
7
  "0.4.2": "update ONNX-TensorRT descriptions",
8
  "0.4.1": "update the model weights with the deterministic training",
 
23
  "0.1.0": "complete the first version model package",
24
  "0.0.1": "initialize the model package structure"
25
  },
26
+ "monai_version": "1.2.0",
27
  "pytorch_version": "1.13.1",
28
  "numpy_version": "1.22.2",
29
  "optional_packages_version": {
configs/multi_gpu_train.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "device": "$torch.device(f'cuda:{dist.get_rank()}')",
3
  "network": {
4
  "_target_": "torch.nn.parallel.DistributedDataParallel",
5
  "module": "$@network_def.to(@device)",
@@ -34,6 +34,8 @@
34
  "$@train#trainer.run()"
35
  ],
36
  "finalize": [
37
- "$dist.is_initialized() and dist.destroy_process_group()"
 
 
38
  ]
39
  }
 
1
  {
2
+ "device": "$torch.device('cuda:' + os.environ['LOCAL_RANK'])",
3
  "network": {
4
  "_target_": "torch.nn.parallel.DistributedDataParallel",
5
  "module": "$@network_def.to(@device)",
 
34
  "$@train#trainer.run()"
35
  ],
36
  "finalize": [
37
+ "$dist.is_initialized() and dist.destroy_process_group()",
38
+ "$@train_fp.close()",
39
+ "$@val_fp.close()"
40
  ]
41
  }
configs/train.json CHANGED
@@ -2,7 +2,8 @@
2
  "imports": [
3
  "$import torch",
4
  "$import json",
5
- "$import ignite"
 
6
  ],
7
  "bundle_root": ".",
8
  "ckpt_dir": "$@bundle_root + '/models'",
@@ -12,11 +13,9 @@
12
  "val_json": "$@bundle_root+'/label/val_samples.json'",
13
  "train_fp": "$open(@train_json,'r', encoding='utf8')",
14
  "train_dict": "$json.load(@train_fp)",
15
- "train_close": "$@train_fp.close()",
16
  "val_fp": "$open(@val_json,'r', encoding='utf8')",
17
  "val_dict": "$json.load(@val_fp)",
18
  "val_interval": 1,
19
- "val_close": "$@val_fp.close()",
20
  "device": "$torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')",
21
  "network_def": {
22
  "_target_": "SEResNet50",
@@ -256,5 +255,9 @@
256
  ],
257
  "run": [
258
  "$@train#trainer.run()"
 
 
 
 
259
  ]
260
  }
 
2
  "imports": [
3
  "$import torch",
4
  "$import json",
5
+ "$import ignite",
6
+ "$import os"
7
  ],
8
  "bundle_root": ".",
9
  "ckpt_dir": "$@bundle_root + '/models'",
 
13
  "val_json": "$@bundle_root+'/label/val_samples.json'",
14
  "train_fp": "$open(@train_json,'r', encoding='utf8')",
15
  "train_dict": "$json.load(@train_fp)",
 
16
  "val_fp": "$open(@val_json,'r', encoding='utf8')",
17
  "val_dict": "$json.load(@val_fp)",
18
  "val_interval": 1,
 
19
  "device": "$torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')",
20
  "network_def": {
21
  "_target_": "SEResNet50",
 
255
  ],
256
  "run": [
257
  "$@train#trainer.run()"
258
+ ],
259
+ "finalize": [
260
+ "$@train_fp.close()",
261
+ "$@val_fp.close()"
262
  ]
263
  }