fix the wrong GPU index issue of multi-node

Files changed (4) hide show

configs/inference.json CHANGED Viewed

@@ -9,7 +9,6 @@
     "test_json": "$@bundle_root+'/label/test_samples.json'",
     "test_fp": "$open(@test_json,'r', encoding='utf8')",
     "test_dict": "$json.load(@test_fp)",
-    "test_close": "$@test_fp.close()",
     "device": "$torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')",
     "network_def": {
         "_target_": "SEResNet50",
@@ -110,5 +109,8 @@
     ],
     "run": [
         "$@evaluator.run()"
     ]
 }

     "test_json": "$@bundle_root+'/label/test_samples.json'",
     "test_fp": "$open(@test_json,'r', encoding='utf8')",
     "test_dict": "$json.load(@test_fp)",
     "device": "$torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')",
     "network_def": {
         "_target_": "SEResNet50",
     ],
     "run": [
         "$@evaluator.run()"
+    ],
+    "finalize": [
+        "$@test_fp.close()"
     ]
 }

configs/metadata.json CHANGED Viewed

@@ -1,7 +1,8 @@
 {
     "schema": "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/meta_schema_20220324.json",
-    "version": "0.4.3",
     "changelog": {
         "0.4.3": "add dataset dir example",
         "0.4.2": "update ONNX-TensorRT descriptions",
         "0.4.1": "update the model weights with the deterministic training",
@@ -22,7 +23,7 @@
         "0.1.0": "complete the first version model package",
         "0.0.1": "initialize the model package structure"
     },
-    "monai_version": "1.2.0rc6",
     "pytorch_version": "1.13.1",
     "numpy_version": "1.22.2",
     "optional_packages_version": {

 {
     "schema": "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/meta_schema_20220324.json",
+    "version": "0.4.4",
     "changelog": {
+        "0.4.4": "fix the wrong GPU index issue of multi-node",
         "0.4.3": "add dataset dir example",
         "0.4.2": "update ONNX-TensorRT descriptions",
         "0.4.1": "update the model weights with the deterministic training",
         "0.1.0": "complete the first version model package",
         "0.0.1": "initialize the model package structure"
     },
+    "monai_version": "1.2.0",
     "pytorch_version": "1.13.1",
     "numpy_version": "1.22.2",
     "optional_packages_version": {

configs/multi_gpu_train.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
-    "device": "$torch.device(f'cuda:{dist.get_rank()}')",
     "network": {
         "_target_": "torch.nn.parallel.DistributedDataParallel",
         "module": "$@network_def.to(@device)",
@@ -34,6 +34,8 @@
         "$@train#trainer.run()"
     ],
     "finalize": [
-        "$dist.is_initialized() and dist.destroy_process_group()"
     ]
 }

 {
+    "device": "$torch.device('cuda:' + os.environ['LOCAL_RANK'])",
     "network": {
         "_target_": "torch.nn.parallel.DistributedDataParallel",
         "module": "$@network_def.to(@device)",
         "$@train#trainer.run()"
     ],
     "finalize": [
+        "$dist.is_initialized() and dist.destroy_process_group()",
+        "$@train_fp.close()",
+        "$@val_fp.close()"
     ]
 }

configs/train.json CHANGED Viewed

@@ -2,7 +2,8 @@
     "imports": [
         "$import torch",
         "$import json",
-        "$import ignite"
     ],
     "bundle_root": ".",
     "ckpt_dir": "$@bundle_root + '/models'",
@@ -12,11 +13,9 @@
     "val_json": "$@bundle_root+'/label/val_samples.json'",
     "train_fp": "$open(@train_json,'r', encoding='utf8')",
     "train_dict": "$json.load(@train_fp)",
-    "train_close": "$@train_fp.close()",
     "val_fp": "$open(@val_json,'r', encoding='utf8')",
     "val_dict": "$json.load(@val_fp)",
     "val_interval": 1,
-    "val_close": "$@val_fp.close()",
     "device": "$torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')",
     "network_def": {
         "_target_": "SEResNet50",
@@ -256,5 +255,9 @@
     ],
     "run": [
         "$@train#trainer.run()"
     ]
 }

     "imports": [
         "$import torch",
         "$import json",
+        "$import ignite",
+        "$import os"
     ],
     "bundle_root": ".",
     "ckpt_dir": "$@bundle_root + '/models'",
     "val_json": "$@bundle_root+'/label/val_samples.json'",
     "train_fp": "$open(@train_json,'r', encoding='utf8')",
     "train_dict": "$json.load(@train_fp)",
     "val_fp": "$open(@val_json,'r', encoding='utf8')",
     "val_dict": "$json.load(@val_fp)",
     "val_interval": 1,
     "device": "$torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')",
     "network_def": {
         "_target_": "SEResNet50",
     ],
     "run": [
         "$@train#trainer.run()"
+    ],
+    "finalize": [
+        "$@train_fp.close()",
+        "$@val_fp.close()"
     ]
 }