Spaces:

henry000
/

YOLO

Running

henry000 committed on Jun 11, 2024

Commit

7d7e199

1 Parent(s): bce644c

🚧 [WIP] DDP for model training

Files changed (4) hide show

yolo/lazy.py CHANGED Viewed

@@ -14,19 +14,18 @@ from yolo.tools.solver import ModelTester, ModelTrainer
 from yolo.utils.bounding_box_utils import Vec2Box
 from yolo.utils.deploy_utils import FastModelLoader
 from yolo.utils.logging_utils import ProgressLogger
 @hydra.main(config_path="config", config_name="config", version_base=None)
 def main(cfg: Config):
     progress = ProgressLogger(cfg, exp_name=cfg.name)
     dataloader = create_dataloader(cfg.task.data, cfg.dataset, cfg.task.task)
-    device = torch.device(cfg.device)
     if getattr(cfg.task, "fast_inference", False):
-        model = FastModelLoader(cfg, device).load_model()
-        device = torch.device(cfg.device)
     else:
-        model = create_model(cfg.model, class_num=cfg.class_num, weight_path=cfg.weight, device=device)
     vec2box = Vec2Box(model, cfg.image_size, device)
     if cfg.task.task == "train":

 from yolo.utils.bounding_box_utils import Vec2Box
 from yolo.utils.deploy_utils import FastModelLoader
 from yolo.utils.logging_utils import ProgressLogger
+from yolo.utils.model_utils import send_to_device
 @hydra.main(config_path="config", config_name="config", version_base=None)
 def main(cfg: Config):
     progress = ProgressLogger(cfg, exp_name=cfg.name)
     dataloader = create_dataloader(cfg.task.data, cfg.dataset, cfg.task.task)
     if getattr(cfg.task, "fast_inference", False):
+        model = FastModelLoader(cfg).load_model()
     else:
+        model = create_model(cfg.model, class_num=cfg.class_num, weight_path=cfg.weight)
+    device, model = send_to_device(model, cfg.device)
     vec2box = Vec2Box(model, cfg.image_size, device)
     if cfg.task.task == "train":

yolo/model/yolo.py CHANGED Viewed

@@ -4,7 +4,7 @@ from typing import Dict, List, Optional, Union
 import torch
 from loguru import logger
 from omegaconf import ListConfig, OmegaConf
-from torch import device, nn
 from yolo.config.config import Config, ModelConfig, YOLOLayer
 from yolo.tools.dataset_preparation import prepare_weight
@@ -117,7 +117,7 @@ class YOLO(nn.Module):
             raise ValueError(f"Unsupported layer type: {layer_type}")
-def create_model(model_cfg: ModelConfig, weight_path: Optional[str], device: device, class_num: int = 80) -> YOLO:
     """Constructs and returns a model from a Dictionary configuration file.
     Args:
@@ -134,9 +134,10 @@ def create_model(model_cfg: ModelConfig, weight_path: Optional[str], device: dev
             logger.info(f"🌐 Weight {weight_path} not found, try downloading")
             prepare_weight(weight_path=weight_path)
         if os.path.exists(weight_path):
-            model.model.load_state_dict(torch.load(weight_path, map_location=device), strict=False)
             logger.info("✅ Success load model weight")
     log_model_structure(model.model)
     draw_model(model=model)
-    return model.to(device)

 import torch
 from loguru import logger
 from omegaconf import ListConfig, OmegaConf
+from torch import nn
 from yolo.config.config import Config, ModelConfig, YOLOLayer
 from yolo.tools.dataset_preparation import prepare_weight
             raise ValueError(f"Unsupported layer type: {layer_type}")
+def create_model(model_cfg: ModelConfig, weight_path: Optional[str], class_num: int = 80) -> YOLO:
     """Constructs and returns a model from a Dictionary configuration file.
     Args:
             logger.info(f"🌐 Weight {weight_path} not found, try downloading")
             prepare_weight(weight_path=weight_path)
         if os.path.exists(weight_path):
+            # TODO: fix map_location
+            model.model.load_state_dict(torch.load(weight_path), strict=False)
             logger.info("✅ Success load model weight")
     log_model_structure(model.model)
     draw_model(model=model)
+    return model

yolo/utils/deploy_utils.py CHANGED Viewed

@@ -9,9 +9,8 @@ from yolo.model.yolo import create_model
 class FastModelLoader:
-    def __init__(self, cfg: Config, device):
         self.cfg = cfg
-        self.device = device
         self.compiler = cfg.task.fast_inference
         self._validate_compiler()
         self.model_path = f"{os.path.splitext(cfg.weight)[0]}.{self.compiler}"

 class FastModelLoader:
+    def __init__(self, cfg: Config):
         self.cfg = cfg
         self.compiler = cfg.task.fast_inference
         self._validate_compiler()
         self.model_path = f"{os.path.splitext(cfg.weight)[0]}.{self.compiler}"

yolo/utils/model_utils.py CHANGED Viewed

@@ -1,6 +1,10 @@
-from typing import Any, Dict, Type
 import torch
 from torch.optim import Optimizer
 from torch.optim.lr_scheduler import LambdaLR, SequentialLR, _LRScheduler
@@ -67,3 +71,31 @@ def create_scheduler(optimizer: Optimizer, schedule_cfg: SchedulerConfig) -> _LR
         warmup_schedule = LambdaLR(optimizer, lr_lambda=[lambda1, lambda2, lambda1])
         schedule = SequentialLR(optimizer, schedulers=[warmup_schedule, schedule], milestones=[2])
     return schedule

+from typing import Any, Dict, List, Type, Union
 import torch
+import torch.distributed as dist
+from omegaconf import ListConfig
+from torch import nn
+from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.optim import Optimizer
 from torch.optim.lr_scheduler import LambdaLR, SequentialLR, _LRScheduler
         warmup_schedule = LambdaLR(optimizer, lr_lambda=[lambda1, lambda2, lambda1])
         schedule = SequentialLR(optimizer, schedulers=[warmup_schedule, schedule], milestones=[2])
     return schedule
def get_device() -> torch.device:
    """Pick the preferred torch device available on this machine.

    The preference order is CUDA, then Apple MPS, then CPU.

    Returns:
        ``torch.device("cuda")`` when ``torch.cuda.is_available()`` is true,
        otherwise ``torch.device("mps")`` when the MPS backend exists and
        reports itself available, otherwise ``torch.device("cpu")``.
    """
    if torch.cuda.is_available():
        return torch.device("cuda")

    # Older PyTorch builds do not expose ``torch.backends.mps`` at all, so look
    # it up defensively instead of risking an AttributeError.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device("mps")

    return torch.device("cpu")
def send_to_device(model: nn.Module, device: Union[str, int, List[int]]):
    """Place ``model`` on ``device``, wrapping it in DDP when a device list is given.

    Args:
        model: The module to move.
        device: Either a single device spec that ``torch.device`` accepts
            (for example ``"cpu"``, ``"cuda:0"`` or ``0``), or a list /
            ``ListConfig`` of GPU indices. A list selects the distributed
            (DistributedDataParallel) path.

    Returns:
        A ``(device, model)`` tuple. For a single device spec the model is
        simply moved there. For a device list the model is wrapped in
        ``DistributedDataParallel`` and ``device`` is the device this process
        runs on.

    Note:
        The distributed path uses the default ``env://`` rendezvous of
        ``torch.distributed.init_process_group``, so it presumably needs to be
        started by a launcher such as ``torchrun`` that exports ``RANK``,
        ``WORLD_SIZE``, ``MASTER_ADDR`` and ``MASTER_PORT`` - confirm against
        the project's launch scripts.
    """
    import os  # function-scope import: the module header does not import ``os``

    # Scalar specs are recognised first; anything else that is not list-like
    # also takes the single-device path, exactly as before.
    is_scalar_spec = isinstance(device, (str, int, torch.device))
    if is_scalar_spec or not isinstance(device, (list, ListConfig)):
        device = torch.device(device)
        return device, model.to(device)

    use_cuda = torch.cuda.is_available()

    # The process group must exist *before* rank / world size can be queried.
    # NCCL is the backend intended for CUDA tensors; Gloo covers CPU-only runs.
    if not dist.is_initialized():
        dist.init_process_group(backend="nccl" if use_cuda else "gloo")
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    print(f"Initialized process group; rank: {rank}, size: {world_size}")

    if not use_cuda:
        device = torch.device("cpu")
        return device, DDP(model.to(device))

    # Give every process its own GPU. LOCAL_RANK is the per-node index set by
    # torchrun; fall back to the global rank when it is absent.
    local_rank = int(os.environ.get("LOCAL_RANK", rank))
    gpu_ids = list(device)
    # NOTE(review): assumes the list holds CUDA device indices - confirm with the config schema.
    gpu_index = int(gpu_ids[local_rank % len(gpu_ids)]) if gpu_ids else local_rank

    device = torch.device("cuda", gpu_index)
    torch.cuda.set_device(device)
    # The module is moved once, before wrapping; DDP then keeps it on that GPU.
    model = DDP(model.to(device), device_ids=[gpu_index])
    return device, model