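"""Training entry point for Hawk.

Parses a YAML config, initializes distributed training and seeding, builds the
task, datasets, and model, and hands them to the configured runner.

Illustrative launch (the script name, GPU count, and config path depend on the
local setup):

    torchrun --nproc_per_node=4 train.py --cfg-path train_configs/visionbranch_stage2_finetune.yaml
"""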
# import debugpy
# import torch.distributed as dist
# import os

# # Determine the rank of the current process
# rank = int(os.environ.get("RANK", 0))

# # Attach debugger to a specific rank, e.g., rank 0
# if rank == 0:
#     debugpy.listen(("localhost", 10002))  # Choose an available port
#     print("Waiting for debugger attach...")
#     debugpy.wait_for_client()
#     print("Debugger attached, continuing execution...")
import argparse
import os
import random
import sys
# Get the directory of the current file
# current_dir = os.path.dirname(os.path.abspath(__file__))
# print(current_dir)
# sys.path.append(current_dir)
import numpy as np
import torch
import torch.backends.cudnn as cudnn
import hawk.tasks as tasks
from hawk.common.config import Config
from hawk.common.dist_utils import get_rank, init_distributed_mode
from hawk.common.logger import setup_logger
from hawk.common.optims import (
LinearWarmupCosineLRScheduler,
LinearWarmupStepLRScheduler,
)
from hawk.common.registry import registry
from hawk.common.utils import now
# imports modules for registration
from hawk.datasets.builders import *
from hawk.models import *
from hawk.processors import *
from hawk.runners import *
from hawk.tasks import *
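# The wildcard imports above are kept for their side effects: each module
# registers its dataset builders, models, processors, runners, and tasks with
# the shared registry so they can be looked up by name from the YAML config.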


def parse_args():
    parser = argparse.ArgumentParser(description="Training")
    parser.add_argument(
        "--cfg-path",
        required=False,
        default="/remote-home/share/jiaqitang/Hawk/train_configs/visionbranch_stage2_finetune.yaml",
        help="path to configuration file.",
    )
    parser.add_argument(
        "--options",
        nargs="+",
        help="override some settings in the used config; key-value pairs "
        "in xxx=yyy format are merged into the config file (deprecated, "
        "use --cfg-options instead).",
    )

    args = parser.parse_args()
    # if 'LOCAL_RANK' not in os.environ:
    #     os.environ['LOCAL_RANK'] = str(args.local_rank)

    return args


def setup_seeds(config):
    # Offset the base seed by the process rank so each worker gets a distinct seed.
    seed = config.run_cfg.seed + get_rank()

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Trade cuDNN autotuning for deterministic, reproducible kernels.
    cudnn.benchmark = False
    cudnn.deterministic = True


def get_runner_class(cfg):
    """
    Get runner class from config. Default to epoch-based runner.
    """
    runner_cls = registry.get_runner_class(cfg.run_cfg.get("runner", "runner_base"))

    return runner_cls
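
# "runner_base" is the registry key for the default epoch-based runner; a config
# may select a different runner by name, provided it was registered by one of
# the wildcard imports above.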


def main():
    # Allow auto-dl (automatic dataset download) to complete on the main process
    # without timing out when using the NCCL backend.
    # os.environ["NCCL_BLOCKING_WAIT"] = "1"

    # Set before init_distributed_mode() so the same job_id is shared across all ranks.
    job_id = now()

    cfg = Config(parse_args())

    init_distributed_mode(cfg.run_cfg)

    setup_seeds(cfg)

    # Set up after init_distributed_mode() so that logging happens only on the master rank.
    setup_logger()

    cfg.pretty_print()

    task = tasks.setup_task(cfg)
    datasets = task.build_datasets(cfg)
    # e.g. datasets['webvid']['train'][0] can be inspected here for debugging.
    model = task.build_model(cfg)

    runner = get_runner_class(cfg)(
        cfg=cfg, job_id=job_id, task=task, model=model, datasets=datasets
    )
    runner.train()


if __name__ == "__main__":
    main()