"""Megatron global variables."""

import os
import sys
import time
from functools import reduce
import operator

import torch

from megatron import dist_signal_handler
from megatron.tokenizer import build_tokenizer
from .microbatches import build_num_microbatches_calculator

_GLOBAL_ARGS = None
_GLOBAL_NUM_MICROBATCHES_CALCULATOR = None
_GLOBAL_TOKENIZER = None
_GLOBAL_TENSORBOARD_WRITER = None
_GLOBAL_ADLR_AUTORESUME = None
_GLOBAL_TIMERS = None
_GLOBAL_SIGNAL_HANDLER = None
_GLOBAL_MEMORY_BUFFER = None


def get_args():
    """Return arguments."""
    _ensure_var_is_initialized(_GLOBAL_ARGS, 'args')
    return _GLOBAL_ARGS


def get_num_microbatches():
    """Return the current number of microbatches."""
    return _GLOBAL_NUM_MICROBATCHES_CALCULATOR.get()


def get_current_global_batch_size():
    """Return the global batch size for the current iteration."""
    return _GLOBAL_NUM_MICROBATCHES_CALCULATOR.get_current_global_batch_size()


def update_num_microbatches(consumed_samples, consistency_check=True):
    """Update the microbatch calculator from the number of consumed samples."""
    _GLOBAL_NUM_MICROBATCHES_CALCULATOR.update(consumed_samples,
                                               consistency_check)
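
# Illustrative use of the microbatch calculator accessors above (a sketch, not
# executed here; `consumed_samples` stands for whatever sample counter the
# training loop maintains):
#
#   update_num_microbatches(consumed_samples)
#   num_microbatches = get_num_microbatches()
#   global_batch_size = get_current_global_batch_size()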


def get_tokenizer():
    """Return tokenizer."""
    _ensure_var_is_initialized(_GLOBAL_TOKENIZER, 'tokenizer')
    return _GLOBAL_TOKENIZER


def get_tensorboard_writer():
    """Return tensorboard writer. It can be None so no need
    to check if it is initialized."""
    return _GLOBAL_TENSORBOARD_WRITER


def get_adlr_autoresume():
    """Return ADLR autoresume object. It can be None so no need
    to check if it is initialized."""
    return _GLOBAL_ADLR_AUTORESUME


def get_timers():
    """Return timers."""
    _ensure_var_is_initialized(_GLOBAL_TIMERS, 'timers')
    return _GLOBAL_TIMERS


def get_signal_handler():
    """Return the distributed signal handler."""
    _ensure_var_is_initialized(_GLOBAL_SIGNAL_HANDLER, 'signal handler')
    return _GLOBAL_SIGNAL_HANDLER


def get_global_memory_buffer():
    """Return the global memory buffer."""
    _ensure_var_is_initialized(_GLOBAL_MEMORY_BUFFER, 'global memory buffer')
    return _GLOBAL_MEMORY_BUFFER


def _set_signal_handler():
    global _GLOBAL_SIGNAL_HANDLER
    _ensure_var_is_not_initialized(_GLOBAL_SIGNAL_HANDLER, 'signal handler')
    # Enter the handler's context manager directly (rather than via a `with`
    # block) so it remains active after this function returns.
    _GLOBAL_SIGNAL_HANDLER = dist_signal_handler.DistributedSignalHandler().__enter__()


def set_global_variables(args):
    """Set args, tokenizer, tensorboard-writer, adlr-autoresume, and timers."""

    assert args is not None

    _ensure_var_is_not_initialized(_GLOBAL_ARGS, 'args')
    set_args(args)

    _build_num_microbatches_calculator(args)
    if args.vocab_file:
        _ = _build_tokenizer(args)
    _set_tensorboard_writer(args)
    _set_adlr_autoresume(args)
    _set_timers()
    _set_global_memory_buffer()

    if args.exit_signal_handler:
        _set_signal_handler()
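
# Illustrative initialization sequence (a sketch; `args` is assumed to be the
# namespace produced by Megatron's argument parsing, which is not shown here):
#
#   set_global_variables(args)
#   args = get_args()
#   tokenizer = get_tokenizer()  # only available when args.vocab_file is set
#   timers = get_timers()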


def set_args(args):
    global _GLOBAL_ARGS
    _GLOBAL_ARGS = args


def _build_num_microbatches_calculator(args):
    global _GLOBAL_NUM_MICROBATCHES_CALCULATOR
    _ensure_var_is_not_initialized(_GLOBAL_NUM_MICROBATCHES_CALCULATOR,
                                   'num microbatches calculator')

    _GLOBAL_NUM_MICROBATCHES_CALCULATOR = build_num_microbatches_calculator(
        args)


def _build_tokenizer(args):
    """Initialize tokenizer."""
    global _GLOBAL_TOKENIZER
    _ensure_var_is_not_initialized(_GLOBAL_TOKENIZER, 'tokenizer')
    _GLOBAL_TOKENIZER = build_tokenizer(args)
    return _GLOBAL_TOKENIZER


def rebuild_tokenizer(args):
    global _GLOBAL_TOKENIZER
    _GLOBAL_TOKENIZER = None
    return _build_tokenizer(args)


def _set_tensorboard_writer(args):
    """Set tensorboard writer."""
    global _GLOBAL_TENSORBOARD_WRITER
    _ensure_var_is_not_initialized(_GLOBAL_TENSORBOARD_WRITER,
                                   'tensorboard writer')

    if hasattr(args, 'tensorboard_dir') and \
       args.tensorboard_dir and args.rank == (args.world_size - 1):
        try:
            from torch.utils.tensorboard import SummaryWriter
            print('> setting tensorboard ...')
            _GLOBAL_TENSORBOARD_WRITER = SummaryWriter(
                log_dir=args.tensorboard_dir,
                max_queue=args.tensorboard_queue_size)
        except ModuleNotFoundError:
            print('WARNING: TensorBoard writing requested but is not '
                  'available (are you using PyTorch 1.1.0 or later?), '
                  'no TensorBoard logs will be written.', flush=True)


def _set_adlr_autoresume(args):
    """Initialize ADLR autoresume."""
    global _GLOBAL_ADLR_AUTORESUME
    _ensure_var_is_not_initialized(_GLOBAL_ADLR_AUTORESUME, 'adlr autoresume')

    if args.adlr_autoresume:
        if args.rank == 0:
            print('enabling autoresume ...', flush=True)
        sys.path.append(os.environ.get('SUBMIT_SCRIPTS', '.'))
        try:
            from userlib.auto_resume import AutoResume
        except BaseException:
            print('ADLR autoresume is not available, exiting ...')
            sys.exit()

        _GLOBAL_ADLR_AUTORESUME = AutoResume


def _set_timers():
    """Initialize timers."""
    global _GLOBAL_TIMERS
    _ensure_var_is_not_initialized(_GLOBAL_TIMERS, 'timers')
    _GLOBAL_TIMERS = Timers()


def _set_global_memory_buffer():
    """Initialize the global memory buffer."""
    global _GLOBAL_MEMORY_BUFFER
    _ensure_var_is_not_initialized(_GLOBAL_MEMORY_BUFFER, 'global memory buffer')
    _GLOBAL_MEMORY_BUFFER = GlobalMemoryBuffer()


def _ensure_var_is_initialized(var, name):
    """Make sure the input variable is not None."""
    assert var is not None, '{} is not initialized.'.format(name)


def _ensure_var_is_not_initialized(var, name):
    """Make sure the input variable is None (i.e. not yet initialized)."""
    assert var is None, '{} is already initialized.'.format(name)


class _Timer:
    """Timer."""

    def __init__(self, name):
        self.name_ = name
        self.elapsed_ = 0.0
        self.started_ = False
        self.start_time = time.time()

    def start(self):
        """Start the timer."""
        assert not self.started_, 'timer has already been started'
        torch.cuda.synchronize()
        self.start_time = time.time()
        self.started_ = True

    def stop(self):
        """Stop the timer."""
        assert self.started_, 'timer is not started'
        torch.cuda.synchronize()
        self.elapsed_ += (time.time() - self.start_time)
        self.started_ = False

    def reset(self):
        """Reset timer."""
        self.elapsed_ = 0.0
        self.started_ = False

    def elapsed(self, reset=True):
        """Calculate the elapsed time."""
        started_ = self.started_
        # If the timer is still running, stop it to capture the time so far.
        if self.started_:
            self.stop()
        # Record the accumulated elapsed time.
        elapsed_ = self.elapsed_
        # Reset the accumulated time if requested.
        if reset:
            self.reset()
        # If the timer was running, restart it.
        if started_:
            self.start()
        return elapsed_


class Timers:
    """Group of timers."""

    def __init__(self):
        self.timers = {}

    def __call__(self, name):
        if name not in self.timers:
            self.timers[name] = _Timer(name)
        return self.timers[name]

    def write(self, names, writer, iteration, normalizer=1.0, reset=False):
        """Write timers to a tensorboard writer."""
        # Each timer is written as an individual scalar, normalized by
        # `normalizer`.
        assert normalizer > 0.0
        for name in names:
            value = self.timers[name].elapsed(reset=reset) / normalizer
            writer.add_scalar(name + '-time', value, iteration)

    def log(self, names, normalizer=1.0, reset=True):
        """Log a group of timers."""
        assert normalizer > 0.0
        string = 'time (ms)'
        for name in names:
            elapsed_time = self.timers[name].elapsed(
                reset=reset) * 1000.0 / normalizer
            string += ' | {}: {:.2f}'.format(name, elapsed_time)
        # Print on the last rank when torch.distributed is initialized,
        # otherwise print unconditionally.
        if torch.distributed.is_initialized():
            if torch.distributed.get_rank() == (
                    torch.distributed.get_world_size() - 1):
                print(string, flush=True)
        else:
            print(string, flush=True)
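
# Illustrative timer usage (a sketch; 'forward-compute' is just an example
# name, and the timers synchronize CUDA, so a GPU context is assumed):
#
#   timers = get_timers()
#   timers('forward-compute').start()
#   ...  # work to be timed
#   timers('forward-compute').stop()
#   timers.log(['forward-compute'])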


class GlobalMemoryBuffer:
    """Global buffer to avoid dynamic memory allocations.
    Caller should ensure that buffers of the same name
    are not used concurrently."""

    def __init__(self):
        self.buffer = {}

    def get_tensor(self, tensor_shape, dtype, name):
        required_len = reduce(operator.mul, tensor_shape, 1)
        # Allocate (or grow) the flat backing buffer for this (name, dtype)
        # pair when it is missing or too small.
        if self.buffer.get((name, dtype), None) is None or \
                self.buffer[(name, dtype)].numel() < required_len:
            self.buffer[(name, dtype)] = \
                torch.empty(required_len,
                            dtype=dtype,
                            device=torch.cuda.current_device(),
                            requires_grad=False)

        # Return a view of the requested shape over the backing storage.
        return self.buffer[(name, dtype)][0:required_len].view(*tensor_shape)
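
# Illustrative use of the buffer (a sketch; the shape, dtype, and buffer name
# are made-up example values):
#
#   buf = get_global_memory_buffer()
#   tmp = buf.get_tensor((1024, 1024), torch.float16, 'example-buffer')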