# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
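"""Helpers for the pipe benchmark: argument parsing, construction of the
wikitext2 transformer language model and its optimizer, and dataset plumbing."""
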
import argparse
import logging
import operator
from functools import reduce

import numpy as np
import torch
from torch.optim import Adam

# `LazyModule` is required by --lazy-construction below; in fairscale it is
# exported from `fairscale.nn.pipe` (assumed here, since this module uses it
# without an import).
from fairscale.nn.pipe import LazyModule

import datasets.wikitext2_data as wikitext2_data
from models import transformer_lm


def init_random_seed(seed: int):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
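
# Seeding torch (CPU and CUDA) and numpy together keeps repeated benchmark runs
# comparable; call this once before building the model, e.g. init_random_seed(0).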


def init_args():
    parser = argparse.ArgumentParser(description="benchmark")
    parser.add_argument("--host", "-o", type=str, default="localhost", help="hostname")
    parser.add_argument("--chunks", type=int, default=1, help="number of microbatches per batch")
    parser.add_argument("--batch-size", type=int, default=8, help="size of a batch")
    parser.add_argument(
        "--checkpoint",
        default="never",
        choices=["always", "except_last", "never"],
        help="Checkpointing strategy for pipe",
    )
    parser.add_argument(
        "--lazy-construction",
        action="store_true",
        default=False,
        help="Construct the model layers lazily (deferred instantiation per pipeline stage)",
    )
    parser.add_argument("--max-batch", type=int, default=4, help="Max number of batches")
    parser.add_argument("--use_synthetic_data", action="store_true", help="Uses synthetic data for running benchmarks.")
    parser.add_argument("--dry_run", action="store_true", help="Run a sample training run without regression testing.")
    parser.add_argument(
        # TODO(anj-s): In the process of adding more models and hence the requirement for a flag.
        "--model_name",
        default="lm",
        help="Language model (LM) used to benchmark nn.pipe.",
    )
    parser.add_argument("--debug", action="store_true", default=False, help="Display additional debug information")
    args = parser.parse_args()
    return args
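
# Usage sketch (illustrative; not executed by this module). The helpers below
# expect a `config_class` exposing `get_benchmark_config()` and
# `get_model_config()`:
#
#   args = init_args()
#   benchmark_config = create_benchmark_config(args.model_name, config_class)
#   model_specs = get_model_specs(args.model_name, config_class)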


def create_benchmark_config(model_name, config_class):
    """Return a dict with configurations required for benchmarking the `model_name` model."""
    if model_name == "lm":
        return config_class.get_benchmark_config()
    else:
        raise RuntimeError("Unrecognized args.model_name %s" % model_name)


def get_model_specs(model_name, config_class):
    """Return a dict with configurations required for configuring the `model_name` model."""
    if model_name == "lm":
        return config_class.get_model_config()
    else:
        raise RuntimeError("Unrecognized args.model_name %s" % model_name)


def create_model_config(args, benchmark_config=None, model_specs=None, device=None):
    """Return a dict with the instantiated model, optimizer and dataset info."""
    if not device:
        device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    dataset_info = get_dataset_info(args)
    assert model_specs is not None
    model_specs["vocab_size"] = dataset_info.ntokens
    model, optimizer = get_model_and_optimizer(args, device, benchmark_config, model_specs)
    return {
        "model": model,
        "optimizer": optimizer,
        "dataset_info": dataset_info,
    }


def get_model_and_optimizer(args, device, benchmark_config, model_config):
    """Return the instantiated model and an optimizer factory."""
    if args.model_name == "lm":
        model = get_lm_model(args, device, model_config)
    else:
        raise RuntimeError("Unrecognized args.model_name %s" % args.model_name)

    lr = benchmark_config["lr"]

    def make_adam(params):
        return Adam(params, lr=lr)

    optimizer = make_adam
    return model, optimizer
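
# The optimizer comes back as a factory (`make_adam`) rather than an instance,
# so the caller can construct it only after the model has been wrapped (e.g.
# by a pipeline) and its parameters placed on their final devices. A minimal
# sketch of the intended call site (names illustrative):
#
#   model, make_optimizer = get_model_and_optimizer(args, device, benchmark_config, model_specs)
#   optimizer = make_optimizer(model.parameters())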


def get_lm_model(args, device, config):
    """Get the language model (based on GPT-2) used for sequence prediction."""
    ninp = config["ninp"]
    nhead = config["nhead"]
    initrange = config["initrange"]
    dropout = config["dropout"]
    vocab_size = config["vocab_size"]
    nhid = config["nhid"]
    ndecoder = config["num_decoder_layers"]
    is_moe = config.get("is_moe", False)
    num_local_experts = config.get("num_local_experts", 1)

    if args.lazy_construction:
        # Defer layer instantiation: return a list of LazyModule wrappers so
        # each pipeline stage can build its layers on its own device.
        layers = [
            LazyModule(lambda: transformer_lm.EmbeddingLayer(vocab_size, ninp, initrange)),
            LazyModule(lambda: transformer_lm.PositionalEncodingLayer(ninp, dropout)),
        ]
        for _ in range(ndecoder):
            layers.append(
                LazyModule(
                    lambda: transformer_lm.TransformerDecoderLayer(
                        ninp, nhead, nhid, dropout, is_moe, num_local_experts
                    )
                )
            )
        layers.append(LazyModule(lambda: transformer_lm.LinearLayer(ninp, vocab_size, initrange)))
        model = layers
    else:
        model = transformer_lm.TransformerLM(
            vocab_size, ninp, nhead, nhid, dropout, initrange, ndecoder, is_moe, num_local_experts
        ).to(device)

    return model
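
# A minimal, hypothetical `config` accepted by `get_lm_model` (real values come
# from `config_class.get_model_config()`; `vocab_size` is filled in from the
# dataset by `create_model_config`):
#
#   {
#       "ninp": 2048, "nhead": 32, "nhid": 4096, "num_decoder_layers": 10,
#       "dropout": 0.0, "initrange": 0.1, "vocab_size": 10000,
#   }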


def log_number_of_parameters(model, logger=None):
    if not logger:
        logger = logging
    # Total parameter count; equivalent to sum(p.numel() for p in model.parameters()).
    num_params = reduce(operator.add, (reduce(operator.mul, x.size()) for x in model.parameters()))
    if hasattr(model, "group"):
        total = torch.Tensor([num_params])
        if torch.cuda.is_available():
            total = total.cuda()
        # Aggregate per-rank counts so rank 0 can report the global total.
        torch.distributed.all_reduce(total, group=model.group)
        logger.debug(
            f"training model, #params = {num_params}, group: {model.group.rank()}, grank:"
            f" {torch.distributed.get_rank()}, sizes {model.group.size()}"
        )
        torch.distributed.barrier()
        if model.group.rank() == 0:
            logger.debug(f"total #params = {total.item()}")
    else:
        logger.debug(f"training model, #params = {num_params}")


def get_dataset_info(args):
    assert args.model_name == "lm"
    if args.use_synthetic_data:
        return wikitext2_data.get_synthetic_datasets()
    else:
        return wikitext2_data.get_real_datasets()


def get_data_loader(dataset_info, args, benchmark_config, model_specs, num_replicas=1, rank=0):
    return wikitext2_data.get_dataloaders(dataset_info, benchmark_config, model_specs, num_replicas, rank)
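

if __name__ == "__main__":
    # Smoke-test sketch, not part of the benchmark harness. `_DemoConfig` is a
    # hypothetical stand-in for the golden-config class the real benchmarks
    # pass in; it only needs `get_benchmark_config()` / `get_model_config()`.
    class _DemoConfig:
        @staticmethod
        def get_benchmark_config():
            return {"lr": 1e-3}

        @staticmethod
        def get_model_config():
            return {
                "ninp": 64,
                "nhead": 2,
                "nhid": 128,
                "num_decoder_layers": 2,
                "dropout": 0.0,
                "initrange": 0.1,
            }

    args = init_args()
    init_random_seed(0)
    benchmark_config = create_benchmark_config(args.model_name, _DemoConfig)
    model_specs = get_model_specs(args.model_name, _DemoConfig)
    model_config = create_model_config(args, benchmark_config=benchmark_config, model_specs=model_specs)
    log_number_of_parameters(model_config["model"])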