# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0
"""Implements a Hugging Face T5 wrapped inside a :class:`.ComposerModel`."""
from __future__ import annotations
from typing import Mapping
from composer.metrics.nlp import LanguageCrossEntropy, MaskedAccuracy
from composer.utils import dist
from omegaconf import DictConfig
from transformers import (AutoConfig, PreTrainedTokenizerBase,
T5ForConditionalGeneration)
from llmfoundry.models.hf.hf_fsdp import hf_get_init_device
from llmfoundry.models.hf.model_wrapper import HuggingFaceModelWithZLoss
from llmfoundry.models.utils import (adapt_tokenizer_for_denoising,
init_empty_weights)
__all__ = ['ComposerHFT5']
# HuggingFace hardcodes the ignore index to -100
_HF_IGNORE_INDEX = -100
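# (this matches the default ``ignore_index`` of ``torch.nn.CrossEntropyLoss``)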


class ComposerHFT5(HuggingFaceModelWithZLoss):
    """Configures a :class:`.HuggingFaceModel` around a T5.

    Note: This class uses `transformers.T5ForConditionalGeneration`. Future releases
        will expand support to more general classes of HF Encoder-Decoder models.

    Args:
        om_model_config (DictConfig): An omegaconf dictionary used to configure the model:
            om_model_config.pretrained_model_name_or_path (str): The name of or local path to
                the HF model (e.g., `t5-base` to instantiate a T5 using the base config).
            om_model_config.config_overrides (dict, optional): An optional dictionary of keyword
                arguments that override the default configuration associated with
                om_model_config.pretrained_model_name_or_path. Default: ``{}``.
            om_model_config.pretrained (bool): Whether to instantiate the model with pre-trained
                weights coming from om_model_config.pretrained_model_name_or_path. If ``True``,
                om_model_config.config_overrides must be compatible with the pre-trained weights.
            om_model_config.init_device ('cpu' | 'meta' | 'mixed'): Which device to
                initialize the model on: 'cpu', 'meta', or 'mixed' (local rank 0 on
                'cpu', all other ranks on 'meta'). Currently, `meta` is only supported
                when om_model_config.pretrained is ``False``. Default: ``'cpu'``.
            om_model_config.z_loss (float, optional): The coefficient of the z-loss. If >0.0,
                the z-loss will be multiplied by this value before being added to the
                standard loss term. Default: ``0.0``.
            om_model_config.adapt_vocab_for_denoising (bool, optional): Whether to adapt the vocab
                of the model/tokenizer to include sentinel tokens that are used in denoising
                tasks like Span Corruption. If you intend to load from an existing Composer
                checkpoint that was trained on such a task, set this to ``True`` to ensure
                that the model vocab size matches your checkpoint's vocab size when loading
                the weights. Default: ``False``.
        tokenizer (PreTrainedTokenizerBase): The tokenizer that the model will use.
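
    Example:
        A minimal sketch of constructing this model (values are illustrative,
        not required)::

            from omegaconf import OmegaConf
            from transformers import AutoTokenizer

            om_model_config = OmegaConf.create({
                'pretrained_model_name_or_path': 't5-base',
                'pretrained': True,
                'init_device': 'cpu',
                'z_loss': 0.0,
            })
            tokenizer = AutoTokenizer.from_pretrained('t5-base')
            model = ComposerHFT5(om_model_config, tokenizer)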
"""

    def __init__(self, om_model_config: DictConfig,
                 tokenizer: PreTrainedTokenizerBase):
        config = AutoConfig.from_pretrained(
            om_model_config.pretrained_model_name_or_path,
            trust_remote_code=om_model_config.get('trust_remote_code', True),
            use_auth_token=om_model_config.get('use_auth_token', False),
        )
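
        # Note: ``trust_remote_code`` defaults to ``True`` above, which allows
        # custom modeling code from the Hub to run; set it to ``False`` in the
        # model config to disallow that.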

        # set config overrides: a Mapping-valued attribute is updated key-by-key,
        # while any other attribute is replaced outright
        for k, v in om_model_config.get('config_overrides', {}).items():
            if not hasattr(config, k):
                raise ValueError(
                    f'config does not have attribute "{k}" to override ({k}: {v}).'
                )

            attr = getattr(config, k)
            if isinstance(attr, Mapping):
                extra_keys = [_k for _k in v.keys() if _k not in attr.keys()]
                if extra_keys:
                    raise ValueError(
                        'Config dict override got unknown keys. ' +
                        f'Extra keys: {extra_keys}. ' +
                        f'Expected (a subset of) keys: {list(attr.keys())}.')
                getattr(config, k).update(v)
            else:
                setattr(config, k, v)
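
        # As an example, a YAML model config could include (illustrative values):
        #
        #   config_overrides:
        #     num_layers: 8
        #     dropout_rate: 0.1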

        if not config.is_encoder_decoder:
            raise ValueError('Model type "hf_t5" currently only supports T5 models '
                             'using configs where `is_encoder_decoder` is ``True``.')

        # Set up the tokenizer (add tokens for denoising sentinels if needed)
        if om_model_config.get('adapt_vocab_for_denoising', False):
            adapt_tokenizer_for_denoising(tokenizer)

        init_device = om_model_config.get('init_device', 'cpu')

        # Get the device we want to initialize, and use the
        # resolved version to initialize the HF model
        resolved_init_device = hf_get_init_device(init_device)

        # We need to have all non-zero local ranks be not-pretrained.
        # Rank 0 will still be pretrained, and distribute the weights appropriately
        if dist.get_local_rank() != 0 and init_device == 'mixed':
            om_model_config.pretrained = False
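
        # With init_device='mixed', only local rank 0 materializes the pretrained
        # weights; the other ranks construct the model on the meta device and get
        # their copy when the weights are distributed (e.g., by FSDP).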

        if resolved_init_device == 'cpu':
            if om_model_config.pretrained:
                model = T5ForConditionalGeneration.from_pretrained(
                    om_model_config.pretrained_model_name_or_path, config=config)
            else:
                model = T5ForConditionalGeneration(config)
        elif resolved_init_device == 'meta':
            if om_model_config.pretrained:
                raise ValueError(
                    'Setting cfg.pretrained=True is not supported when init_device="meta".'
                )
            with init_empty_weights(include_buffers=False):
                model = T5ForConditionalGeneration(config)
        else:
            raise ValueError(
                f'init_device="{init_device}" must be either "cpu", "meta", or "mixed".')
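
        # Both metrics skip positions whose label equals _HF_IGNORE_INDEX
        # (e.g., padding), mirroring how HF masks those tokens out of the loss.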
        metrics = [
            LanguageCrossEntropy(ignore_index=_HF_IGNORE_INDEX),
            MaskedAccuracy(ignore_index=_HF_IGNORE_INDEX)
        ]

        super().__init__(model=model,
                         tokenizer=tokenizer,
                         metrics=metrics,
                         z_loss=om_model_config.get('z_loss', 0.0),
                         init_device=init_device)