"""Implements a Hugging Prefix LM wrapped inside a :class:`.ComposerModel`.""" |
|
|
|
from __future__ import annotations |
|
|
|
from typing import Mapping, MutableMapping |
|
|
|
from composer.metrics.nlp import LanguageCrossEntropy, MaskedAccuracy |
|
from composer.utils import dist |
|
from omegaconf import DictConfig |
|
from transformers import (AutoConfig, AutoModelForCausalLM, |
|
PreTrainedTokenizerBase) |
|
|
|
from llmfoundry.models.hf.hf_fsdp import hf_get_init_device |
|
from llmfoundry.models.hf.model_wrapper import HuggingFaceModelWithZLoss |
|
from llmfoundry.models.utils import (adapt_tokenizer_for_denoising, |
|
add_bidirectional_mask_if_missing, |
|
convert_hf_causal_lm_to_prefix_lm, |
|
init_empty_weights) |
|
|
|
__all__ = ['ComposerHFPrefixLM']

_HF_IGNORE_INDEX = -100


class ComposerHFPrefixLM(HuggingFaceModelWithZLoss):
    """Configures a :class:`.HuggingFaceModel` around a Prefix LM.

    Note: HuggingFace does not natively support Prefix LM-style models. This class uses
    `transformers.AutoModelForCausalLM` to instantiate a Causal LM, then uses a conversion utility
    to turn the model into a Prefix LM. Currently, that conversion utility only supports the
    following HuggingFace Causal LM types:
        - `GPT2LMHeadModel`
        - `GPTNeoForCausalLM`
        - `GPTNeoXForCausalLM`
        - `GPTJForCausalLM`
        - `BloomForCausalLM`
        - `OPTForCausalLM`

    Args:
        om_model_config (DictConfig): An omegaconf dictionary used to configure the model:
            om_model_config.pretrained_model_name_or_path (str): The name of or local path to
                the HF model (e.g., `gpt2` to instantiate a GPT2LMHeadModel). The model
                will be converted to a Prefix LM during initialization.
            om_model_config.config_overrides (dict, optional): An optional dictionary of keyword
                arguments that override the default configuration associated with
                om_model_config.pretrained_model_name_or_path. Default: ``{}``.
            om_model_config.pretrained (bool): Whether to instantiate the model with pre-trained
                weights coming from om_model_config.pretrained_model_name_or_path. If ``True``,
                om_model_config.config_overrides must be compatible with the pre-trained weights.
            om_model_config.init_device ('cpu' | 'meta' | 'mixed'): Which device to
                initialize the model on. Currently, `meta` is only supported when
                om_model_config.pretrained is ``False``; `mixed` loads pre-trained weights
                on local rank 0 only and initializes empty modules elsewhere.
                Default: ``'cpu'``.
            om_model_config.z_loss (float, optional): The coefficient of the z-loss. If >0.0,
                the z-loss will be multiplied by this value before being added to the
                standard loss term. Default: ``0.0``.
            om_model_config.adapt_vocab_for_denoising (bool, optional): Whether to adapt the vocab
                of the model/tokenizer to include sentinel tokens that are used in denoising
                tasks like Span Corruption. If you intend to load from an existing Composer
                checkpoint that was trained on such a task, set this to ``True`` to ensure
                that the model vocab size matches your checkpoint's vocab size when loading
                the weights. Default: ``False``.
        tokenizer (PreTrainedTokenizer): The tokenizer that the model will use.
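
    Example (a minimal construction sketch; the checkpoint name and override
    value are illustrative, not prescriptive):
        .. code-block:: python

            from omegaconf import OmegaConf
            from transformers import AutoTokenizer

            cfg = OmegaConf.create({
                'pretrained_model_name_or_path': 'gpt2',
                'pretrained': True,
                'config_overrides': {'attn_pdrop': 0.0},  # illustrative override
            })
            tokenizer = AutoTokenizer.from_pretrained('gpt2')

            # The causal GPT2LMHeadModel is loaded, then converted to a Prefix LM.
            model = ComposerHFPrefixLM(cfg, tokenizer)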
    """

    def __init__(self, om_model_config: DictConfig,
                 tokenizer: PreTrainedTokenizerBase):
        config = AutoConfig.from_pretrained(
            om_model_config.pretrained_model_name_or_path,
            trust_remote_code=om_model_config.get('trust_remote_code', True),
            use_auth_token=om_model_config.get('use_auth_token', False),
        )
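
        # Apply any user-supplied config overrides, validating that each key
        # already exists on the loaded config before setting it.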
        for k, v in om_model_config.get('config_overrides', {}).items():
            if not hasattr(config, k):
                raise ValueError(
                    f'config does not have attribute "{k}" to override ({k}: {v}).'
                )

            attr = getattr(config, k)
            if isinstance(attr, Mapping):
                extra_keys = [_k for _k in v.keys() if _k not in attr.keys()]
                if extra_keys:
                    raise ValueError(
                        f'Config dict override got unknown keys. ' +
                        f'Extra keys: {extra_keys}. ' +
                        f'Expected (a subset of) keys: {list(attr.keys())}.')
                getattr(config, k).update(v)
            else:
                setattr(config, k, v)
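
        # Optionally add the sentinel tokens used by denoising tasks (e.g.,
        # span corruption) so the vocab matches denoising checkpoints.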
        if om_model_config.get('adapt_vocab_for_denoising', False):
            adapt_tokenizer_for_denoising(tokenizer)

        init_device = om_model_config.get('init_device', 'cpu')
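
        # Resolve the requested init_device into the device this rank should
        # actually use ('mixed' resolves per-rank via hf_get_init_device).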
        resolved_init_device = hf_get_init_device(init_device)
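
        # With init_device='mixed', only local rank 0 keeps pretrained=True;
        # the other ranks initialize empty modules and receive the weights
        # when they are distributed.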
        if dist.get_local_rank() != 0 and init_device == 'mixed':
            om_model_config.pretrained = False

        if resolved_init_device == 'cpu':
            if om_model_config.pretrained:
                model = AutoModelForCausalLM.from_pretrained(
                    om_model_config.pretrained_model_name_or_path,
                    config=config)
            else:
                model = AutoModelForCausalLM.from_config(config)
        elif resolved_init_device == 'meta':
            if om_model_config.pretrained:
                raise ValueError(
                    'Setting cfg.pretrained=True is not supported when init_device="meta".'
                )
            with init_empty_weights(include_buffers=False):
                model = AutoModelForCausalLM.from_config(config)
        else:
            raise ValueError(
                f'init_device="{init_device}" must be either "cpu" or "meta".')
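
        # Patch the loaded causal LM into a Prefix LM; the converted model
        # uses a 'bidirectional_mask' to let prefix tokens attend bidirectionally.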
        model = convert_hf_causal_lm_to_prefix_lm(model)

        metrics = [
            LanguageCrossEntropy(ignore_index=_HF_IGNORE_INDEX),
            MaskedAccuracy(ignore_index=_HF_IGNORE_INDEX)
        ]

        super().__init__(model=model,
                         shift_labels=True,
                         tokenizer=tokenizer,
                         metrics=metrics,
                         z_loss=om_model_config.get('z_loss', 0.0),
                         init_device=init_device)

    def forward(self, batch: MutableMapping):
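        # Add a 'bidirectional_mask' to the batch if it is missing, so the
        # prefix portion of each sequence can attend bidirectionally.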
        add_bidirectional_mask_if_missing(batch)
        return super().forward(batch)