# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0
"""Implements a Hugging Prefix LM wrapped inside a :class:`.ComposerModel`."""

from __future__ import annotations

from typing import Mapping, MutableMapping

from composer.metrics.nlp import LanguageCrossEntropy, MaskedAccuracy
from composer.utils import dist
from omegaconf import DictConfig
from transformers import (AutoConfig, AutoModelForCausalLM,
                          PreTrainedTokenizerBase)

from llmfoundry.models.hf.hf_fsdp import hf_get_init_device
from llmfoundry.models.hf.model_wrapper import HuggingFaceModelWithZLoss
from llmfoundry.models.utils import (adapt_tokenizer_for_denoising,
                                     add_bidirectional_mask_if_missing,
                                     convert_hf_causal_lm_to_prefix_lm,
                                     init_empty_weights)

__all__ = ['ComposerHFPrefixLM']

# HuggingFace hardcodes the ignore index to -100
_HF_IGNORE_INDEX = -100

class ComposerHFPrefixLM(HuggingFaceModelWithZLoss):
"""Configures a :class:`.HuggingFaceModel` around a Prefix LM.
Note: HuggingFace does not natively support Prefix LM-style models. This function uses
`transformers.AutoModelForCausalLM` to instantiate a Causal LM, then uses a conversion utility
to turn the model into a Prefix LM. Currently, that conversion utility only supports the
following HuggingFace Causal LM types:
- `GPT2LMHeadModel`
- `GPTNeoForCausalLM`
- `GPTNeoXForCausalLM`
- `GPTJForCausalLM`
- `BloomForCausalLM`
- `OPTForCausalLM`
Args:
cfg (DictConfig): An omegaconf dictionary used to configure the model:
cfg.pretrained_model_name_or_path (str): The name of or local path to
the HF model (e.g., `gpt2` to instantiate a GPT2LMHeadModel). The model
will be converted to a Prefix LM during initialization.
cfg.config_overrides (dict, optional): An optional dictionary of keyword
arguments that override the default configuration associated with
cfg.pretrained_model_name_or_path. Default: ``{}``.
cfg.pretrained (bool): Whether to instantiate the model with pre-trained
weights coming from cfg.pretrained_model_name_or_path. If ``True``,
cfg.config_overrides must be compatible with the pre-trained weights.
cfg.init_device ('cpu' | 'meta'): Which device, 'cpu' or 'meta', to
initialize the model on. Currently, `meta` is only supported when
cfg.pretrained is ``False``. Default: ``'cpu'``.
            cfg.z_loss (float, optional): The coefficient of the z-loss. If >0.0,
                the z-loss will be multiplied by this value before being added to the
                standard loss term. Default: ``0.0``.
            cfg.adapt_vocab_for_denoising (bool, optional): Whether to adapt the vocab
                of the model/tokenizer to include sentinel tokens that are used in denoising
                tasks like Span Corruption. If you intend to load from an existing Composer
                checkpoint that was trained on such a task, set this to ``True`` to ensure
                that the model vocab size matches your checkpoint's vocab size when loading
                the weights. Default: ``False``.
        tokenizer (PreTrainedTokenizer): The tokenizer that the model will use.
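
    Example:
        A minimal construction sketch (values are illustrative; in practice the config
        is usually built from a YAML file, and ``tokenizer`` is a tokenizer built
        separately for the same ``pretrained_model_name_or_path``):

        >>> from omegaconf import OmegaConf
        >>> cfg = OmegaConf.create({
        ...     'pretrained_model_name_or_path': 'gpt2',
        ...     'pretrained': True,
        ...     'init_device': 'cpu',
        ...     'z_loss': 0.0,
        ... })
        >>> model = ComposerHFPrefixLM(cfg, tokenizer)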
"""
def __init__(self, om_model_config: DictConfig,
tokenizer: PreTrainedTokenizerBase):
        config = AutoConfig.from_pretrained(
            om_model_config.pretrained_model_name_or_path,
            trust_remote_code=om_model_config.get('trust_remote_code', True),
            use_auth_token=om_model_config.get('use_auth_token', False),
        )

        # set config overrides
        for k, v in om_model_config.get('config_overrides', {}).items():
            if not hasattr(config, k):
                raise ValueError(
                    f'config does not have attribute "{k}" to override ({k}: {v}).'
                )

            attr = getattr(config, k)
            if isinstance(attr, Mapping):
                extra_keys = [_k for _k in v.keys() if _k not in attr.keys()]
                if extra_keys:
                    raise ValueError(
                        f'Config dict override got unknown keys. ' +
                        f'Extra keys: {extra_keys}. ' +
                        f'Expected (a subset of) keys: {list(attr.keys())}.')
                getattr(config, k).update(v)
            else:
                setattr(config, k, v)
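
        # Illustrative example (hypothetical values): a scalar override such as
        #     config_overrides: {'resid_pdrop': 0.1}
        # takes the `setattr` branch above, while a nested-dict override is merged
        # into the matching sub-mapping of the config via `update`.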

        # Set up the tokenizer (add tokens for denoising sentinels if needed)
        if om_model_config.get('adapt_vocab_for_denoising', False):
            adapt_tokenizer_for_denoising(tokenizer)

        init_device = om_model_config.get('init_device', 'cpu')

        # Resolve the device we want to initialize the model on, and use the
        # resolved version when constructing the HF model
        resolved_init_device = hf_get_init_device(init_device)

        # We need to have all non-zero local ranks be not-pretrained
        # Rank 0 will still be pretrained, and distribute the weights appropriately
        if dist.get_local_rank() != 0 and init_device == 'mixed':
            om_model_config.pretrained = False

        if resolved_init_device == 'cpu':
            if om_model_config.pretrained:
                model = AutoModelForCausalLM.from_pretrained(
                    om_model_config.pretrained_model_name_or_path,
                    config=config)
            else:
                model = AutoModelForCausalLM.from_config(config)
        elif resolved_init_device == 'meta':
            if om_model_config.pretrained:
                raise ValueError(
                    'Setting cfg.pretrained=True is not supported when init_device="meta".'
                )
            with init_empty_weights(include_buffers=False):
                model = AutoModelForCausalLM.from_config(config)
        else:
            raise ValueError(
                f'init_device="{init_device}" must be either "cpu" or "meta".')

        # Convert the Causal LM into a Prefix LM via our custom wrapper
        model = convert_hf_causal_lm_to_prefix_lm(model)
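        # After conversion, the model's forward pass accepts a `bidirectional_mask`
        # input: positions marked 1 (the prefix) attend bidirectionally, while the
        # remaining positions keep standard causal masking.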

        metrics = [
            LanguageCrossEntropy(ignore_index=_HF_IGNORE_INDEX),
            MaskedAccuracy(ignore_index=_HF_IGNORE_INDEX)
        ]

        composer_model = super().__init__(model=model,
                                          shift_labels=True,
                                          tokenizer=tokenizer,
                                          metrics=metrics,
                                          z_loss=om_model_config.get(
                                              'z_loss', 0.0),
                                          init_device=init_device)

        return composer_model

    def forward(self, batch: MutableMapping):
        # Add bidirectional_mask if it is missing and can be constructed
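        # Illustrative sketch (hypothetical token ids) of a prefix-LM batch:
        #     input_ids:      [[p1, p2, t1, t2]]        # prefix tokens, then targets
        #     attention_mask: [[1, 1, 1, 1]]
        #     labels:         [[-100, -100, t1, t2]]    # prefix positions ignored by the loss
        # Given `labels` and `attention_mask`, the prefix (ignored-label) positions can
        # be used to build bidirectional_mask = [[1, 1, 0, 0]], letting the prefix attend
        # bidirectionally while the targets remain causal.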
        add_bidirectional_mask_if_missing(batch)
        return super().forward(batch)