# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0
"""Implements a Hugging Prefix LM wrapped inside a :class:`.ComposerModel`."""

from __future__ import annotations

from typing import Mapping, MutableMapping

from composer.metrics.nlp import LanguageCrossEntropy, MaskedAccuracy
from composer.utils import dist
from omegaconf import DictConfig
from transformers import (AutoConfig, AutoModelForCausalLM,
                          PreTrainedTokenizerBase)

from llmfoundry.models.hf.hf_fsdp import hf_get_init_device
from llmfoundry.models.hf.model_wrapper import HuggingFaceModelWithZLoss
from llmfoundry.models.utils import (adapt_tokenizer_for_denoising,
                                     add_bidirectional_mask_if_missing,
                                     convert_hf_causal_lm_to_prefix_lm,
                                     init_empty_weights)

__all__ = ['ComposerHFPrefixLM']

# HuggingFace hardcodes the ignore index to -100
_HF_IGNORE_INDEX = -100

class ComposerHFPrefixLM(HuggingFaceModelWithZLoss):
"""Configures a :class:`.HuggingFaceModel` around a Prefix LM.
Note: HuggingFace does not natively support Prefix LM-style models. This function uses
`transformers.AutoModelForCausalLM` to instantiate a Causal LM, then uses a conversion utility
to turn the model into a Prefix LM. Currently, that conversion utility only supports the
following HuggingFace Causal LM types:
- `GPT2LMHeadModel`
- `GPTNeoForCausalLM`
- `GPTNeoXForCausalLM`
- `GPTJForCausalLM`
- `BloomForCausalLM`
- `OPTForCausalLM`
Args:
cfg (DictConfig): An omegaconf dictionary used to configure the model:
cfg.pretrained_model_name_or_path (str): The name of or local path to
the HF model (e.g., `gpt2` to instantiate a GPT2LMHeadModel). The model
will be converted to a Prefix LM during initialization.
cfg.config_overrides (dict, optional): An optional dictionary of keyword
arguments that override the default configuration associated with
cfg.pretrained_model_name_or_path. Default: ``{}``.
cfg.pretrained (bool): Whether to instantiate the model with pre-trained
weights coming from cfg.pretrained_model_name_or_path. If ``True``,
cfg.config_overrides must be compatible with the pre-trained weights.
cfg.init_device ('cpu' | 'meta'): Which device, 'cpu' or 'meta', to
initialize the model on. Currently, `meta` is only supported when
cfg.pretrained is ``False``. Default: ``'cpu'``.
            cfg.z_loss (float, optional): The coefficient of the z-loss. If >0.0,
                the z-loss will be multiplied by this value before being added to the
                standard loss term. Default: ``0.0``.
            cfg.adapt_vocab_for_denoising (bool, optional): Whether to adapt the vocab
                of the model/tokenizer to include sentinel tokens that are used in denoising
                tasks like Span Corruption. If you intend to load from an existing Composer
                checkpoint that was trained on such a task, set this to ``True`` to ensure
                that the model vocab size matches your checkpoint's vocab size when loading
                the weights. Default: ``False``.
        tokenizer (PreTrainedTokenizer): The tokenizer that the model will use.
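
    Example:
        A minimal construction sketch (values are illustrative; in practice the config
        is usually built from a YAML file, and ``tokenizer`` is a tokenizer built
        separately for the same ``pretrained_model_name_or_path``):

        >>> from omegaconf import OmegaConf
        >>> cfg = OmegaConf.create({
        ...     'pretrained_model_name_or_path': 'gpt2',
        ...     'pretrained': True,
        ...     'init_device': 'cpu',
        ...     'z_loss': 0.0,
        ... })
        >>> model = ComposerHFPrefixLM(cfg, tokenizer)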
"""
def __init__(self, om_model_config: DictConfig,
tokenizer: PreTrainedTokenizerBase):
        config = AutoConfig.from_pretrained(
            om_model_config.pretrained_model_name_or_path,
            trust_remote_code=om_model_config.get('trust_remote_code', True),
            use_auth_token=om_model_config.get('use_auth_token', False),
        )

        # set config overrides
        for k, v in om_model_config.get('config_overrides', {}).items():
            if not hasattr(config, k):
                raise ValueError(
                    f'config does not have attribute "{k}" to override ({k}: {v}).'
                )

            attr = getattr(config, k)
            if isinstance(attr, Mapping):
                extra_keys = [_k for _k in v.keys() if _k not in attr.keys()]
                if extra_keys:
                    raise ValueError(
                        f'Config dict override got unknown keys. ' +
                        f'Extra keys: {extra_keys}. ' +
                        f'Expected (a subset of) keys: {list(attr.keys())}.')
                getattr(config, k).update(v)
            else:
                setattr(config, k, v)
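
        # Illustrative example (hypothetical values): a scalar override such as
        #     config_overrides: {'resid_pdrop': 0.1}
        # takes the `setattr` branch above, while a nested-dict override is merged
        # into the matching sub-mapping of the config via `update`.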

        # Set up the tokenizer (add tokens for denoising sentinels if needed)
        if om_model_config.get('adapt_vocab_for_denoising', False):
            adapt_tokenizer_for_denoising(tokenizer)

        init_device = om_model_config.get('init_device', 'cpu')

        # Resolve the device we want to initialize the model on, and use the
        # resolved version when constructing the HF model
        resolved_init_device = hf_get_init_device(init_device)

        # We need to have all non-zero local ranks be not-pretrained
        # Rank 0 will still be pretrained, and distribute the weights appropriately
        if dist.get_local_rank() != 0 and init_device == 'mixed':
            om_model_config.pretrained = False

        if resolved_init_device == 'cpu':
            if om_model_config.pretrained:
                model = AutoModelForCausalLM.from_pretrained(
                    om_model_config.pretrained_model_name_or_path,
                    config=config)
            else:
                model = AutoModelForCausalLM.from_config(config)
        elif resolved_init_device == 'meta':
            if om_model_config.pretrained:
                raise ValueError(
                    'Setting cfg.pretrained=True is not supported when init_device="meta".'
                )
            with init_empty_weights(include_buffers=False):
                model = AutoModelForCausalLM.from_config(config)
        else:
            raise ValueError(
                f'init_device="{init_device}" must be either "cpu" or "meta".')

        # Convert the Causal LM into a Prefix LM via our custom wrapper
        model = convert_hf_causal_lm_to_prefix_lm(model)
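        # After conversion, the model's forward pass accepts a `bidirectional_mask`
        # input: positions marked 1 (the prefix) attend bidirectionally, while the
        # remaining positions keep standard causal masking.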

        metrics = [
            LanguageCrossEntropy(ignore_index=_HF_IGNORE_INDEX),
            MaskedAccuracy(ignore_index=_HF_IGNORE_INDEX)
        ]

        composer_model = super().__init__(model=model,
                                          shift_labels=True,
                                          tokenizer=tokenizer,
                                          metrics=metrics,
                                          z_loss=om_model_config.get(
                                              'z_loss', 0.0),
                                          init_device=init_device)

        return composer_model

    def forward(self, batch: MutableMapping):
        # Add bidirectional_mask if it is missing and can be constructed
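        # Illustrative sketch (hypothetical token ids) of a prefix-LM batch:
        #     input_ids:      [[p1, p2, t1, t2]]        # prefix tokens, then targets
        #     attention_mask: [[1, 1, 1, 1]]
        #     labels:         [[-100, -100, t1, t2]]    # prefix positions ignored by the loss
        # Given `labels` and `attention_mask`, the prefix (ignored-label) positions can
        # be used to build bidirectional_mask = [[1, 1, 0, 0]], letting the prefix attend
        # bidirectionally while the targets remain causal.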
        add_bidirectional_mask_if_missing(batch)
        return super().forward(batch)