# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0
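"""Convert a Composer checkpoint into a standard HuggingFace checkpoint folder.

Example invocation (the paths below are illustrative, not part of the original
script; see ``parse_args`` for the full flag list):

    python convert_composer_to_hf.py \
        --composer_path ./checkpoints/ep1-rank0.pt \
        --hf_output_path ./hf-checkpoint \
        --output_precision bf16
"""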
import os
import tempfile
from argparse import ArgumentParser, Namespace
from pathlib import Path
from typing import Optional, Tuple, Union

import torch
import transformers
from composer.models.huggingface import get_hf_config_from_composer_state_dict
from composer.utils import (get_file, maybe_create_object_store_from_uri,
                            parse_uri, safe_torch_load)
from transformers import PretrainedConfig, PreTrainedTokenizerBase

from llmfoundry import MPTConfig, MPTForCausalLM
from llmfoundry.utils import get_hf_tokenizer_from_composer_state_dict
from llmfoundry.utils.huggingface_hub_utils import \
    edit_files_for_hf_compatibility


def write_huggingface_pretrained_from_composer_checkpoint(
    checkpoint_path: Union[Path, str],
    output_path: Union[Path, str],
    output_precision: str = 'fp32',
    local_checkpoint_save_location: Optional[Union[Path, str]] = None
) -> Tuple[PretrainedConfig, Optional[PreTrainedTokenizerBase]]:
"""Convert a Composer checkpoint to a pretrained HF checkpoint folder.
Write a ``config.json`` and ``pytorch_model.bin``, like
:meth:`transformers.PreTrainedModel.from_pretrained` expects, from a
composer checkpoint.
.. note:: This function will not work properly if you used surgery algorithms when you trained your model. In that case you will want to
load the model weights using the Composer :class:`~composer.Trainer` with the ``load_path`` argument.
.. testsetup::
import torch
dataset = RandomTextClassificationDataset(size=16, use_keys=True)
train_dataloader = torch.utils.data.DataLoader(dataset, batch_size=8)
eval_dataloader = torch.utils.data.DataLoader(dataset, batch_size=8)
import transformers
from composer.models import HuggingFaceModel
from composer.trainer import Trainer
hf_model = transformers.AutoModelForSequenceClassification.from_pretrained('prajjwal1/bert-tiny', num_labels=2)
hf_tokenizer = transformers.AutoTokenizer.from_pretrained('prajjwal1/bert-tiny')
composer_model = HuggingFaceModel(hf_model, tokenizer=hf_tokenizer, metrics=[], use_logits=True)
trainer = Trainer(model=composer_model,
train_dataloader=train_dataloader,
save_filename='composer-hf-checkpoint.pt',
max_duration='1ep',
save_folder='./')
trainer.fit()
trainer.close()
Example:
.. testcode::
from composer.models import write_huggingface_pretrained_from_composer_checkpoint
write_huggingface_pretrained_from_composer_checkpoint('composer-hf-checkpoint.pt', './hf-save-pretrained-output')
loaded_model = transformers.AutoModelForSequenceClassification.from_pretrained('./hf-save-pretrained-output')
Args:
checkpoint_path (Union[Path, str]): Path to the composer checkpoint, can be a local path, or a remote path beginning with ``s3://``, or another backend
supported by :meth:`composer.utils.maybe_create_object_store_from_uri`.
output_path (Union[Path, str]): Path to the folder to write the output to.
output_precision (str, optional): The precision of the output weights saved to `pytorch_model.bin`. Can be one of ``fp32``, ``fp16``, or ``bf16``.
local_checkpoint_save_location (Optional[Union[Path, str]], optional): If specified, where to save the checkpoint file to locally.
If the input ``checkpoint_path`` is already a local path, this will be a symlink.
Defaults to None, which will use a temporary file.
"""
    dtype = {
        'fp32': torch.float32,
        'fp16': torch.float16,
        'bf16': torch.bfloat16,
    }[output_precision]

    # default local path to a tempfile if path is not provided
    if local_checkpoint_save_location is None:
        tmp_dir = tempfile.TemporaryDirectory()
        local_checkpoint_save_location = Path(
            tmp_dir.name) / 'local-composer-checkpoint.pt'

    # create folder
    os.makedirs(output_path)

    # download the checkpoint file
    print(
        f'Downloading checkpoint from {checkpoint_path} -> {local_checkpoint_save_location}'
    )
    get_file(str(checkpoint_path), str(local_checkpoint_save_location))

    # Load the Composer checkpoint state dict
    print('Loading checkpoint into CPU RAM...')
    composer_state_dict = safe_torch_load(local_checkpoint_save_location)

    if 'state' not in composer_state_dict:
        raise RuntimeError(
            f'"state" is not an available key in the provided composer checkpoint. Is {local_checkpoint_save_location} ill-formed?'
        )
    # Build and save HF Config
    print('#' * 30)
    print('Saving HF Model Config...')
    hf_config = get_hf_config_from_composer_state_dict(composer_state_dict)
    hf_config.torch_dtype = dtype
    hf_config.save_pretrained(output_path)
    print(hf_config)

    # Extract and save the HF tokenizer
    print('#' * 30)
    print('Saving HF Tokenizer...')
    hf_tokenizer = get_hf_tokenizer_from_composer_state_dict(
        composer_state_dict)
    if hf_tokenizer is not None:
        hf_tokenizer.save_pretrained(output_path)
        print(hf_tokenizer)
    else:
        print('Warning! No HF Tokenizer found!')

    # Extract the HF model weights
    print('#' * 30)
    print('Saving HF Model Weights...')
    weights_state_dict = composer_state_dict
    if 'state' in weights_state_dict:
        weights_state_dict = weights_state_dict['state']['model']
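    # Composer nests the underlying HF model's weights under a 'model.' prefix;
    # strip it so the keys match what `from_pretrained` expects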
torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(
weights_state_dict, prefix='model.')
    # Convert weights to desired dtype
    for k, v in weights_state_dict.items():
        if isinstance(v, torch.Tensor):
            weights_state_dict[k] = v.to(dtype=dtype)

    # Save weights
    torch.save(weights_state_dict, Path(output_path) / 'pytorch_model.bin')

    print('#' * 30)
    print(f'HF checkpoint folder successfully created at {output_path}.')

    return hf_config, hf_tokenizer


def parse_args() -> Namespace:
    """Parse commandline arguments."""
    parser = ArgumentParser(
        description=
        'Convert a HuggingFace causal LM in a Composer checkpoint into a standard HuggingFace checkpoint folder, and optionally upload to the hub.'
    )
    parser.add_argument('--composer_path', type=str, required=True)
    parser.add_argument('--hf_output_path', type=str, required=True)
    parser.add_argument('--local_checkpoint_save_location',
                        type=str,
                        default=None)
    parser.add_argument('--output_precision',
                        type=str,
                        choices=['fp32', 'fp16', 'bf16'],
                        default='fp32')
    parser.add_argument('--hf_repo_for_upload', type=str, default=None)
    parser.add_argument('--test_uploaded_model', action='store_true')

    return parser.parse_args()


def convert_composer_to_hf(args: Namespace) -> None:
    """Convert the given Composer checkpoint and optionally upload the result."""
    print()
    print('#' * 30)
    print('Converting Composer checkpoint to HuggingFace checkpoint format...')

# Register MPT auto classes so that this script works with MPT
# This script will not work without modification for other custom models,
# but will work for other HuggingFace causal LMs
from transformers.models.auto.configuration_auto import CONFIG_MAPPING
CONFIG_MAPPING._extra_content['mpt'] = MPTConfig
MPTConfig.register_for_auto_class()
MPTForCausalLM.register_for_auto_class('AutoModelForCausalLM')
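
    # If hf_output_path is a remote URI (e.g. s3://bucket/path), the conversion
    # is staged in a local folder named after the URI's path component and then
    # uploaded to the object store further below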
    _, _, local_folder_path = parse_uri(args.hf_output_path)

    config, tokenizer = write_huggingface_pretrained_from_composer_checkpoint(
        checkpoint_path=args.composer_path,
        output_path=local_folder_path,
        output_precision=args.output_precision,
        local_checkpoint_save_location=args.local_checkpoint_save_location)

    dtype = {
        'fp32': torch.float32,
        'fp16': torch.float16,
        'bf16': torch.bfloat16,
    }[args.output_precision]

    print(f'Loading model from {local_folder_path}')
    if config.model_type == 'mpt':
        # Load MPT on CPU with the pure-torch attention implementation so the
        # conversion does not require a GPU or custom attention kernels
        config.attn_config['attn_impl'] = 'torch'
        config.init_device = 'cpu'
        loaded_hf_model = MPTForCausalLM.from_pretrained(local_folder_path,
                                                         config=config,
                                                         torch_dtype=dtype)
    else:
        loaded_hf_model = transformers.AutoModelForCausalLM.from_pretrained(
            local_folder_path, config=config, torch_dtype=dtype)

    # Remove the local load path baked into the config so the saved checkpoint
    # does not reference a machine-specific directory
    delattr(loaded_hf_model.config, '_name_or_path')
    loaded_hf_model.save_pretrained(local_folder_path)

    print(f'Loading tokenizer from {local_folder_path}')
    tokenizer = transformers.AutoTokenizer.from_pretrained(local_folder_path)
    tokenizer.save_pretrained(local_folder_path)

    # Only need to edit files for MPT because it has custom code
    if config.model_type == 'mpt':
        print('Editing files for HF compatibility...')
        edit_files_for_hf_compatibility(local_folder_path)

    object_store = maybe_create_object_store_from_uri(str(args.hf_output_path))
    if object_store is not None:
        print(
            f'Uploading HF checkpoint folder from {local_folder_path} -> {args.hf_output_path}'
        )
        for file in os.listdir(local_folder_path):
            # The object name in the remote store mirrors the local staging
            # folder layout
            remote_file = os.path.join(local_folder_path, file)
            local_file = os.path.join(local_folder_path, file)
            object_store.upload_object(remote_file, local_file)

    if args.hf_repo_for_upload is not None:
        from huggingface_hub import HfApi
        api = HfApi()

        print(
            f'Uploading {args.hf_output_path} to HuggingFace Hub at {args.hf_repo_for_upload}'
        )
        api.create_repo(repo_id=args.hf_repo_for_upload,
                        use_auth_token=True,
                        repo_type='model',
                        private=True,
                        exist_ok=True)
        print('Repo created.')

        # ignore the full checkpoint file if we now have sharded checkpoint files
        ignore_patterns = []
        if any(
                f.startswith('pytorch_model-00001')
                for f in os.listdir(args.hf_output_path)):
            ignore_patterns.append('pytorch_model.bin')

        api.upload_folder(folder_path=args.hf_output_path,
                          repo_id=args.hf_repo_for_upload,
                          use_auth_token=True,
                          repo_type='model',
                          ignore_patterns=ignore_patterns)
        print('Folder uploaded.')

        if args.test_uploaded_model:
            print('Testing uploaded model...')
            hub_model = transformers.AutoModelForCausalLM.from_pretrained(
                args.hf_repo_for_upload,
                trust_remote_code=True,
                use_auth_token=True,
                torch_dtype=dtype)
            hub_tokenizer = transformers.AutoTokenizer.from_pretrained(
                args.hf_repo_for_upload,
                trust_remote_code=True,
                use_auth_token=True)

            # Sanity checks: the model downloaded back from the Hub should have
            # the same parameter count, the same module classes, and the
            # expected dtype as the locally converted model
            assert sum(p.numel() for p in hub_model.parameters()) == sum(
                p.numel() for p in loaded_hf_model.parameters())
            assert all(
                str(type(module1)).split('.')[-2:] == str(type(module2)).split(
                    '.')[-2:] for module1, module2 in zip(
                        hub_model.modules(), loaded_hf_model.modules()))
            assert next(
                hub_model.parameters()
            ).dtype == dtype, f'Expected model dtype to be {dtype}, but got {next(hub_model.parameters()).dtype}'

            print(
                hub_tokenizer.batch_decode(
                    hub_model.generate(hub_tokenizer(
                        'MosaicML is', return_tensors='pt').input_ids,
                                       max_new_tokens=10)))

    print(
        'Composer checkpoint successfully converted to HuggingFace checkpoint format.'
    )


if __name__ == '__main__':
    convert_composer_to_hf(parse_args())