import os
import tempfile
from argparse import ArgumentParser, Namespace
from pathlib import Path
from typing import Optional, Tuple, Union

import torch
import transformers
from composer.models.huggingface import get_hf_config_from_composer_state_dict
from composer.utils import (get_file, maybe_create_object_store_from_uri,
                            parse_uri, safe_torch_load)
from transformers import PretrainedConfig, PreTrainedTokenizerBase

from llmfoundry import MPTConfig, MPTForCausalLM
from llmfoundry.utils import get_hf_tokenizer_from_composer_state_dict
from llmfoundry.utils.huggingface_hub_utils import \
    edit_files_for_hf_compatibility


def write_huggingface_pretrained_from_composer_checkpoint(
        checkpoint_path: Union[Path, str],
        output_path: Union[Path, str],
        output_precision: str = 'fp32',
        local_checkpoint_save_location: Optional[Union[Path, str]] = None
) -> Tuple[PretrainedConfig, Optional[PreTrainedTokenizerBase]]:
    """Convert a Composer checkpoint to a pretrained HF checkpoint folder.

    Write a ``config.json`` and ``pytorch_model.bin``, like
    :meth:`transformers.PreTrainedModel.from_pretrained` expects, from a
    composer checkpoint.

    .. note:: This function will not work properly if you used surgery
        algorithms when you trained your model. In that case you will want to
        load the model weights using the Composer :class:`~composer.Trainer`
        with the ``load_path`` argument.

    .. testsetup::

        import torch

        dataset = RandomTextClassificationDataset(size=16, use_keys=True)
        train_dataloader = torch.utils.data.DataLoader(dataset, batch_size=8)
        eval_dataloader = torch.utils.data.DataLoader(dataset, batch_size=8)

        import transformers
        from composer.models import HuggingFaceModel
        from composer.trainer import Trainer

        hf_model = transformers.AutoModelForSequenceClassification.from_pretrained('prajjwal1/bert-tiny', num_labels=2)
        hf_tokenizer = transformers.AutoTokenizer.from_pretrained('prajjwal1/bert-tiny')
        composer_model = HuggingFaceModel(hf_model, tokenizer=hf_tokenizer, metrics=[], use_logits=True)

        trainer = Trainer(model=composer_model,
                          train_dataloader=train_dataloader,
                          save_filename='composer-hf-checkpoint.pt',
                          max_duration='1ep',
                          save_folder='./')
        trainer.fit()
        trainer.close()

    Example:

    .. testcode::

        write_huggingface_pretrained_from_composer_checkpoint('composer-hf-checkpoint.pt', './hf-save-pretrained-output')
        loaded_model = transformers.AutoModelForSequenceClassification.from_pretrained('./hf-save-pretrained-output')

    Args:
        checkpoint_path (Union[Path, str]): Path to the composer checkpoint. Can be a local path, or a remote path beginning with ``s3://``
            or any other URI scheme supported by :func:`composer.utils.maybe_create_object_store_from_uri`.
        output_path (Union[Path, str]): Path to the folder to write the output to.
        output_precision (str, optional): The precision of the output weights saved to ``pytorch_model.bin``. Can be one of ``fp32``, ``fp16``, or ``bf16``.
            Defaults to ``'fp32'``.
        local_checkpoint_save_location (Optional[Union[Path, str]], optional): If specified, where to save the checkpoint file locally.
            If the input ``checkpoint_path`` is already a local path, this will be a symlink.
            Defaults to None, which will use a temporary file.
    """
    dtype = {
        'fp32': torch.float32,
        'fp16': torch.float16,
        'bf16': torch.bfloat16,
    }[output_precision]
|
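    # Stage the checkpoint in a temporary file if no local save location was given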
    if local_checkpoint_save_location is None:
        tmp_dir = tempfile.TemporaryDirectory()
        local_checkpoint_save_location = Path(
            tmp_dir.name) / 'local-composer-checkpoint.pt'
|
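    # Create the output folder (exist_ok so reruns into the same folder work)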
    os.makedirs(output_path, exist_ok=True)
|
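    # Fetch the checkpoint; get_file downloads remote URIs and symlinks local paths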
    print(
        f'Downloading checkpoint from {checkpoint_path} -> {local_checkpoint_save_location}'
    )
    get_file(str(checkpoint_path), str(local_checkpoint_save_location))
|
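    # Load the full Composer state dict into CPU memory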
    print('Loading checkpoint into CPU RAM...')
    composer_state_dict = safe_torch_load(local_checkpoint_save_location)

    if 'state' not in composer_state_dict:
        raise RuntimeError(
            f'"state" is not an available key in the provided composer checkpoint. Is {local_checkpoint_save_location} ill-formed?'
        )
|
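    # Build the HF config from the Composer state dict and save it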
    print('#' * 30)
    print('Saving HF Model Config...')
    hf_config = get_hf_config_from_composer_state_dict(composer_state_dict)
    hf_config.torch_dtype = dtype
    hf_config.save_pretrained(output_path)
    print(hf_config)
|
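    # Extract and save the HF tokenizer, if one was stored in the checkpoint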
    print('#' * 30)
    print('Saving HF Tokenizer...')
    hf_tokenizer = get_hf_tokenizer_from_composer_state_dict(
        composer_state_dict)
    if hf_tokenizer is not None:
        hf_tokenizer.save_pretrained(output_path)
        print(hf_tokenizer)
    else:
        print('Warning! No HF Tokenizer found!')
|
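    # Extract the model weights, stripping Composer's 'model.' key prefix so
    # the parameter names match what transformers expects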
    print('#' * 30)
    print('Saving HF Model Weights...')
    weights_state_dict = composer_state_dict
    if 'state' in weights_state_dict:
        weights_state_dict = weights_state_dict['state']['model']
    torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(
        weights_state_dict, prefix='model.')
|
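    # Cast all weight tensors to the requested output precision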
    for k, v in weights_state_dict.items():
        if isinstance(v, torch.Tensor):
            weights_state_dict[k] = v.to(dtype=dtype)

    torch.save(weights_state_dict, Path(output_path) / 'pytorch_model.bin')

    print('#' * 30)
    print(f'HF checkpoint folder successfully created at {output_path}.')

    return hf_config, hf_tokenizer


def parse_args() -> Namespace:
    """Parse commandline arguments."""
    parser = ArgumentParser(
        description=
        'Convert a HuggingFace causal LM in a Composer checkpoint into a standard HuggingFace checkpoint folder, and optionally upload to the hub.'
    )
    parser.add_argument('--composer_path', type=str, required=True)
    parser.add_argument('--hf_output_path', type=str, required=True)
    parser.add_argument('--local_checkpoint_save_location',
                        type=str,
                        default=None)
    parser.add_argument('--output_precision',
                        type=str,
                        choices=['fp32', 'fp16', 'bf16'],
                        default='fp32')
    parser.add_argument('--hf_repo_for_upload', type=str, default=None)
    parser.add_argument('--test_uploaded_model', action='store_true')

    return parser.parse_args()


def convert_composer_to_hf(args: Namespace) -> None:
    print()
    print('#' * 30)
    print('Converting Composer checkpoint to HuggingFace checkpoint format...')
|
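    # Register MPT with transformers' Auto* classes so that AutoConfig and
    # AutoModelForCausalLM can resolve model_type 'mpt', and so the custom
    # modeling code is saved alongside the checkpoint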
    from transformers.models.auto.configuration_auto import CONFIG_MAPPING
    CONFIG_MAPPING._extra_content['mpt'] = MPTConfig
    MPTConfig.register_for_auto_class()
    MPTForCausalLM.register_for_auto_class('AutoModelForCausalLM')
|
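    # The local output folder is the path component of the (possibly remote) output URI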
    _, _, local_folder_path = parse_uri(args.hf_output_path)

    config, tokenizer = write_huggingface_pretrained_from_composer_checkpoint(
        checkpoint_path=args.composer_path,
        output_path=local_folder_path,
        output_precision=args.output_precision,
        local_checkpoint_save_location=args.local_checkpoint_save_location)

    dtype = {
        'fp32': torch.float32,
        'fp16': torch.float16,
        'bf16': torch.bfloat16,
    }[args.output_precision]
|
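    # Reload the converted model to validate the export; for MPT, force the
    # torch attention implementation and CPU initialization so loading does
    # not require a GPU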
    print(f'Loading model from {local_folder_path}')
    if config.model_type == 'mpt':
        config.attn_config['attn_impl'] = 'torch'
        config.init_device = 'cpu'

    if config.model_type == 'mpt':
        loaded_hf_model = MPTForCausalLM.from_pretrained(local_folder_path,
                                                         config=config,
                                                         torch_dtype=dtype)
    else:
        loaded_hf_model = transformers.AutoModelForCausalLM.from_pretrained(
            local_folder_path, config=config, torch_dtype=dtype)
|
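    # Drop the local path from the config so it does not leak into the saved artifact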
    delattr(loaded_hf_model.config, '_name_or_path')

    loaded_hf_model.save_pretrained(local_folder_path)

    print(f'Loading tokenizer from {local_folder_path}')
    tokenizer = transformers.AutoTokenizer.from_pretrained(local_folder_path)
    tokenizer.save_pretrained(local_folder_path)
|
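    # Rewrite the bundled MPT source files so the output folder can be loaded
    # with trust_remote_code=True outside of llm-foundry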
    if config.model_type == 'mpt':
        print('Editing files for HF compatibility...')
        edit_files_for_hf_compatibility(local_folder_path)
|
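    # If the output URI points at an object store, mirror the local folder to
    # it; local_folder_path came from the URI, so it doubles as the remote
    # object name prefix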
    object_store = maybe_create_object_store_from_uri(str(args.hf_output_path))

    if object_store is not None:
        print(
            f'Uploading HF checkpoint folder from {local_folder_path} -> {args.hf_output_path}'
        )
        for file in os.listdir(local_folder_path):
            remote_file = os.path.join(local_folder_path, file)
            local_file = os.path.join(local_folder_path, file)
            object_store.upload_object(remote_file, local_file)

    if args.hf_repo_for_upload is not None:
        from huggingface_hub import HfApi
        api = HfApi()

        print(
            f'Uploading {args.hf_output_path} to HuggingFace Hub at {args.hf_repo_for_upload}'
        )
        api.create_repo(repo_id=args.hf_repo_for_upload,
                        use_auth_token=True,
                        repo_type='model',
                        private=True,
                        exist_ok=True)
        print('Repo created.')
|
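        # If the weights were saved as shards, skip the monolithic
        # pytorch_model.bin so both copies are not uploaded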
        ignore_patterns = []
        if any(
                f.startswith('pytorch_model-00001')
                for f in os.listdir(args.hf_output_path)):
            ignore_patterns.append('pytorch_model.bin')

        api.upload_folder(folder_path=args.hf_output_path,
                          repo_id=args.hf_repo_for_upload,
                          use_auth_token=True,
                          repo_type='model',
                          ignore_patterns=ignore_patterns)
        print('Folder uploaded.')
|
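        # Optionally smoke-test the upload by re-downloading the model and
        # tokenizer from the Hub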
        if args.test_uploaded_model:
            print('Testing uploaded model...')
            hub_model = transformers.AutoModelForCausalLM.from_pretrained(
                args.hf_repo_for_upload,
                trust_remote_code=True,
                use_auth_token=True,
                torch_dtype=dtype)
            hub_tokenizer = transformers.AutoTokenizer.from_pretrained(
                args.hf_repo_for_upload,
                trust_remote_code=True,
                use_auth_token=True)
|
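            # The Hub copy should match the local model in parameter count,
            # module structure, and dtype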
            assert sum(p.numel() for p in hub_model.parameters()) == sum(
                p.numel() for p in loaded_hf_model.parameters())
            assert all(
                str(type(module1)).split('.')[-2:] == str(type(module2)).split(
                    '.')[-2:] for module1, module2 in zip(
                        hub_model.modules(), loaded_hf_model.modules()))

            assert next(
                hub_model.parameters()
            ).dtype == dtype, f'Expected model dtype to be {dtype}, but got {next(hub_model.parameters()).dtype}'
|
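            # Finally, run a short generation as an end-to-end sanity check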
            print(
                hub_tokenizer.batch_decode(
                    hub_model.generate(hub_tokenizer(
                        'MosaicML is', return_tensors='pt').input_ids,
                                       max_new_tokens=10)))

    print(
        'Composer checkpoint successfully converted to HuggingFace checkpoint format.'
    )

|
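# Example invocation (paths and bucket names here are illustrative):
#   python convert_composer_to_hf.py \
#     --composer_path s3://my-bucket/my-checkpoint.pt \
#     --hf_output_path ./hf-output \
#     --output_precision bf16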
if __name__ == '__main__':
    convert_composer_to_hf(parse_args())