# OMG-LLaVA/xtuner/engine/hooks/hf_checkpoint_hook.py
# zhangtao-whu's picture
# Upload folder using huggingface_hub
# 476ac07 verified
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
from pathlib import Path
from typing import Optional, Union
import torch.distributed as dist
from mmengine import print_log
from mmengine._strategy import DeepSpeedStrategy
from mmengine.hooks import Hook
from mmengine.model import is_model_wrapper
from mmengine.runner import FlexibleRunner
from xtuner.registry import BUILDER
from xtuner.utils import get_origin_state_dict
# Type alias for a data batch: a dict, tuple or list, or None.
# NOTE(review): not referenced anywhere in the visible part of this file.
DATA_BATCH = Optional[Union[dict, tuple, list]]
class HFCheckpointHook(Hook):
    """Export the LLM and its tokenizer via ``save_pretrained`` after a run.

    When the run ends, the hook obtains a state dict from the model held by
    the runner's strategy, removes the ``llm.`` prefix from its keys and
    passes it to ``model.llm.save_pretrained``. A tokenizer built from
    ``runner.cfg.tokenizer`` is saved into the same directory.

    Args:
        out_dir: Directory that receives the exported files. When ``None``,
            ``<runner.work_dir>/hf_model`` is chosen in :meth:`after_run`.
    """

    priority = 95  # lower than CheckpointHook in MMEngine

    def __init__(self, out_dir: Optional[Union[str, Path]] = None) -> None:
        self.out_dir = out_dir

    @staticmethod
    def _use_shard_moe(llm):
        """Return whether ``llm.config.moe_implementation`` equals 'shard'.

        A config without that attribute is treated as ``'origin'``.
        """
        config = llm.config
        moe_implementation = getattr(config, 'moe_implementation', 'origin')
        return moe_implementation == 'shard'

    @staticmethod
    def _strip_llm_prefix(state_dict, prefix='llm.'):
        """Remove ``prefix`` from the keys of ``state_dict`` in place.

        Only keys that really start with ``prefix`` are renamed. Slicing a
        fixed number of characters off every key would silently corrupt any
        key that lacks the prefix, so such keys are kept under their original
        name instead. Every entry is popped and re-inserted so the relative
        key order is preserved.

        Args:
            state_dict: Mutable mapping of parameter names to values.
            prefix: Leading string to remove from the keys.

        Returns:
            list: The keys that did not start with ``prefix``.
        """
        unprefixed = []
        for key in list(state_dict.keys()):
            value = state_dict.pop(key)
            if key.startswith(prefix):
                key = key[len(prefix):]
            else:
                unprefixed.append(key)
            state_dict[key] = value
        return unprefixed

    def after_run(self, runner) -> None:
        """Save the LLM weights and the tokenizer once the run is finished.

        The state dict is collected on every rank; writing to disk happens
        only when ``torch.distributed`` is not initialized or on rank 0.

        Args:
            runner: Runner that executed the run. It must be a
                ``FlexibleRunner`` whose strategy is a ``DeepSpeedStrategy``.

        Raises:
            AssertionError: If the runner or its strategy has an unexpected
                type, or if the weights are partitioned while
                ``gather_16bit_weights_on_model_save`` is not enabled.
        """
        assert isinstance(runner,
                          FlexibleRunner), 'Runner should be `FlexibleRunner`'
        assert isinstance(
            runner.strategy,
            DeepSpeedStrategy), 'Strategy should be `DeepSpeedStrategy`'

        if self.out_dir is None:
            self.out_dir = osp.join(runner.work_dir, 'hf_model')

        wrapped_model = runner.strategy.model
        if wrapped_model.zero_optimization_partition_weights():
            # Partitioned weights have to be consolidated before saving.
            assert wrapped_model.zero_gather_16bit_weights_on_model_save(), \
                ('Please set `gather_16bit_weights_on_model_save=True` '
                 'in your DeepSpeed config.')
            state_dict = wrapped_model._zero3_consolidated_16bit_state_dict()
        else:
            state_dict = wrapped_model.module_state_dict(
                exclude_frozen_parameters=runner.strategy.
                exclude_frozen_parameters)

        model = runner.model
        if is_model_wrapper(model):
            model = model.module
        llm = model.llm

        # Only a single process writes the exported files.
        if dist.is_initialized() and dist.get_rank() != 0:
            return

        # Keys in state_dict are expected to be prefixed with 'llm.'; the
        # prefix is removed so the names match those of `llm` itself.
        unprefixed = self._strip_llm_prefix(state_dict)
        if unprefixed:
            print_log(f'{len(unprefixed)} state_dict keys are not prefixed '
                      f"with 'llm.' and are kept unchanged: {unprefixed}")

        if self._use_shard_moe(llm):
            print_log('recover the origin state_dict from merged one ...')
            state_dict = get_origin_state_dict(state_dict, llm)

        print_log(f'Saving LLM to {self.out_dir}')
        llm.save_pretrained(self.out_dir, state_dict=state_dict)

        print_log(f'Saving LLM tokenizer to {self.out_dir}')
        tokenizer = BUILDER.build(runner.cfg.tokenizer)
        tokenizer.save_pretrained(self.out_dir)