Spaces:
Running
on
Zero
Running
on
Zero
File size: 6,691 Bytes
7385f22 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
# Copyright 2023 Haotian Liu
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ------------------------------------------------------------------------
# Modified from LLaVA (https://github.com/haotian-liu/LLaVA)
# Copyright 2024 Yanwei Li
# ------------------------------------------------------------------------
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig
from model import *
from constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto", device="cuda", use_flash_attn=False, **kwargs):
kwargs = {"device_map": device_map, **kwargs}
if device != "cuda":
kwargs['device_map'] = {"": device}
if load_8bit:
kwargs['load_in_8bit'] = True
elif load_4bit:
kwargs['load_in_4bit'] = True
kwargs['quantization_config'] = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type='nf4'
)
else:
kwargs['torch_dtype'] = torch.float16
if use_flash_attn:
kwargs['attn_implementation'] = 'flash_attention_2'
# import pdb;pdb.set_trace()
if 'mini-gemini' in model_name.lower():
# Load MiniGemini model
if model_base is not None:
# this may be mm projector only
print('Loading MiniGemini from base model...')
if "8x7b" in model_name.lower():
tokenizer = AutoTokenizer.from_pretrained(model_base)
model = MiniGeminiMixtralForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, **kwargs)
elif "2b" in model_name.lower():
tokenizer = AutoTokenizer.from_pretrained(model_base)
model = MiniGeminiGemmaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, **kwargs)
else:
tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
model = MiniGeminiLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, **kwargs)
mm_projector_weights = torch.load(os.path.join(model_path, 'mm_projector.bin'), map_location='cpu')
mm_projector_weights = {k: v.to(torch.float16) for k, v in mm_projector_weights.items()}
model.load_state_dict(mm_projector_weights, strict=False)
else:
if "8x7b" in model_name.lower():
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = MiniGeminiMixtralForCausalLM.from_pretrained(model_path, **kwargs)
elif "2b" in model_name.lower():
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = MiniGeminiGemmaForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
else:
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
model = MiniGeminiLlamaForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
if 'gemma' in model_name.lower():
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = MiniGeminiGemmaForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
elif 'vicuna' in model_name.lower() or 'unitok' in model_name.lower():
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
model = MiniGeminiLlamaForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
else:
# Load language model
if model_base is not None:
# PEFT model
from peft import PeftModel
tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, **kwargs)
print(f"Loading LoRA weights from {model_path}")
model = PeftModel.from_pretrained(model, model_path)
print(f"Merging weights")
model = model.merge_and_unload()
print('Convert to FP16...')
model.to(torch.float16)
else:
if 'mpt' in model_name.lower():
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, trust_remote_code=True, **kwargs)
else:
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
image_processor = None
# import pdb;pdb.set_trace()
# mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
# mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
# if mm_use_im_patch_token:
# tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
# if mm_use_im_start_end:
# tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
# model.resize_token_embeddings(len(tokenizer))
# vision_tower = model.get_vision_tower()
# if not vision_tower.is_loaded:
# vision_tower.load_model()
# vision_tower.to(device=device, dtype=torch.float16)
# image_processor = vision_tower.image_processor
if 'mini-gemini' in model_name.lower():
vision_tower_aux = model.get_vision_tower_aux()
if not vision_tower_aux.is_loaded:
vision_tower_aux.load_model()
vision_tower_aux.to(device=device, dtype=torch.float16)
# initialize attention modules
model.config.model_path = model_path
model.get_model().initialize_uni_modules(model.config, for_eval=True)
if hasattr(model.config, "max_sequence_length"):
context_len = model.config.max_sequence_length
else:
context_len = 2048
return tokenizer, model, image_processor, context_len |