import logging
import os
from typing import Any, Dict

import requests
import torch
from accelerate import (
    init_empty_weights,
    infer_auto_device_map,
    load_checkpoint_and_dispatch,
)
from PIL import Image
from transformers import AutoModelForCausalLM, LlamaTokenizer, pipeline
from transformers import logging as hf_logging

logging.basicConfig(level=logging.INFO)
hf_logging.set_verbosity_debug()


def list_files(directory, depth=0, max_depth=5):
    """Recursively print every file and directory under `directory` (debug helper)."""
    for filename in os.listdir(directory):
        full_path = os.path.join(directory, filename)
        print(full_path)
        if os.path.isdir(full_path) and depth < max_depth:
            list_files(full_path, depth + 1, max_depth)
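# Example (matches the debugging call in the distributed-GPU block below):
# list_files("/root/.cache/huggingface", 0, 5)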


class EndpointHandler:
    def __init__(self, path=""):
        # Preload all the elements you are going to need at inference.
        # self.pipeline = pipeline(
        #     "text-generation", model="THUDM/cogvlm-chat-hf", trust_remote_code=True
        # )
        # self.model = AutoModelForCausalLM.from_pretrained(
        #     "THUDM/cogvlm-chat-hf", trust_remote_code=True
        # )
        self.tokenizer = LlamaTokenizer.from_pretrained("lmsys/vicuna-7b-v1.5")
        self.model = (
            AutoModelForCausalLM.from_pretrained(
                "THUDM/cogvlm-chat-hf",
                torch_dtype=torch.bfloat16,
                low_cpu_mem_usage=True,
                trust_remote_code=True,
            )
            .to("cuda")
            .eval()
        )
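        # NOTE (rough estimate, not from the original code): cogvlm-chat-hf is a
        # ~17B-parameter model, so a single-GPU bfloat16 load like the one above
        # needs on the order of 35-40 GB of VRAM (A100-class hardware). If that
        # is not available, the commented-out device_map path below shards the
        # weights across several smaller GPUs with CPU offload.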
        # DISTRIBUTED GPUS
        # with init_empty_weights():
        #     self.model = AutoModelForCausalLM.from_pretrained(
        #         "THUDM/cogvlm-chat-hf",
        #         torch_dtype=torch.bfloat16,
        #         low_cpu_mem_usage=True,
        #         trust_remote_code=True,
        #     )
        # # print("LISTING FILES IN ", "/root/.cache/huggingface")
        # # list_files("/root/.cache/huggingface", 0, 5)
        # device_map = infer_auto_device_map(
        #     self.model,
        #     max_memory={
        #         0: "12GiB",
        #         1: "12GiB",
        #         2: "12GiB",
        #         3: "12GiB",
        #         "cpu": "180GiB",
        #     },
        #     no_split_module_classes=["CogVLMDecoderLayer"],
        # )
        # self.model = load_checkpoint_and_dispatch(
        #     self.model,
        #     "/root/.cache/huggingface/hub/models--THUDM--cogvlm-chat-hf/snapshots/8abca878c4257412c4c38eeafaed3fe27a036730",
        #     device_map=device_map,
        #     no_split_module_classes=["CogVLMDecoderLayer"],
        # )
        # self.model = self.model.eval()
        ## DISTRIBUTED GPUS
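        # The sharded path above spreads the checkpoint over four 12 GiB GPUs
        # with CPU offload; no_split_module_classes keeps every
        # CogVLMDecoderLayer on a single device so a layer's weights are never
        # split across two GPUs.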

    def __call__(self, data: Dict[str, Any]) -> str:
        """
        data args:
            query (:obj:`str`): the question to ask about the image
            img_uri (:obj:`str`): URL of the image to run VQA on
        Return:
            :obj:`str`: the decoded model response, which is serialized and returned
        """
        query = data["query"]
        img_uri = data["img_uri"]
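        # Fetch the image over HTTP; stream=True lets PIL read straight from the
        # raw response body, and convert("RGB") normalizes the image mode.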
        image = Image.open(
            requests.get(
                img_uri,
                stream=True,
            ).raw
        ).convert("RGB")
        inputs = self.model.build_conversation_input_ids(
            self.tokenizer,
            query=query,
            history=[],
            images=[image],
            template_version="vqa",
        )  # vqa mode
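        # build_conversation_input_ids returns single-example tensors; add a
        # batch dimension with unsqueeze(0), and nest the image tensor as
        # [[image]] (one list of images per batch element), cast to bfloat16 to
        # match the model weights.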
        inputs = {
            "input_ids": inputs["input_ids"].unsqueeze(0).to("cuda"),
            "token_type_ids": inputs["token_type_ids"].unsqueeze(0).to("cuda"),
            "attention_mask": inputs["attention_mask"].unsqueeze(0).to("cuda"),
            "images": [[inputs["images"][0].to("cuda").to(torch.bfloat16)]],
        }
        gen_kwargs = {"max_length": 2048, "do_sample": False}
        with torch.no_grad():
            outputs = self.model.generate(**inputs, **gen_kwargs)
            print("raw generated ids: ", outputs)
            # Drop the prompt tokens so only the newly generated ids remain
            outputs = outputs[:, inputs["input_ids"].shape[1] :]
            print("generated ids without prompt: ", outputs)
            response = self.tokenizer.decode(outputs[0])
        return response
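# Reference snippet kept below (commented out): it appears to be the upstream
# CogVLM quickstart example that the handler above was adapted from.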
# query = "How many houses are there in this cartoon?"
# image = Image.open(
#     requests.get(
#         "https://github.com/THUDM/CogVLM/blob/main/examples/3.jpg?raw=true", stream=True
#     ).raw
# ).convert("RGB")
# inputs = model.build_conversation_input_ids(
#     tokenizer, query=query, history=[], images=[image], template_version="vqa"
# )  # vqa mode
# inputs = {
#     "input_ids": inputs["input_ids"].unsqueeze(0).to("cuda"),
#     "token_type_ids": inputs["token_type_ids"].unsqueeze(0).to("cuda"),
#     "attention_mask": inputs["attention_mask"].unsqueeze(0).to("cuda"),
#     "images": [[inputs["images"][0].to("cuda").to(torch.bfloat16)]],
# }
# gen_kwargs = {"max_length": 2048, "do_sample": False}
# with torch.no_grad():
#     outputs = model.generate(**inputs, **gen_kwargs)
#     outputs = outputs[:, inputs["input_ids"].shape[1] :]
#     print(tokenizer.decode(outputs[0]))
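

# Minimal local smoke test (a sketch, not part of the endpoint contract):
# assumes a CUDA GPU with enough memory for the bfloat16 model and network
# access to fetch the example image referenced above.
if __name__ == "__main__":
    handler = EndpointHandler()
    answer = handler(
        {
            "query": "How many houses are there in this cartoon?",
            "img_uri": "https://github.com/THUDM/CogVLM/blob/main/examples/3.jpg?raw=true",
        }
    )
    print(answer)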