lorocksUMD committed
Commit
a3de5d2
verified · 1 Parent(s): 43605d2

Create multi_script.py

Files changed (1): multi_script.py +168 -0
multi_script.py ADDED
@@ -0,0 +1,168 @@
+ import gradio as gr
+ from huggingface_hub import InferenceClient
+
+ from transformers import AutoTokenizer
+ from llava.model.language_model.llava_mistral import LlavaMistralForCausalLM
+ from llava.model.builder import load_pretrained_model
+ from llava.mm_utils import (
+     process_images,
+     tokenizer_image_token,
+     get_model_name_from_path,
+ )
+ from llava.constants import (
+     IMAGE_TOKEN_INDEX,
+     DEFAULT_IMAGE_TOKEN,
+     DEFAULT_IM_START_TOKEN,
+     DEFAULT_IM_END_TOKEN,
+     IMAGE_PLACEHOLDER,
+ )
+ from llava.conversation import conv_templates, SeparatorStyle
+
+ import argparse
+ import torch
+ import requests
+ from PIL import Image
+ from io import BytesIO
+ import re
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model-path", type=str, default="liuhaotian/llava-v1.6-mistral-7b")
+ parser.add_argument("--image-file", type=str, required=True)
+ parser.add_argument("--inference-type", type=str, default="auto")
+ parser.add_argument("--prompt", type=str, default="Explain this image")
+ cmd_args = parser.parse_args()
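+ # Example invocation (illustrative values, assuming a local image file):
+ #   python multi_script.py --image-file ./sample.jpg --inference-type auto --prompt "Explain this image"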
+
+ # Uncomment the .cuda() call on input_ids (further down) to run on GPU
+
+ # device = "cpu"
+ device = cmd_args.inference_type
+
+ prompt = cmd_args.prompt
+ image_file = cmd_args.image_file
+
+ model_path = cmd_args.model_path
+
+
+
+ # Functions for inference
+ def image_parser(args):
+     out = args.image_file.split(args.sep)
+     return out
+
+
+ def load_image(image_file):
+     if image_file.startswith("http") or image_file.startswith("https"):
+         response = requests.get(image_file)
+         image = Image.open(BytesIO(response.content)).convert("RGB")
+     else:
+         image = Image.open(image_file).convert("RGB")
+     return image
+
+
+ def load_images(image_files):
+     out = []
+     for image_file in image_files:
+         image = load_image(image_file)
+         out.append(image)
+     return out
+
+
+ model_name = get_model_name_from_path(model_path)
+
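+ # Collect the generation settings in a lightweight namespace object, mirroring the
+ # argument object that LLaVA's eval utilities expect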
+ args = type('Args', (), {
+     "model_path": model_path,
+     "model_base": None,
+     "model_name": model_name,
+     "query": prompt,
+     "conv_mode": None,
+     "image_file": image_file,
+     "sep": ",",
+     "temperature": 0,
+     "top_p": None,
+     "num_beams": 1,
+     "max_new_tokens": 512
+ })()
+
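+ # Load the tokenizer, LLaVA weights, and vision tower; device placement follows --inference-type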
+ tokenizer, model, image_processor, context_len = load_pretrained_model(
+     model_path, None, model_name, device_map=device
+ )
+
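+ # Insert the image token into the prompt, honouring the model's mm_use_im_start_end setting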
+ qs = args.query
+ image_token_se = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
+ if IMAGE_PLACEHOLDER in qs:
+     if model.config.mm_use_im_start_end:
+         qs = re.sub(IMAGE_PLACEHOLDER, image_token_se, qs)
+     else:
+         qs = re.sub(IMAGE_PLACEHOLDER, DEFAULT_IMAGE_TOKEN, qs)
+ else:
+     if model.config.mm_use_im_start_end:
+         qs = image_token_se + "\n" + qs
+     else:
+         qs = DEFAULT_IMAGE_TOKEN + "\n" + qs
+
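+ # Pick the conversation template from the model name (mistral -> mistral_instruct here)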
+ if "llama-2" in model_name.lower():
+     conv_mode = "llava_llama_2"
+ elif "mistral" in model_name.lower():
+     conv_mode = "mistral_instruct"
+ elif "v1.6-34b" in model_name.lower():
+     conv_mode = "chatml_direct"
+ elif "v1" in model_name.lower():
+     conv_mode = "llava_v1"
+ elif "mpt" in model_name.lower():
+     conv_mode = "mpt"
+ else:
+     conv_mode = "llava_v0"
+
+ if args.conv_mode is not None and conv_mode != args.conv_mode:
+     print(
+         "[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}".format(
+             conv_mode, args.conv_mode, args.conv_mode
+         )
+     )
+ else:
+     args.conv_mode = conv_mode
+
+ conv = conv_templates[args.conv_mode].copy()
+ conv.append_message(conv.roles[0], qs)
+ conv.append_message(conv.roles[1], None)
+ prompt = conv.get_prompt()
+
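+ # Load the image(s), record their sizes, and preprocess them into a float16 tensor on the model's device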
+ image_files = image_parser(args)
+ images = load_images(image_files)
+ image_sizes = [x.size for x in images]
+ images_tensor = process_images(
+     images,
+     image_processor,
+     model.config
+ ).to(model.device, dtype=torch.float16)
+
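+ # Tokenize the prompt, mapping the image token to IMAGE_TOKEN_INDEX, and add a batch dimension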
+ input_ids = (
+     tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
+     .unsqueeze(0)
+     # .cuda()
+ )
+
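+ # Generate the answer; with temperature 0 this is greedy decoding (do_sample=False)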
+ with torch.inference_mode():
+     output_ids = model.generate(
+         input_ids,
+         images=images_tensor,
+         image_sizes=image_sizes,
+         do_sample=True if args.temperature > 0 else False,
+         temperature=args.temperature,
+         top_p=args.top_p,
+         num_beams=args.num_beams,
+         max_new_tokens=args.max_new_tokens,
+         use_cache=True,
+     )
+
+ outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
+
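+ # Simple per-dataset checks on the decoded answer, keyed on the image path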
+ if "dataset1" in image_file:
+     print("Num of words: ", len(outputs.split()))
+ elif "dataset2" in image_file:
+     print()
+ else:
+     print("Is single word?", len(outputs.split()) == 1)
+
+ print(outputs)
+ # End Llava inference