Sample Script Here

#6
by ctranslate2-4you - opened

I hate it when repo owners don't give detailed examples, so here you go, people. The vision capabilities were actually pretty good:

from transformers import LlavaNextForConditionalGeneration, LlavaNextProcessor, BitsAndBytesConfig
import torch
from PIL import Image

def process_image(image_path):
    # Use a raw string for a Windows path; a repository ID doesn't need one
    model_id = r"[PATH TO LOCAL DIRECTORY ON YOUR COMPUTER, OR THE REPOSITORY ID]"
    # 4-bit NF4 quantization to cut VRAM usage; compute runs in bfloat16
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    # instantiate model
    model = LlavaNextForConditionalGeneration.from_pretrained(
        model_id,
        quantization_config=quantization_config,
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        device_map="auto",
        trust_remote_code=True
    )

    # instantiate processor
    processor = LlavaNextProcessor.from_pretrained(model_id, tokenizer_class='PreTrainedTokenizerFast', trust_remote_code=True)

    image = Image.open(image_path)
    instruction = "Describe this image in as much detail as possible, but be succinct and don't repeat yourself."
    prompt = f"User:<image>\n{instruction} Falcon:"
    inputs = processor(text=prompt, images=image, return_tensors="pt", padding=True).to("cuda:0")

    output = model.generate(**inputs, max_new_tokens=512)

    # Decode only the newly generated tokens, skipping the prompt
    prompt_length = inputs['input_ids'].shape[1]
    model_response = processor.decode(output[0][prompt_length:], skip_special_tokens=True).strip()

    print(f"\n{model_response}\n")

if __name__ == "__main__":
    input_image_path = r"[PATH TO A LOCAL FILE ON YOUR COMPUTER]"
    process_image(input_image_path)
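
For what it's worth, the script pulls in torch, transformers, accelerate (needed for device_map="auto"), bitsandbytes (for the 4-bit quantization), and Pillow. I haven't pinned exact versions, but a reasonably recent install along these lines should work:

pip install torch transformers accelerate bitsandbytes pillow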

Could you please provide information on which Python version and which package requirements are needed? Could you also upload those details?
Thanks, Mirosalv
