Usage Example

import requests
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor, BitsAndBytesConfig


def get_image_description(model, processor, image, initial_prompt="", max_new_tokens=70):
    # Fall back to a generic prompt when none is supplied.
    if not initial_prompt:
        initial_prompt = "How would you describe the contents of this photo?"
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": initial_prompt}
        ]}
    ]
    input_text = processor.apply_chat_template(
        messages, add_generation_prompt=True)
    inputs = processor(
        image,
        input_text,
        add_special_tokens=False,
        return_tensors="pt"
    ).to(model.device)
    output = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return processor.decode(output[0])


def load_model(model_id="belkhir-nacim/l32vision_instruct"):
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,  # Enable 4-bit quantization
    )
    model = MllamaForConditionalGeneration.from_pretrained(
        model_id, device_map="auto", quantization_config=bnb_config)
    processor = AutoProcessor.from_pretrained(model_id)
    return model, processor


model, processor = load_model()

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
image = Image.open(requests.get(url, stream=True).raw)
result = get_image_description(
    model, processor, image, initial_prompt="Tell me what you see in the image. Use keywords to describe it.")
print(result)