# Spaces: Running
import gradio as gr | |
from transformers import AutoProcessor, Idefics3ForConditionalGeneration, image_utils | |
import torch | |
# Run on the GPU when one is visible to torch, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Adapter repository applied on top of the base checkpoint below.
model_id = "eltorio/IDEFICS3_ROCO"
# Base checkpoint; may be replaced with a local path.
base_model_path = "HuggingFaceM4/Idefics3-8B-Llama3"

# The processor (chat template, tokenization, image preprocessing) is taken
# from the base checkpoint.
processor = AutoProcessor.from_pretrained(base_model_path)

# Load the base weights in bfloat16, move them to the selected device, then
# load the adapter published under `model_id` onto the model.
model = Idefics3ForConditionalGeneration.from_pretrained(
    base_model_path,
    torch_dtype=torch.bfloat16,
)
model = model.to(device)
model.load_adapter(model_id)
def infere(image):
    """Answer "What do we see in this image?" for a single image.

    Builds a one-turn chat prompt containing an image placeholder and the
    fixed question, runs generation on the module-level ``model`` and
    decodes the result with the module-level ``processor``.

    Args:
        image: The image delivered by the Gradio ``"image"`` input.

    Returns:
        The generated answer as a single string, without the echoed prompt.

    Raises:
        gr.Error: If no image was provided (``image`` is ``None``).
    """
    if image is None:
        # Submitting the form without an image yields None; report that
        # clearly instead of failing deep inside the processor.
        raise gr.Error("Please provide an image.")

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "What do we see in this image?"},
            ],
        },
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    generated_ids = model.generate(**inputs, max_new_tokens=8192)

    # For decoder-only models, generate() returns the prompt tokens followed
    # by the continuation; keep only the newly generated part so the answer
    # is not prefixed with the question.
    prompt_length = inputs["input_ids"].shape[1]
    new_token_ids = generated_ids[:, prompt_length:]
    generated_texts = processor.batch_decode(new_token_ids, skip_special_tokens=True)

    # A single image was submitted, so there is exactly one decoded sequence.
    # Return it as a plain string: the "text" output would otherwise display
    # the list's repr (brackets, quotes, escaped newlines).
    return generated_texts[0].strip()
# Expose `infere` as a web UI with one image input and one text output,
# then start serving it.
demo = gr.Interface(
    fn=infere,
    inputs="image",
    outputs="text",
)
demo.launch()