import gradio as gr from transformers import AutoProcessor, Idefics3ForConditionalGeneration, image_utils import torch device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') print(f"Using device: {device}") model_id="eltorio/IDEFICS3_ROCO" # model = AutoModelForImageTextToText.from_pretrained(model_id).to(device) base_model_path="HuggingFaceM4/Idefics3-8B-Llama3" #or change to local path processor = AutoProcessor.from_pretrained(base_model_path, trust_remote_code=True) model = Idefics3ForConditionalGeneration.from_pretrained( base_model_path, torch_dtype=torch.bfloat16 ).to(device) model.load_adapter(model_id,device_map="auto") def infere(image): messages = [ { "role": "system", "content": [ {"type": "text", "text": "You are a valuable medical doctor and you are looking at an image of your patient."}, ] }, { "role": "user", "content": [ {"type": "image"}, {"type": "text", "text": "What do we see in this image?"}, ] }, ] prompt = processor.apply_chat_template(messages, add_generation_prompt=True) inputs = processor(text=prompt, images=[image], return_tensors="pt") # print(f"inputs: {inputs}") inputs = {k: v.to(device) for k, v in inputs.items()} generated_ids = model.generate(**inputs, max_new_tokens=100) generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True) return generated_texts radiotest = gr.Interface(fn=infere, inputs="image", outputs="text") radiotest.launch()