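"""Gradio demo: medical image captioning with the eltorio/IDEFICS3_ROCO adapter on a ZeroGPU Space."""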
import gradio as gr
import spaces
import torch
from transformers import AutoProcessor, Idefics3ForConditionalGeneration
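
# Pick CUDA when available, otherwise fall back to CPU.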
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")
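
# Fine-tuned ROCO adapter and the base model it was trained from.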
model_id = "eltorio/IDEFICS3_ROCO"
base_model_path = "HuggingFaceM4/Idefics3-8B-Llama3"
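
# The processor (tokenizer + image preprocessing) comes from the base model.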
processor = AutoProcessor.from_pretrained(base_model_path, trust_remote_code=True)
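
# Load the base model in bfloat16, move it to the device, then attach the fine-tuned adapter.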
model = Idefics3ForConditionalGeneration.from_pretrained(
    base_model_path, torch_dtype=torch.bfloat16
).to(device)
model.load_adapter(model_id, device_map="auto")


@spaces.GPU
def infere(image):
    """
    Generate a description of a medical image.

    Args:
        image (PIL.Image.Image): The medical image to describe.

    Returns:
        str: The generated description.
    """
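    # Build a chat-style prompt: a system instruction plus a user turn that
    # pairs the uploaded image with a question.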
    messages = [
        {
            "role": "system",
            "content": [
                {"type": "text", "text": "You are a valuable medical doctor and you are looking at an image of your patient."},
            ],
        },
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "What do we see in this image?"},
            ],
        },
    ]
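
    # Render the chat template, then preprocess the text and image into tensors.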
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
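
    # Generate a short description and decode it back to text.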
    generated_ids = model.generate(**inputs, max_new_tokens=100)
    generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
    # batch_decode returns a list; the "text" output component expects a single string.
    return generated_texts[0]

title = f"<a href='https://huggingface.co/eltorio/IDEFICS3_ROCO'>IDEFICS3_ROCO</a>: Medical Image to Text <b>running on {device}</b>"
desc = "This model generates a description of a medical image.<br><b>Note: No affiliation with the original author. This is a ZeroGPU-enabled duplicate of <a href='https://huggingface.co/spaces/eltorio/IDEFICS3_ROCO'>spaces/eltorio/IDEFICS3_ROCO</a> to support accelerated inference. Please direct your citations and likes to the original work.</b>"
device_desc = f"This model is running on {device} 🚀." if device == 'cuda' else f"🐢 This model is running on {device}; it will be very (very) slow. If you can donate some GPU time, it will become usable 🐢. <a href='https://huggingface.co/eltorio/IDEFICS3_ROCO/discussions'>Please contact us.</a>"
long_desc = f"This demo is based on the <a href='https://huggingface.co/eltorio/IDEFICS3_ROCO'>IDEFICS3_ROCO model</a>, a multimodal model that generates text from images. It has been fine-tuned on <a href='https://huggingface.co/datasets/eltorio/ROCO-radiology'>eltorio/ROCO-radiology</a>, a dataset of medical images, and can generate descriptions of them. Try uploading a medical image and see what the model generates!<br><b>{device_desc}</b><br>© 2024 - Ronan Le Meillat"
radiotest = gr.Interface(fn=infere, inputs="image", outputs="text", title=title,
                         description=desc, article=long_desc)
radiotest.launch() |