# Copyright 2024 Ronan Le Meillat
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Import necessary libraries
import gradio as gr
from transformers import AutoProcessor, Idefics3ForConditionalGeneration, image_utils
import torch
import spaces
# Select the compute device: CUDA when torch reports one as available, CPU otherwise
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")  # Report the selected device on stdout
# Identifier of the adapter, and the base checkpoint it is loaded on top of
model_id = "eltorio/IDEFICS3_ROCO"
base_model_path = "HuggingFaceM4/Idefics3-8B-Llama3"  # or change to local path
# The processor comes from the base checkpoint
processor = AutoProcessor.from_pretrained(base_model_path, trust_remote_code=True)
# Load the base model with bfloat16 as torch dtype, then move it to the selected device
model = Idefics3ForConditionalGeneration.from_pretrained(
    base_model_path,
    torch_dtype=torch.bfloat16,
)
model = model.to(device)
# Load the adapter identified by model_id, requesting automatic device mapping
model.load_adapter(model_id, device_map="auto")
# Define a function to infer a description from an image
@spaces.GPU
def infere(image):
    """
    Generate a description of a medical image.

    Args:
    - image: The medical image to describe. It is handed unchanged to the
      processor. NOTE(review): the Gradio "image" input presumably delivers
      a numpy array rather than a PIL image -- confirm. Must not be None.

    Returns:
    - generated_texts (List[str]): The decoded output of the model, one
      string per generated sequence.

    Raises:
    - gr.Error: If no image was provided.
    """
    # The image can be None (e.g. the form was submitted without an upload).
    # Fail early with a readable message instead of an opaque error raised
    # from inside the processor.
    if image is None:
        raise gr.Error("Please provide an image to describe.")
    # Conversation sent to the model: a system instruction, then a user turn
    # made of an image placeholder followed by the question
    messages = [
        {
            "role": "system",
            "content": [
                {"type": "text", "text": "You are a valuable medical doctor and you are looking at an image of your patient."},
            ]
        },
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "What do we see in this image?"},
            ]
        },
    ]
    # Render the conversation to a prompt, with the generation prompt appended
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    # Preprocess the prompt together with the image into PyTorch tensors
    inputs = processor(text=prompt, images=[image], return_tensors="pt")
    # Move every input tensor to the device the model was placed on
    inputs = {k: v.to(device) for k, v in inputs.items()}
    # Generate at most 100 new tokens
    generated_ids = model.generate(**inputs, max_new_tokens=100)
    # Decode the generated IDs into text, dropping special tokens
    generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return generated_texts
# Define the title, description, and device description for the Gradio interface
title = f"IDEFICS3_ROCO: Medical Image to Text running on {device}"
desc = "This model generates a description of a medical image."
if device == 'cuda':
    device_desc = f"This model is running on {device} 🚀."
else:
    device_desc = (
        f"🐢 This model is running on {device} it will be very (very) slow. "
        "If you can donate some GPU time it will be usable 🐢. Please contact us."
    )
# Define the long description for the Gradio interface.
# A single-quoted string literal cannot contain raw line breaks, so the three
# lines of the text are joined with explicit newline characters.
long_desc = "\n".join([
    "This demo is based on the IDEFICS3_ROCO model, which is a multimodal model that can generate text from images. It has been fine-tuned on eltorio/ROCO-radiology a dataset of medical images and can generate descriptions of medical images. Try uploading an image of a medical image and see what the model generates!",
    device_desc,
    "2024 - Ronan Le Meillat",
])
# Create a Gradio interface with the infere function and specified title and descriptions
radiotest = gr.Interface(
    fn=infere,
    inputs="image",
    outputs="text",
    title=title,
    description=desc,
    article=long_desc,
)
# Launch the Gradio interface
radiotest.launch()