File size: 4,416 Bytes
1d6cff4 ebf3ae4 d96f8ab 1d6cff4 d96f8ab 1d6cff4 4784163 1d6cff4 ebf3ae4 1d6cff4 ebf3ae4 1d6cff4 ebf3ae4 1d6cff4 d96f8ab ebf3ae4 1d6cff4 ebf3ae4 a64397f 1d6cff4 a64397f ebf3ae4 1d6cff4 ebf3ae4 1d6cff4 ebf3ae4 1d6cff4 ebf3ae4 1d6cff4 a64397f 1d6cff4 ebf3ae4 1d6cff4 ebf3ae4 1d6cff4 3d34438 1d6cff4 d96f8ab 1d6cff4 0ec0460 1d6cff4 a315cb3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 |
# Copyright 2024 Ronan Le Meillat
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Import necessary libraries
import gradio as gr
from transformers import AutoProcessor, Idefics3ForConditionalGeneration, image_utils
import torch
import spaces
# Pick the compute device: CUDA when torch reports it as available, CPU otherwise
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")  # Report the selected device on stdout

# Hub identifiers: the base checkpoint and the adapter repository applied on top of it
base_model_path = "HuggingFaceM4/Idefics3-8B-Llama3"  # or change to local path
model_id = "eltorio/IDEFICS3_ROCO"

# The processor is loaded from the base checkpoint, not from the adapter repository
processor = AutoProcessor.from_pretrained(base_model_path, trust_remote_code=True)

# Load the base model with torch_dtype=bfloat16, then move it to the selected device
model = Idefics3ForConditionalGeneration.from_pretrained(
    base_model_path,
    torch_dtype=torch.bfloat16,
)
model = model.to(device)

# Load the adapter published under model_id into the base model
model.load_adapter(model_id, device_map="auto")
# Define a function to infer a description from an image
@spaces.GPU
def infere(image):
    """
    Generate a description of a medical image.

    Builds a two-turn conversation (a system instruction plus a user turn
    holding an image placeholder and a fixed question), renders it with the
    processor's chat template, runs generation with at most 100 new tokens,
    and decodes the result with special tokens skipped.

    Args:
    - image: The medical image to describe. It is handed to the processor
      unchanged as a one-element list. Originally documented as a PIL Image;
      NOTE(review): confirm what type the Gradio "image" input delivers.

    Returns:
    - List[str]: The decoded sequences returned by ``processor.batch_decode``.
      NOTE(review): the full output of ``generate`` is decoded, so the text
      may also contain the prompt — confirm if only the answer is wanted.
    """
    # Conversation turns consumed by the chat template
    system_turn = {
        "role": "system",
        "content": [
            {"type": "text", "text": "You are a valuable medical doctor and you are looking at an image of your patient."},
        ],
    }
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "What do we see in this image?"},
        ],
    }
    conversation = [system_turn, user_turn]

    # Render the conversation to a prompt string ending with a generation prompt
    chat_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

    # Encode prompt and image together as PyTorch tensors
    encoded = processor(text=chat_prompt, images=[image], return_tensors="pt")

    # Move every encoded entry to the device selected at module level
    model_inputs = {}
    for key, tensor in encoded.items():
        model_inputs[key] = tensor.to(device)

    # Generate, then decode the produced token IDs back to text
    output_ids = model.generate(**model_inputs, max_new_tokens=100)
    return processor.batch_decode(output_ids, skip_special_tokens=True)
# Title, short description, and device notice shown in the Gradio interface.
# These strings contain HTML markup (links, <b>, <br>) and are passed to gr.Interface below.
title = f"<a href='https://huggingface.co/eltorio/IDEFICS3_ROCO'>IDEFICS3_ROCO</a>: Medical Image to Text <b>running on {device}</b>"
desc = "This model generates a description of a medical image.<br><b>Note: No affiliation with original author. This is a ZeroGPU-enabled duplicate of <a href='https://huggingface.co/spaces/eltorio/IDEFICS3_ROCO'>spaces/eltorio/IDEFICS3_ROCO</a> to support accelerated inference. Please direct your citations and likes to the original work.</b>"
# Device notice: a short message on CUDA, a slow-inference warning with a contact link otherwise
device_desc = f"This model is running on {device} 🚀." if device == 'cuda' else f"🐢 This model is running on {device} it will be very (very) slow. If you can donate some GPU time it will be usable 🐢. <a href='https://huggingface.co/eltorio/IDEFICS3_ROCO/discussions'>Please contact us.</a>"

# Long description passed as the interface's article; it embeds the device notice
long_desc = f"This demo is based on the <a href='https://huggingface.co/eltorio/IDEFICS3_ROCO'>IDEFICS3_ROCO model</a>, which is a multimodal model that can generate text from images. It has been fine-tuned on <a href='https://huggingface.co/datasets/eltorio/ROCO-radiology'>eltorio/ROCO-radiology</a> a dataset of medical images and can generate descriptions of medical images. Try uploading an image of a medical image and see what the model generates!<br><b>{device_desc}</b><br> 2024 - Ronan Le Meillat"

# Build the interface: one image input, one text output, served by infere
radiotest = gr.Interface(
    fn=infere,
    inputs="image",
    outputs="text",
    title=title,
    description=desc,
    article=long_desc,
)

# Start the Gradio app. launch() is called with its defaults (share=True is not passed).
# The stray trailing "|" after this call was removed: it left a dangling binary
# operator, which is a SyntaxError.
radiotest.launch()