# Copyright 2024 Ronan Le Meillat
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
# Import necessary libraries
import gradio as gr
from transformers import AutoProcessor, Idefics3ForConditionalGeneration
import torch
import spaces

# Determine the device (GPU or CPU) to run the model on
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")  # Log the device being used

# Define the model ID and base model path
model_id = "eltorio/IDEFICS3_ROCO"
base_model_path = "HuggingFaceM4/Idefics3-8B-Llama3"  # or change to local path

# Initialize the processor from the base model path
processor = AutoProcessor.from_pretrained(base_model_path, trust_remote_code=True)

# Initialize the model from the base model path and set the torch dtype to bfloat16
model = Idefics3ForConditionalGeneration.from_pretrained(
    base_model_path, torch_dtype=torch.bfloat16
).to(device)  # Move the model to the specified device
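# Note: bfloat16 is primarily a GPU dtype; when device == 'cpu', loading with
# torch_dtype=torch.float32 may be a safer (if more memory-hungry) choice.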

# Load the adapter from the model ID and automatically map it to the device
model.load_adapter(model_id, device_map="auto")
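# Note: load_adapter comes from the transformers PEFT integration, so the peft
# package must be installed alongside transformers for this call to work.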

# Define a function to infer a description from an image
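# The @spaces.GPU decorator requests ZeroGPU hardware for the duration of each
# call when running on Hugging Face Spaces (it is designed to be a no-op elsewhere).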
@spaces.GPU
def infere(image):
    """
    Generate a description of a medical image.

    Args:
    - image (PIL Image): The medical image to describe.

    Returns:
    - generated_texts (List[str]): A list containing the generated description.
    """

    # Define a chat template for the model to respond to
    messages = [
        {
            "role": "system",
            "content": [
                {"type": "text", "text": "You are a valuable medical doctor and you are looking at an image of your patient."},
            ]
        },
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "What do we see in this image?"},
            ]
        },
    ]

    # Apply the chat template and add a generation prompt
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
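# At this point `prompt` is a plain string: the conversation rendered with the
# processor's chat template (roughly "System: ...\nUser:<image>...\nAssistant:"),
# ending with the assistant turn so generation continues from there.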

    # Preprocess the input image and text
    inputs = processor(text=prompt, images=[image], return_tensors="pt")

    # Move the inputs to the specified device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Generate a description with the model
    generated_ids = model.generate(**inputs, max_new_tokens=100)

    # Decode the generated IDs into text
    generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
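    # Note: generate() returns the prompt tokens followed by the newly generated
    # ones, so the decoded text also contains the templated conversation. To keep
    # only the new part, one could decode a slice instead (optional):
    # new_tokens = generated_ids[:, inputs["input_ids"].shape[1]:]
    # generated_texts = processor.batch_decode(new_tokens, skip_special_tokens=True)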

    return generated_texts
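
# A minimal local smoke test (hypothetical image path), handy when debugging
# outside the Gradio UI; uncomment to run:
# from PIL import Image
# sample = Image.open("sample_radiograph.png")  # hypothetical example file
# print(infere(sample))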

# Define the title, description, and device description for the Gradio interface
title = f"<a href='https://huggingface.co/eltorio/IDEFICS3_ROCO'>IDEFICS3_ROCO</a>: Medical Image to Text <b>running on {device}</b>"
desc = "This model generates a description of a medical image.<br><b>Note: No affiliation with original author. This is a ZeroGPU-enabled duplicate of <a href='https://huggingface.co/spaces/eltorio/IDEFICS3_ROCO'>spaces/eltorio/IDEFICS3_ROCO</a> to support accelerated inference. Please direct your citations and likes to the original work.</b>"

device_desc = f"This model is running on {device} 🚀." if device == 'cuda' else f"🐢 This model is running on {device}; it will be very (very) slow. If you can donate some GPU time, it will be usable 🐢. <a href='https://huggingface.co/eltorio/IDEFICS3_ROCO/discussions'>Please contact us.</a>"

# Define the long description for the Gradio interface
long_desc = f"This demo is based on the <a href='https://huggingface.co/eltorio/IDEFICS3_ROCO'>IDEFICS3_ROCO model</a>, a multimodal model that generates text from images. It has been fine-tuned on <a href='https://huggingface.co/datasets/eltorio/ROCO-radiology'>eltorio/ROCO-radiology</a>, a dataset of medical images, and can generate descriptions of them. Try uploading a medical image and see what the model generates!<br><b>{device_desc}</b><br> 2024 - Ronan Le Meillat"

# Create a Gradio interface with the infere function and specified title and descriptions
radiotest = gr.Interface(fn=infere, inputs="image", outputs="text", title=title, 
                description=desc, article=long_desc)

# Launch the Gradio interface
radiotest.launch()
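
# Note: when running outside Hugging Face Spaces, launch(share=True) could be
# used instead to create a temporary public link.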