# Copyright 2024 Ronan Le Meillat
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0

# Import necessary libraries
import gradio as gr
from transformers import AutoProcessor, Idefics3ForConditionalGeneration, image_utils
import torch
import spaces

# Determine the device (GPU or CPU) to run the model on
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")  # Log the device being used

# Define the adapter ID and the base model path
model_id = "eltorio/IDEFICS3_ROCO"
base_model_path = "HuggingFaceM4/Idefics3-8B-Llama3"  # or change to local path

# Initialize the processor from the base model path
processor = AutoProcessor.from_pretrained(base_model_path, trust_remote_code=True)

# Initialize the model from the base model path with bfloat16 weights,
# then move it to the selected device
model = Idefics3ForConditionalGeneration.from_pretrained(
    base_model_path,
    torch_dtype=torch.bfloat16,
).to(device)

# Load the fine-tuned adapter on top of the base model
model.load_adapter(model_id, device_map="auto")


# Define a function to infer a description from an image
@spaces.GPU
def infere(image):
    """
    Generate a description of a medical image.

    Args:
    - image: The medical image to describe, as delivered by the Gradio
      "image" input component (exact type depends on Gradio's defaults —
      TODO confirm; the processor is handed the value unchanged).

    Returns:
    - generated_texts (List[str]): The decoded output sequences, one per
      generated sequence (a single entry for this single-image call).

    Raises:
    - gr.Error: If no image was provided.
    """
    # Submitting the form without an image yields None; report it clearly
    # instead of failing deep inside the processor.
    if image is None:
        raise gr.Error("Please upload an image first.")

    # Define the conversation the model has to answer; the {"type": "image"}
    # entry is the placeholder for the image passed to the processor below
    messages = [
        {
            "role": "system",
            "content": [
                {"type": "text", "text": "You are a valuable medical doctor and you are looking at an image of your patient."},
            ]
        },
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "What do we see in this image?"},
            ]
        },
    ]

    # Apply the chat template and add a generation prompt
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)

    # Preprocess the input image and text
    inputs = processor(text=prompt, images=[image], return_tensors="pt")

    # Move the inputs to the specified device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Generate a description with the model (at most 100 new tokens)
    generated_ids = model.generate(**inputs, max_new_tokens=100)

    # Decode the generated IDs into text
    generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

    return generated_texts


# Define the title, description, and device description for the Gradio interface
title = f"IDEFICS3_ROCO: Medical Image to Text running on {device}"
desc = "This model generates a description of a medical image."
device_desc = (
    f"This model is running on {device} 🚀."
    if device == 'cuda'
    else f"🐢 This model is running on {device} it will be very (very) slow. If you can donate some GPU time it will be usable 🐢. Please contact us."
)

# Define the long description for the Gradio interface.
# The three parts are joined with explicit "\n" because a single-quoted
# string literal cannot span several source lines.
long_desc = (
    "This demo is based on the IDEFICS3_ROCO model, which is a multimodal model that can generate text from images. "
    "It has been fine-tuned on eltorio/ROCO-radiology a dataset of medical images and can generate descriptions of medical images. "
    "Try uploading an image of a medical image and see what the model generates!\n"
    f"{device_desc}\n"
    "2024 - Ronan Le Meillat"
)

# Create a Gradio interface with the infere function and specified title and descriptions
radiotest = gr.Interface(
    fn=infere,
    inputs="image",
    outputs="text",
    title=title,
    description=desc,
    article=long_desc,
)

# Launch the Gradio interface (no share=True is passed, so no public share link is requested)
radiotest.launch()