"""Gradio app that describes radiology images with a Llama 3.2 Vision model."""

import gradio as gr
import torch
from PIL import Image
from torchvision.transforms import Resize
from transformers import AutoProcessor, MllamaForConditionalGeneration
from transformers import TextStreamer

# Define the model and processor
model_id = "0llheaven/Llama-3.2-11B-Vision-Radiology-mini"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Prompt used whenever the caller does not supply an instruction of its own.
DEFAULT_INSTRUCTION = (
    "You are an expert radiographer. "
    "Describe accurately what you see in this image."
)

# Weights are loaded 4-bit quantised with bfloat16 as the compute dtype.
model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    load_in_4bit=True,
    torch_dtype=torch.bfloat16,
    device_map=device,
)
# NOTE(review): gradient checkpointing is normally a training-time memory
# optimisation; presumably it has no effect on inference here -- confirm
# before removing.
model.gradient_checkpointing_enable()

processor = AutoProcessor.from_pretrained(model_id)


def generate_description(
    image: Image.Image,
    instruction: str = DEFAULT_INSTRUCTION,
) -> str:
    """Generate a textual description of *image* with the vision model.

    Args:
        image: The uploaded picture. ``None`` (e.g. when the upload widget
            is cleared) yields an empty description instead of an error.
        instruction: Prompt sent along with the image. Defaults to
            ``DEFAULT_INSTRUCTION``; a blank value also falls back to it.
            The default is required because the Gradio interface below
            passes only the image.

    Returns:
        The text generated by the model, without the echoed prompt or
        special tokens.
    """
    if image is None:
        return ""

    image = image.convert("RGB")
    instruction = (instruction or "").strip() or DEFAULT_INSTRUCTION

    # Create the message to pass to the model
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": instruction},
            ],
        }
    ]
    input_text = processor.apply_chat_template(
        messages, add_generation_prompt=True
    )

    # The chat template already contains the special tokens, so the
    # processor must not add them a second time.
    inputs = processor(
        image,
        input_text,
        add_special_tokens=False,
        return_tensors="pt",
    ).to(model.device)

    # Generate the output from the model
    output = model.generate(**inputs, max_new_tokens=256)

    # generate() returns prompt + continuation; keep only the new tokens so
    # the textbox shows the description rather than the chat template.
    prompt_length = inputs["input_ids"].shape[-1]
    generated_tokens = output[0][prompt_length:]
    return processor.decode(generated_tokens, skip_special_tokens=True).strip()


# Define Gradio interface
interface = gr.Interface(
    fn=generate_description,
    inputs=gr.Image(type="pil", label="Upload an Image"),
    outputs=gr.Textbox(label="Generated Description"),
    live=True,
    title="Radiology Image Description Generator",
    description="Upload an image and provide an instruction to generate a description using a vision-language model.",
)

# Launch the interface
interface.launch()