import spaces
import gradio as gr
import torch
from PIL import Image
from transformers import AutoModelForImageTextToText, AutoProcessor, TextStreamer

# Fine-tuned Llama 3.2 Vision checkpoint for radiology report generation
model_id = "0llheaven/Llama-3.2-11B-Vision-Radiology-mini"

# Load the model once at startup; device_map="auto" places it on GPU when available
model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    # load_in_4bit=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
# model.gradient_checkpointing_enable()  # only useful for training; not needed for inference

# The processor handles both image preprocessing and chat-template tokenization
processor = AutoProcessor.from_pretrained(model_id)

DEFAULT_INSTRUCTION = (
    "You are an expert radiographer. Describe accurately what you see in this image."
)


# Process the uploaded image and generate a description
@spaces.GPU(duration=120)
def generate_description(image: Image.Image, instruction: str) -> str:
    image = image.convert("RGB")
    # image = Resize((224, 224))(image)  # optional downscaling; the processor resizes internally

    # Fall back to the default radiology prompt if no instruction is given
    instruction = (instruction or "").strip() or DEFAULT_INSTRUCTION

    # Build the chat message containing the image placeholder and the text instruction
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": instruction},
        ]}
    ]

    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(
        image,
        input_text,
        add_special_tokens=False,
        return_tensors="pt",
    ).to(model.device)

    # Stream tokens to the console while generating
    text_streamer = TextStreamer(processor.tokenizer, skip_prompt=True)
    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            streamer=text_streamer,
            max_new_tokens=256,
            use_cache=True,
            do_sample=True,
            temperature=1.5,
            min_p=0.1,
        )

    # Decode only the newly generated tokens, skipping the prompt
    generated_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    return processor.decode(generated_tokens, skip_special_tokens=True).strip()


# Define the Gradio interface
interface = gr.Interface(
    fn=generate_description,
    inputs=[
        gr.Image(type="pil", label="Upload an Image"),
        gr.Textbox(label="Instruction", value=DEFAULT_INSTRUCTION),
    ],
    outputs=gr.Textbox(label="Generated Description"),
    # live=True,
    title="Radiology Image Description Generator",
    description="Upload an image and provide an instruction to generate a description using a vision-language model.",
)

# Launch the interface
interface.launch()