# Gradio Space: radiology image description with a fine-tuned Llama-3.2-11B-Vision model.
import spaces
import gradio as gr
import torch
from PIL import Image
from transformers import AutoModelForImageTextToText, AutoProcessor, TextStreamer
from torchvision.transforms import Resize  # only needed if the optional resize below is enabled
from unsloth import FastVisionModel

# Define the model and processor
model_id = "0llheaven/Llama-3.2-11B-Vision-Radiology-mini"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Use bfloat16 on GPU and float32 on CPU; let accelerate place the weights when a GPU is available.
model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    # load_in_4bit=True,
    torch_dtype=torch.float32 if device.type == "cpu" else torch.bfloat16,
    device_map="auto" if device.type == "cuda" else None,
)
if device.type == "cpu":
    # With device_map="auto" the weights are already dispatched, so only move the model manually on CPU.
    model = model.to(device)

if device.type == "cuda":
    # Gradient checkpointing trades compute for memory on the GPU.
    model.gradient_checkpointing_enable()

processor = AutoProcessor.from_pretrained(model_id)
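# Note: this processor is only referenced by the commented-out chat-template alternative below;
# generation currently goes through the Unsloth tokenizer loaded inside generate_description.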

# Function to process the image and generate the description
@spaces.GPU(duration=120)
def generate_description(image: Image.Image, instruction: str = ""):

    # Switch the fine-tuned model into inference mode.
    FastVisionModel.for_inference(model)

    print("Loading tokenizer...")
    # Only the tokenizer returned here is used below; the base model itself is not
    # needed for generation.
    base_model, tokenizer = FastVisionModel.from_pretrained(
        "unsloth/Llama-3.2-11B-Vision-Instruct",
        # load_in_4bit = True,
        use_gradient_checkpointing = "unsloth",
    )
    
    image = image.convert("RGB")
    # image = Resize((224, 224))(image)

    # Create the message to pass to the model; fall back to a default radiology
    # prompt when no instruction is provided.
    if not instruction:
        instruction = "You are an expert radiographer. Describe accurately what you see in this image."
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": instruction}
        ]}
    ]

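    # Render the chat prompt, then tokenize the image and text together with the
    # Unsloth tokenizer so the tensors match what the vision model expects.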
    input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
    # input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = tokenizer(
        image,
        input_text,
        add_special_tokens=False,
        return_tensors="pt"
    ).to(device)

    # Generate the output from the model, streaming tokens to stdout as they are produced.
    text_streamer = TextStreamer(tokenizer, skip_prompt=True)
    outputs = model.generate(
        **inputs,
        streamer=text_streamer,
        max_new_tokens=256,
        use_cache=True,
        do_sample=True,  # sampling must be enabled for temperature/min_p to take effect
        temperature=1.5,
        min_p=0.1,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

# Define Gradio interface
interface = gr.Interface(
    fn=generate_description,
    inputs=[
        gr.Image(type="pil", label="Upload an Image"),
        gr.Textbox(label="Instruction (optional)", placeholder="You are an expert radiographer. Describe accurately what you see in this image."),
    ],
    outputs=gr.Textbox(label="Generated Description"),
    # live=True,
    title="Radiology Image Description Generator",
    description="Upload an image and provide an instruction to generate a description using a vision-language model."
)

# Launch the interface
interface.launch()
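
# A minimal sketch of running this Space locally, assuming the imported packages
# (gradio, torch, transformers, unsloth, spaces) are installed and the file is saved
# as app.py: run `python app.py` and open the printed local URL. For a temporary
# public link, call interface.launch(share=True) instead.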