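"""Gradio demo for the jatingocodeo/phi-vlm LoRA adapter on Phi-3-mini.

Loads the base model split across GPU and CPU, attaches the adapter,
and serves an image-description interface seeded with one example
image per CIFAR10 class.
"""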
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torchvision.datasets as datasets
import numpy as np
import os

def load_model():
    # Create offload directory for weights that do not fit in memory
    os.makedirs("offload", exist_ok=True)

    # Split the model between GPU 0 and CPU for memory efficiency.
    # Module names must match the *base* model (no PEFT prefix), and
    # every module needs a device or accelerate raises an error.
    # Phi-3-mini has 32 decoder layers: keep the first 4 on the GPU
    # and offload the rest to CPU, matching the original 0-3 / 4+ split.
    device_map = {
        "model.embed_tokens": 0,
        "model.norm": "cpu",
        "lm_head": 0,
    }
    for i in range(32):
        device_map[f"model.layers.{i}"] = 0 if i < 4 else "cpu"

    base_model = AutoModelForCausalLM.from_pretrained(
        "microsoft/Phi-3-mini-4k-instruct",
        trust_remote_code=True,
        device_map=device_map,  # Use custom device map
        torch_dtype=torch.float32,
        attn_implementation="eager",
        offload_folder="offload"
    )

    # Attach the LoRA adapter; PEFT places the adapter weights alongside
    # the modules they wrap, so no separate device map is needed here.
    model = PeftModel.from_pretrained(
        base_model,
        "jatingocodeo/phi-vlm",
        offload_folder="offload"
    )

    tokenizer = AutoTokenizer.from_pretrained("jatingocodeo/phi-vlm")
    # Ensure a pad token exists so tokenizer(..., padding=True) works
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

def generate_description(image, model, tokenizer):
    # Convert image to RGB if needed
    if image.mode != "RGB":
        image = image.convert("RGB")

    # Resize image to match the CIFAR10 training size (32x32)
    image = image.resize((32, 32))

    # Convert image to a CHW float tensor normalized to [0, 1]
    image_tensor = torch.FloatTensor(np.array(image)).permute(2, 0, 1) / 255.0

    # NOTE: the tensor is stringified directly into the prompt text,
    # presumably matching the fine-tuning format; with max_length=128
    # below, most of the tensor repr is truncated away.
    prompt = f"""Below is an image. Please describe it in detail.

Image: {image_tensor}
Description: """

    # Tokenize input and move it to the device of the first model shard
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=128
    ).to(model.device)

    # Generate description with sampling
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=100,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
            pad_token_id=tokenizer.pad_token_id
        )

    # Decode and return only the text after the "Description:" marker
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text.split("Description: ")[-1].strip()

# Load model
print("Loading model...")
model, tokenizer = load_model()

# Get CIFAR10 examples
def get_cifar_examples():
    cifar10_test = datasets.CIFAR10(root='./data', train=False, download=True)
    classes = ['airplane', 'automobile', 'bird', 'cat', 'deer',
               'dog', 'frog', 'horse', 'ship', 'truck']

    # Ensure the output directory exists before saving example images
    os.makedirs("examples", exist_ok=True)

    examples = []
    used_classes = set()

    # Save one example image per class, stopping once all 10 are covered
    for idx in range(len(cifar10_test)):
        img, label = cifar10_test[idx]
        if classes[label] not in used_classes:
            img_path = f"examples/{classes[label]}_example.jpg"
            img.save(img_path)
            examples.append(img_path)
            used_classes.add(classes[label])

        if len(used_classes) == 10:
            break

    return examples

# Create Gradio interface
def process_image(image):
    return generate_description(image, model, tokenizer)

# Get examples
examples = get_cifar_examples()

# Define interface
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),
    outputs=gr.Textbox(label="Generated Description"),
    title="Image Description Generator",
    description=(
        "Upload an image and get a detailed description generated by our "
        "fine-tuned VLM model. Below are sample images from the CIFAR10 "
        "dataset that you can try."
    ),
    examples=[[ex] for ex in examples]
)

# Launch the interface
if __name__ == "__main__":
    iface.launch()
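
# Hypothetical local smoke test (file names depend on get_cifar_examples output):
#   from PIL import Image
#   print(process_image(Image.open("examples/cat_example.jpg")))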