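# Gradio app: generates text descriptions for uploaded images using
# microsoft/Phi-3-mini-4k-instruct with the jatingocodeo/phi-vlm PEFT adapter,
# offloading part of the model to CPU to fit limited GPU memory.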
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from PIL import Image
import torchvision.datasets as datasets
import numpy as np
import os
def load_model():
    # Create offload directory for weights that do not fit in memory
    os.makedirs("offload", exist_ok=True)

    # Configure a custom device map for memory efficiency: keep the embedding
    # layer, the first few transformer blocks, and the LM head on GPU 0, and
    # offload the remaining blocks to CPU. Phi-3-mini has 32 decoder layers and
    # accelerate requires every module to be assigned, so the map is built
    # programmatically. Keys here must match the *base* model's module names.
    device_map = {
        "model.embed_tokens": 0,
        "model.norm": "cpu",
        "lm_head": 0,
    }
    for i in range(32):
        device_map[f"model.layers.{i}"] = 0 if i < 4 else "cpu"

    base_model = AutoModelForCausalLM.from_pretrained(
        "microsoft/Phi-3-mini-4k-instruct",
        trust_remote_code=True,
        device_map=device_map,  # use the custom device map
        torch_dtype=torch.float32,
        attn_implementation="eager",
        offload_folder="offload",
    )

    # The PEFT wrapper prefixes every module with "base_model.model.", so the
    # same placement is re-expressed with that prefix for the adapter load.
    peft_device_map = {f"base_model.model.{k}": v for k, v in device_map.items()}
    model = PeftModel.from_pretrained(
        base_model,
        "jatingocodeo/phi-vlm",
        device_map=peft_device_map,
        offload_folder="offload",
    )

    tokenizer = AutoTokenizer.from_pretrained("jatingocodeo/phi-vlm")
    if tokenizer.pad_token is None:
        # Reuse EOS for padding if the tokenizer ships without a pad token
        tokenizer.pad_token = tokenizer.eos_token
    return model, tokenizer
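# Optional debugging helper (a sketch, not part of the original app): models
# dispatched with a device_map expose an `hf_device_map` attribute recording
# where accelerate placed each module, which is handy for verifying that the
# CPU offload configured above actually took effect.
def print_device_map(model):
    # Print module-to-device assignments chosen by accelerate
    for name, device in getattr(model, "hf_device_map", {}).items():
        print(f"{name} -> {device}")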
def generate_description(image, model, tokenizer):
    # Convert image to RGB if needed
    if image.mode != "RGB":
        image = image.convert("RGB")

    # Resize image to match the training size (32x32, CIFAR resolution)
    image = image.resize((32, 32))

    # Convert image to a CHW float tensor normalized to [0, 1]
    image_tensor = torch.FloatTensor(np.array(image)).permute(2, 0, 1) / 255.0

    # Embed the stringified tensor directly in the text prompt. This
    # presumably mirrors the (unusual) format used during fine-tuning:
    # pixels are passed as text rather than through a vision encoder.
    prompt = f"""Below is an image. Please describe it in detail.
Image: {image_tensor}
Description: """

    # Tokenize input
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=128,
    ).to(model.device)

    # Generate description
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=100,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
            pad_token_id=tokenizer.pad_token_id,  # silence the missing-pad warning
        )

    # Decode and return only the text after the "Description:" marker
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text.split("Description: ")[-1].strip()
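# Caveat on the settings above: the stringified 3x32x32 tensor can push the
# prompt past max_length=128, in which case truncation drops the trailing
# "Description: " cue and the final split() returns the whole decoded output.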
# Load model
print("Loading model...")
model, tokenizer = load_model()
# Get one example image per CIFAR10 class for the Gradio examples gallery
def get_cifar_examples():
    os.makedirs("examples", exist_ok=True)  # img.save below needs this directory
    cifar10_test = datasets.CIFAR10(root='./data', train=False, download=True)
    classes = ['airplane', 'automobile', 'bird', 'cat', 'deer',
               'dog', 'frog', 'horse', 'ship', 'truck']
    examples = []
    used_classes = set()
    for idx in range(len(cifar10_test)):
        img, label = cifar10_test[idx]
        if classes[label] not in used_classes:
            # Save the first image found for each class as a JPEG example
            img_path = f"examples/{classes[label]}_example.jpg"
            img.save(img_path)
            examples.append(img_path)
            used_classes.add(classes[label])
        if len(used_classes) == 10:
            break
    return examples
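# Note: datasets.CIFAR10 downloads the full CIFAR-10 archive (~160 MB) into
# ./data on first launch and reuses the cached copy afterwards.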
# Create Gradio interface
def process_image(image):
    return generate_description(image, model, tokenizer)
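# Optional smoke test (a minimal sketch assuming a local run; the
# VLM_SMOKE_TEST variable is hypothetical, not part of the app): exercises
# the image -> prompt -> generation path without starting the web UI.
if os.environ.get("VLM_SMOKE_TEST"):
    _test_img = Image.new("RGB", (32, 32), color=(128, 128, 128))
    print(process_image(_test_img))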
# Get examples
examples = get_cifar_examples()

# Define interface
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),
    outputs=gr.Textbox(label="Generated Description"),
    title="Image Description Generator",
    description="""Upload an image and get a detailed description generated by our fine-tuned VLM.
Below are sample images from the CIFAR10 dataset that you can try.""",
    examples=[[ex] for ex in examples]
)
# Launch the interface
if __name__ == "__main__":
    iface.launch()
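# launch() serves the app on a local port; outside Spaces, launch(share=True)
# can additionally create a temporary public URL for quick sharing.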