import os

import gradio as gr
import numpy as np
import torch
import torchvision.datasets as datasets
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer


def build_device_map(prefix=""):
    """Configure a device map for memory-efficient loading.

    Phi-3-mini-4k-instruct has 32 decoder layers; the first few are kept on
    GPU 0 and the rest are offloaded to CPU. The optional prefix lets the
    same layout be reused for the PEFT-wrapped model, whose module names
    start with "base_model.model.".
    """
    gpu = 0 if torch.cuda.is_available() else "cpu"
    device_map = {
        f"{prefix}model.embed_tokens": gpu,
        f"{prefix}model.norm": "cpu",
        f"{prefix}lm_head": gpu,
    }
    for i in range(32):
        device_map[f"{prefix}model.layers.{i}"] = gpu if i < 4 else "cpu"
    return device_map


def load_model():
    # Create offload directory
    os.makedirs("offload", exist_ok=True)

    base_model = AutoModelForCausalLM.from_pretrained(
        "microsoft/Phi-3-mini-4k-instruct",
        trust_remote_code=True,
        device_map=build_device_map(),  # custom GPU/CPU split
        torch_dtype=torch.float32,
        attn_implementation="eager",
        offload_folder="offload",
    )

    model = PeftModel.from_pretrained(
        base_model,
        "jatingocodeo/phi-vlm",
        device_map=build_device_map("base_model.model."),  # same split, PEFT module names
        offload_folder="offload",
    )

    tokenizer = AutoTokenizer.from_pretrained("jatingocodeo/phi-vlm")
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    return model, tokenizer


def generate_description(image, model, tokenizer):
    # Convert image to RGB if needed
    if image.mode != "RGB":
        image = image.convert("RGB")

    # Resize image to match the training size (32x32)
    image = image.resize((32, 32))

    # Convert image to a CHW float tensor normalized to [0, 1]
    image_tensor = torch.FloatTensor(np.array(image)).permute(2, 0, 1) / 255.0

    # Prepare the prompt; the pixel tensor is rendered into the text,
    # matching the format used during fine-tuning
    prompt = f"""Below is an image. Please describe it in detail.
Image: {image_tensor}
Description: """

    # Tokenize input
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=128,
    ).to(model.device)

    # Generate description
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=100,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
        )

    # Decode and return only the text after the "Description:" marker
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text.split("Description: ")[-1].strip()


# Load model
print("Loading model...")
model, tokenizer = load_model()


# Save one example image per CIFAR10 class for the Gradio examples gallery
def get_cifar_examples():
    os.makedirs("examples", exist_ok=True)
    cifar10_test = datasets.CIFAR10(root="./data", train=False, download=True)
    classes = ["airplane", "automobile", "bird", "cat", "deer",
               "dog", "frog", "horse", "ship", "truck"]

    examples = []
    used_classes = set()
    for idx in range(len(cifar10_test)):
        img, label = cifar10_test[idx]
        if classes[label] not in used_classes:
            img_path = f"examples/{classes[label]}_example.jpg"
            img.save(img_path)
            examples.append(img_path)
            used_classes.add(classes[label])
        if len(used_classes) == 10:
            break
    return examples


# Create Gradio interface
def process_image(image):
    return generate_description(image, model, tokenizer)


# Get examples
examples = get_cifar_examples()

# Define interface
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),
    outputs=gr.Textbox(label="Generated Description"),
    title="Image Description Generator",
    description=("Upload an image and get a detailed description generated by our fine-tuned VLM model. "
                 "Below are sample images from the CIFAR10 dataset that you can try."),
    examples=[[ex] for ex in examples],
)

# Launch the interface
if __name__ == "__main__":
    iface.launch()