"""Gradio demo: describe an uploaded image with the nanoLLaVA-1.5 model."""

import warnings

import gradio as gr
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer

# Suppress warnings
warnings.filterwarnings('ignore')

# Ensure CUDA device is used (tensors created below default to the GPU)
torch.set_default_device('cuda')

MODEL_NAME = 'qnguyen3/nanoLLaVA-1.5'
DEFAULT_PROMPT = "Describe this image in detail"

# Textual marker placed in the chat message where the image belongs, and the
# sentinel token id spliced into `input_ids` at that position.
# NOTE(review): both values follow the model's published usage example;
# confirm against the model card if the checkpoint is changed.
IMAGE_PLACEHOLDER = '<image>'
IMAGE_TOKEN_INDEX = -200

# Load the model and tokenizer
try:
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16,
        device_map='auto',
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_NAME,
        trust_remote_code=True,
    )
except ImportError:
    print("Error: Missing required dependencies. Make sure flash_attn is installed.")
    # Bare `raise` re-raises the active exception with its original traceback.
    raise


def describe_image(image: Image.Image, prompt: str = DEFAULT_PROMPT) -> str:
    """Generate a text description of ``image`` guided by ``prompt``.

    Args:
        image: The uploaded picture as a PIL image (Gradio passes ``None``
            when nothing was uploaded).
        prompt: Instruction given to the model alongside the image.

    Returns:
        The decoded model response with special tokens removed and
        surrounding whitespace stripped.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        raise gr.Error("Please upload an image first.")

    messages = [{"role": "user", "content": f'{IMAGE_PLACEHOLDER}\n{prompt}'}]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Tokenize the text on either side of the first placeholder and join the
    # two halves with the image sentinel token. `partition` splits only on the
    # placeholder we inserted, so a literal "<image>" typed by the user in the
    # prompt stays part of the trailing text instead of being dropped.
    before, _, after = text.partition(IMAGE_PLACEHOLDER)
    token_ids = (
        tokenizer(before).input_ids
        + [IMAGE_TOKEN_INDEX]
        + tokenizer(after).input_ids
    )
    input_ids = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0)

    # Process the image (helper comes from the model's remote code) and cast
    # it to the model's dtype.
    image_tensor = model.process_images([image], model.config).to(dtype=model.dtype)

    # Generate a response; take the first (only) sequence of the batch.
    output_ids = model.generate(
        input_ids,
        images=image_tensor,
        max_new_tokens=2048,
        use_cache=True,
    )[0]

    # Decode only the tokens that follow the prompt.
    generated_ids = output_ids[input_ids.shape[1]:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()


# Set up the Gradio interface. Components are taken from the top-level
# namespace (`gr.Image`, `gr.Textbox`) because the legacy `gr.inputs.*`
# namespace is no longer available in current Gradio releases.
demo = gr.Interface(
    fn=describe_image,
    inputs=[
        gr.Image(type="pil"),
        gr.Textbox(value=DEFAULT_PROMPT),
    ],
    outputs="text",
    title="Image Description Model",
    description="Upload an image and receive a detailed description.",
)
demo.launch()