import torch
from transformers import AutoProcessor, AutoModelForVision2Seq
from PIL import Image
import gradio as gr

# Define the folder where the model and processor are saved
saved_folder_path = "model_folder"  # Replace with the path to your model folder

# Load processor and model
processor = AutoProcessor.from_pretrained(saved_folder_path)  # Processor (image processor + tokenizer)
model = AutoModelForVision2Seq.from_pretrained(saved_folder_path)  # Pre-trained BLIP model
model.eval()  # Set model to evaluation mode

# Define the caption generation function
def generate_caption(image):
    if image is None:  # Gradio passes None when the input is cleared
        return ""

    # Convert the input NumPy array to a PIL image (gr.Image defaults to type="numpy")
    image = Image.fromarray(image)

    # Preprocess the image using the processor
    inputs = processor(images=image, return_tensors="pt")
    pixel_values = inputs.pixel_values

    # Generate a caption; no_grad avoids tracking gradients during inference
    with torch.no_grad():
        generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
    generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    return generated_caption

# Define the Gradio interface
interface = gr.Interface(
    fn=generate_caption,   # Function that maps the input image to a caption
    inputs=gr.Image(),     # Image input component (NumPy array by default)
    outputs=gr.Textbox(),  # Text output component for the caption
    live=True,             # Re-run prediction whenever the input changes
)

# Launch the Gradio app
interface.launch()
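
# ------------------------------------------------------------------
# One-time setup sketch (run separately, before launching the app):
# one way "model_folder" could be produced, assuming the public BLIP
# base captioning checkpoint -- "Salesforce/blip-image-captioning-base"
# is an assumption; substitute whichever vision-to-text checkpoint you
# actually downloaded or fine-tuned.
# ------------------------------------------------------------------
# from transformers import AutoProcessor, AutoModelForVision2Seq
#
# checkpoint = "Salesforce/blip-image-captioning-base"  # assumed checkpoint ID
# AutoProcessor.from_pretrained(checkpoint).save_pretrained("model_folder")
# AutoModelForVision2Seq.from_pretrained(checkpoint).save_pretrained("model_folder")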