"""Gradio app exposing an OWL-ViT zero-shot object-detection endpoint."""

import os

import gradio as gr
import torch
from PIL import Image
from transformers import OwlViTForObjectDetection, OwlViTProcessor

MODEL_ID = "google/owlvit-base-patch32"

# Example text queries the model is prompted with on every request.
TEXT_QUERIES = [
    "A photo of a pokemon",
    "a photo of a human face",
    "a photo of a couch",
]

# Load the OWL-ViT processor and model once at start-up so that individual
# requests do not pay the loading cost.
processor = OwlViTProcessor.from_pretrained(MODEL_ID)
model = OwlViTForObjectDetection.from_pretrained(MODEL_ID)


def _to_pil(image):
    """Return *image* as an RGB ``PIL.Image.Image``.

    Accepts a PIL image, a filesystem path, an open binary file object, or an
    array-like object (the ``"image"`` input shortcut delivers a NumPy array
    by default, which ``Image.open`` cannot read).
    """
    if isinstance(image, Image.Image):
        return image.convert("RGB")
    if isinstance(image, (str, bytes, os.PathLike)) or hasattr(image, "read"):
        # ``convert`` forces the pixel data to load, so the underlying file
        # can be closed as soon as the ``with`` block exits.
        with Image.open(image) as opened:
            return opened.convert("RGB")
    return Image.fromarray(image).convert("RGB")


def predict(image):
    """Run OWL-ViT on *image* against ``TEXT_QUERIES``.

    Args:
        image: The value supplied by the Gradio image input. May be ``None``
            when the input has been cleared.

    Returns:
        A JSON-serialisable dict with a single ``"message"`` key.
    """
    if image is None:
        # A live interface re-invokes the callback when the input is cleared.
        return {"message": "No image provided."}

    pil_image = _to_pil(image)

    # Prepare inputs for the model.
    inputs = processor(
        text=list(TEXT_QUERIES), images=pil_image, return_tensors="pt"
    )

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = model(**inputs)  # noqa: F841

    # TODO(review): ``outputs`` is not surfaced yet; the response below is the
    # original placeholder. Post-process the detections and return them here.
    response = {"message": "Detection successful!"}
    return response


# Create the Gradio interface.
interface = gr.Interface(
    fn=predict,
    inputs="image",
    outputs="json",
    allow_flagging="never",
    # ``live`` makes the interface re-run ``predict`` automatically whenever
    # the input changes; it has no effect on whether the API stays available.
    live=True,
)

# Launch the app on all network interfaces and request a public share link.
interface.launch(server_name="0.0.0.0", server_port=7860, share=True)