from typing import Dict, List, Any
from io import BytesIO
import base64

from PIL import Image
from transformers import CLIPProcessor, CLIPModel


# Custom inference endpoint handler that computes CLIP image embeddings using
# the pretrained OpenAI CLIP model.
class EndpointHandler():
    def __init__(self, path=""):
        # Preload the model and processor once so every inference call can reuse them.
        self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    def __call__(self, data: Dict[str, Any]) -> Dict[str, List[float]]:
        # Expected request payload:
        # {
        #     "inputs": {
        #         "image": "<base64-encoded image, e.g. a data URL: data:image/jpeg;base64,...>",
        #         "text": ["a photo of a cat", "a photo of a dog"]
        #     }
        # }
        inputs = data.get("inputs")
        image_base64 = inputs.get("image")
        text = inputs.get("text")

        # Strip the "data:image/...;base64," prefix, decode the payload, and load it as a PIL image.
        image = Image.open(BytesIO(base64.b64decode(image_base64.split(",")[1])))

        # Run the image (with its accompanying text prompts) through CLIP and
        # return the flattened image embedding.
        inputs = self.processor(text=text, images=image, return_tensors="pt", padding=True)
        outputs = self.model(**inputs)
        embeddings = outputs.image_embeds.detach().numpy().flatten().tolist()
        return {"embeddings": embeddings}
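

# Minimal local smoke test (illustrative sketch only, not part of the endpoint
# contract). It assumes an image file named "example.jpg" sits next to this
# script and builds the same data-URL-style base64 payload the handler expects.
if __name__ == "__main__":
    handler = EndpointHandler()
    with open("example.jpg", "rb") as f:
        encoded = base64.b64encode(f.read()).decode("utf-8")
    payload = {
        "inputs": {
            # The handler splits on the first comma, so a data-URL prefix is required.
            "image": "data:image/jpeg;base64," + encoded,
            "text": ["a photo of a cat", "a photo of a dog"],
        }
    }
    result = handler(payload)
    # clip-vit-base-patch32 projects images into a 512-dimensional embedding space.
    print(len(result["embeddings"]))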