from typing import Dict, List, Any
from io import BytesIO
import base64

from PIL import Image
from transformers import CLIPProcessor, CLIPModel


# Custom inference endpoint handler that computes CLIP image embeddings using
# the pretrained OpenAI CLIP model.
class EndpointHandler():
    def __init__(self, path=""):
        # Preload the model and processor once so every inference call can reuse them.
        self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    def __call__(self, data: Dict[str, Any]) -> Dict[str, List[float]]:
        # Expected request payload:
        # {
        #     "inputs": {
        #         "image": "<base64-encoded image, e.g. a data URL: data:image/jpeg;base64,...>",
        #         "text": ["a photo of a cat", "a photo of a dog"]
        #     }
        # }
        inputs = data.get("inputs")
        image_base64 = inputs.get("image")
        text = inputs.get("text")

        # Strip the "data:image/...;base64," prefix, decode the payload, and load it as a PIL image.
        image = Image.open(BytesIO(base64.b64decode(image_base64.split(",")[1])))

        # Run the image (with its accompanying text prompts) through CLIP and
        # return the flattened image embedding.
        inputs = self.processor(text=text, images=image, return_tensors="pt", padding=True)
        outputs = self.model(**inputs)
        embeddings = outputs.image_embeds.detach().numpy().flatten().tolist()
        return {"embeddings": embeddings}
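

# Minimal local smoke test (illustrative sketch only, not part of the endpoint
# contract). It assumes an image file named "example.jpg" sits next to this
# script and builds the same data-URL-style base64 payload the handler expects.
if __name__ == "__main__":
    handler = EndpointHandler()
    with open("example.jpg", "rb") as f:
        encoded = base64.b64encode(f.read()).decode("utf-8")
    payload = {
        "inputs": {
            # The handler splits on the first comma, so a data-URL prefix is required.
            "image": "data:image/jpeg;base64," + encoded,
            "text": ["a photo of a cat", "a photo of a dog"],
        }
    }
    result = handler(payload)
    # clip-vit-base-patch32 projects images into a 512-dimensional embedding space.
    print(len(result["embeddings"]))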