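"""Custom handler for Hugging Face Inference Endpoints.

Performs zero-shot image classification: scores a base64-encoded image
against a caller-supplied list of candidate labels using an OpenCLIP
ViT-H/14 model trained on LAION-2B.
"""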
import base64
from io import BytesIO
from typing import Any, Dict, List

import torch
import open_clip
from PIL import Image


class EndpointHandler:
    def __init__(self, path=""):
        # create_model_and_transforms returns (model, train_transform,
        # eval_transform); the eval-time transform is the right one for
        # inference, since the train transform applies random augmentation.
        self.model, _, self.preprocess = open_clip.create_model_and_transforms(
            "hf-hub:laion/CLIP-ViT-H-14-laion2B-s32B-b79K"
        )
        self.tokenizer = open_clip.get_tokenizer(
            "hf-hub:laion/CLIP-ViT-H-14-laion2B-s32B-b79K"
        )
        self.model.eval()

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        image_base64 = data.get("inputs", None)
        parameters = data.get("parameters", None)
        if image_base64 is None or parameters is None:
            raise ValueError("Input data or parameters not provided")

        candidate_labels = parameters.get("candidate_labels", None)
        if candidate_labels is None:
            raise ValueError("Candidate labels not provided")

        # Decode the base64 payload, force RGB (the payload may decode to
        # RGBA or grayscale), and apply the model's preprocessing transform.
        image = Image.open(BytesIO(base64.b64decode(image_base64))).convert("RGB")
        image = self.preprocess(image).unsqueeze(0)
        text = self.tokenizer(candidate_labels)

        with torch.no_grad():
            # Encode both modalities and L2-normalize the embeddings so
            # the dot product below is a cosine similarity.
            image_features = self.model.encode_image(image)
            text_features = self.model.encode_text(text)
            image_features /= image_features.norm(dim=-1, keepdim=True)
            text_features /= text_features.norm(dim=-1, keepdim=True)

            # Scale by 100 (CLIP's logit-scale convention) and softmax over
            # the candidate labels to get per-label probabilities.
            text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
        # Pair each label with its probability; scores sum to 1 across labels.
        results = [
            {"label": label, "score": score.item()}
            for label, score in zip(candidate_labels, text_probs[0])
        ]
        return results
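# A minimal local smoke test, not part of the handler contract. The file
# name "example.jpg" and the labels below are illustrative assumptions;
# on a deployed endpoint, the JSON payload arrives through __call__.
if __name__ == "__main__":
    with open("example.jpg", "rb") as f:
        payload = {
            "inputs": base64.b64encode(f.read()).decode("utf-8"),
            "parameters": {
                "candidate_labels": ["a photo of a cat", "a photo of a dog"]
            },
        }
    handler = EndpointHandler()
    for result in handler(payload):
        print(f"{result['label']}: {result['score']:.4f}")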