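"""Custom handler for Hugging Face Inference Endpoints.

Performs zero-shot image classification: scores a base64-encoded image
against a caller-supplied list of candidate labels using an OpenCLIP
ViT-H/14 model trained on LAION-2B.
"""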
import base64
from io import BytesIO
from typing import Any, Dict, List

import torch
import open_clip
from PIL import Image


class EndpointHandler:
    def __init__(self, path=""):
        # create_model_and_transforms returns (model, train_transform,
        # eval_transform); the eval-time transform is the right one for
        # inference, since the train transform applies random augmentation.
        self.model, _, self.preprocess = open_clip.create_model_and_transforms(
            "hf-hub:laion/CLIP-ViT-H-14-laion2B-s32B-b79K"
        )
        self.tokenizer = open_clip.get_tokenizer(
            "hf-hub:laion/CLIP-ViT-H-14-laion2B-s32B-b79K"
        )
        self.model.eval()

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        image_base64 = data.get("inputs", None)
        parameters = data.get("parameters", None)
        if image_base64 is None or parameters is None:
            raise ValueError("Input data or parameters not provided")

        candidate_labels = parameters.get("candidate_labels", None)
        if candidate_labels is None:
            raise ValueError("Candidate labels not provided")

        # Decode the base64 payload, force RGB (the payload may decode to
        # RGBA or grayscale), and apply the model's preprocessing transform.
        image = Image.open(BytesIO(base64.b64decode(image_base64))).convert("RGB")
        image = self.preprocess(image).unsqueeze(0)
        text = self.tokenizer(candidate_labels)

        with torch.no_grad():
            # Encode both modalities and L2-normalize the embeddings so
            # the dot product below is a cosine similarity.
            image_features = self.model.encode_image(image)
            text_features = self.model.encode_text(text)
            image_features /= image_features.norm(dim=-1, keepdim=True)
            text_features /= text_features.norm(dim=-1, keepdim=True)

            # Scale by 100 (CLIP's logit-scale convention) and softmax over
            # the candidate labels to get per-label probabilities.
            text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
        # Pair each label with its probability; scores sum to 1 across labels.
        results = [
            {"label": label, "score": score.item()}
            for label, score in zip(candidate_labels, text_probs[0])
        ]
        return results
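# A minimal local smoke test, not part of the handler contract. The file
# name "example.jpg" and the labels below are illustrative assumptions;
# on a deployed endpoint, the JSON payload arrives through __call__.
if __name__ == "__main__":
    with open("example.jpg", "rb") as f:
        payload = {
            "inputs": base64.b64encode(f.read()).decode("utf-8"),
            "parameters": {
                "candidate_labels": ["a photo of a cat", "a photo of a dog"]
            },
        }
    handler = EndpointHandler()
    for result in handler(payload):
        print(f"{result['label']}: {result['score']:.4f}")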