tcm03 committed · Commit 5c18c06 · Parent(s): 9e0347a

Modify inference.py and Add YAML metadata for Hugging Face Hub

Files changed:
- README.md +11 -0
- inference.py +30 -10
README.md CHANGED
@@ -1,3 +1,14 @@
+---
+tags:
+- image-retrieval
+- text-sketch
+- clip
+- pytorch
+- inference
+library_name: pytorch
+inference: true
+---
+
 # Image Retrieval with Text and Sketch
 This code is for our 2022 ECCV paper [A Sketch Is Worth a Thousand Words: Image Retrieval with Text and Sketch](https://patsorn.me/projects/tsbir/)
 
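The added front matter is what the Hub parses for the model page's tags, library badge, and inference widget. As a quick sanity check that the YAML round-trips, a minimal sketch using `huggingface_hub`'s `ModelCard` (the repo id `tcm03/tsbir` comes from the code this commit removes below; adjust if the repo lives elsewhere):

```python
# Minimal sketch: load the model card and confirm the new YAML metadata parses.
# Assumes `pip install huggingface_hub` and that the repo id is tcm03/tsbir.
from huggingface_hub import ModelCard

card = ModelCard.load("tcm03/tsbir")
print(card.data.tags)          # expect ['image-retrieval', 'text-sketch', 'clip', 'pytorch', 'inference']
print(card.data.library_name)  # expect 'pytorch'
```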
inference.py CHANGED
@@ -2,21 +2,40 @@ import torch
 from PIL import Image
 import base64
 from io import BytesIO
-
-
+import json
 import sys
+
 sys.path.append("code")
 from clip.model import CLIP
+from clip.clip import _transform, tokenize
 
-# Load Model and Utilities
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-model = CLIP.from_pretrained("tcm03/tsbir").to(device)
-model.eval()
 
-
-
-
+MODEL_PATH = "model/tsbir_model_final.pt"
+CONFIG_PATH = "code/training/model_configs/ViT-B-16.json"
+
+def load_model():
+    """Load the model only once."""
+    global model
+    if "model" not in globals():
+        with open(CONFIG_PATH, 'r') as f:
+            model_info = json.load(f)
 
+        model = CLIP(**model_info)
+        checkpoint = torch.load(MODEL_PATH, map_location=device)
+        sd = checkpoint["state_dict"]
+        if next(iter(sd.items()))[0].startswith('module'):
+            sd = {k[len('module.'):]: v for k, v in sd.items()}
+
+        model.load_state_dict(sd, strict=False)
+        model = model.to(device).eval()
+
+        # Initialize transformer
+        global transformer
+        transformer = _transform(model.visual.input_resolution, is_train=False)
+        print("Model loaded successfully.")
+
+# Preprocessing Functions
 def preprocess_image(image_base64):
     """Convert base64 encoded image to tensor."""
     image = Image.open(BytesIO(base64.b64decode(image_base64))).convert("RGB")
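The `"model" not in globals()` guard above makes `load_model` idempotent, so the entry point can call it on every request and only pay the checkpoint-load cost once per process. A sketch of the same load-once idea via `functools.lru_cache` instead of module globals (not part of this commit; it reuses `MODEL_PATH`, `CONFIG_PATH`, `CLIP`, `json`, `torch`, and `device` from the file above):

```python
# Alternative load-once sketch (hypothetical, not in this commit):
# lru_cache(maxsize=1) memoizes the loaded model without global state.
from functools import lru_cache

@lru_cache(maxsize=1)
def get_model():
    with open(CONFIG_PATH, "r") as f:
        model = CLIP(**json.load(f))
    sd = torch.load(MODEL_PATH, map_location=device)["state_dict"]
    if next(iter(sd)).startswith("module."):
        sd = {k[len("module."):]: v for k, v in sd.items()}
    model.load_state_dict(sd, strict=False)
    return model.to(device).eval()
```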
@@ -49,16 +68,17 @@ def get_fused_embedding(image_base64, text):
 # Hugging Face Inference API Entry Point
 def infer(inputs):
     """
-    Inference API entry point.
+    Inference API entry point.
     Inputs:
     - 'image': Base64 encoded sketch image.
     - 'text': Text query.
     """
+    load_model()  # Ensure the model is loaded once
     image_base64 = inputs.get("image", "")
     text_query = inputs.get("text", "")
     if not image_base64 or not text_query:
         return {"error": "Both 'image' (base64) and 'text' are required inputs."}
-
+
     # Generate Fused Embedding
     fused_embedding = get_fused_embedding(image_base64, text_query)
     return {"fused_embedding": fused_embedding}
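With `load_model()` now invoked inside `infer`, the module can be exercised end to end without any setup beyond the checkpoint and config files. A minimal local smoke test (hypothetical `sketch.png` path and text query; assumes `inference.py` is importable from the repo root):

```python
# Minimal smoke test for the new entry point (file name and query are examples).
import base64

from inference import infer

with open("sketch.png", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

result = infer({"image": image_b64, "text": "a red car parked by the beach"})
print(sorted(result.keys()))  # expect ['fused_embedding'] on success
```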