Update README.md
README.md CHANGED
@@ -16,17 +16,12 @@ wget -O buddy.jpeg https://raw.githubusercontent.com/neuralmagic/deepsparse/main
 wget -O thailand.jpg https://raw.githubusercontent.com/neuralmagic/deepsparse/main/src/deepsparse/yolact/sample_images/thailand.jpg
 ```
 
-
+For this model there is a second input that is the length of tokens, so run this input override before making the pipeline:
 ```python
 import numpy as np
-from deepsparse import
-from deepsparse.clip import (
-    CLIPTextInput,
-    CLIPVisualInput,
-    CLIPZeroShotInput
-)
+from deepsparse.clip import CLIPTextPipeline
 
-def new_process_inputs(self, inputs: CLIPTextInput):
+def custom_process_inputs(self, inputs):
     if not isinstance(inputs.text, list):
         inputs.text = [inputs.text]
     if not isinstance(inputs.text[0], str):
@@ -36,21 +31,38 @@ def new_process_inputs(self, inputs: CLIPTextInput):
     tokens_lengths = np.array(tokens.shape[0] * [tokens.shape[1] - 1])
     return [tokens, tokens_lengths]
 
-# This overrides the process_inputs function globally for all CLIPTextPipeline classes
-
-
+# This overrides the process_inputs function globally for all CLIPTextPipeline classes
+CLIPTextPipeline.process_inputs = custom_process_inputs
+```
+
+Then make and run a pipeline in Python:
+```python
+from deepsparse import Pipeline
+from deepsparse.clip import (
+    CLIPTextInput,
+    CLIPVisualInput,
+    CLIPZeroShotInput
+)
+from huggingface_hub import snapshot_download
+
+# Download the model from HF
+model_folder = snapshot_download(repo_id="mgoin/CLIP-ViT-B-32-256x256-DataComp-s34B-b86K-quant-ds")
 
 possible_classes = ["ice cream", "an elephant", "a dog", "a building", "a church"]
 images = ["basilica.jpg", "buddy.jpeg", "thailand.jpg"]
 
-
+# Load the model into DeepSparse
+pipeline = Pipeline.create(
+    task="clip_zeroshot",
+    visual_model_path=model_folder + "/visual.onnx",
+    text_model_path=model_folder + "/textual.onnx"
+)
 
-
+output = pipeline(
     image=CLIPVisualInput(images=images),
     text=CLIPTextInput(text=possible_classes),
-)
+).text_scores
 
-output = pipeline(pipeline_input).text_scores
 for i in range(len(output)):
     prediction = possible_classes[np.argmax(output[i])]
     print(f"Image {images[i]} is a picture of {prediction}")
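As a side note (not part of the README change itself): the extra input that `custom_process_inputs` adds is just one length value per prompt, derived from the shape of the tokenized batch. The standalone sketch below reuses the same expression from the diff with a made-up batch of 3 prompts padded to 77 token ids, to show what the override returns as its second output:

```python
import numpy as np

# Hypothetical token batch: 3 prompts, each padded to 77 token ids
tokens = np.zeros((3, 77), dtype=np.int64)

# Same expression as in custom_process_inputs above:
# one entry per prompt, each equal to tokens.shape[1] - 1
tokens_lengths = np.array(tokens.shape[0] * [tokens.shape[1] - 1])

print(tokens_lengths.shape)  # (3,)
print(tokens_lengths)        # [76 76 76]
```

Because `CLIPTextPipeline.process_inputs = custom_process_inputs` patches the class itself, the override applies to every `CLIPTextPipeline` instance, in line with the README's instruction to run it before making the pipeline.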