This is a quantized version of https://huggingface.co/laion/CLIP-ViT-B-32-256x256-DataComp-s34B-b86K that is ready to use with [DeepSparse](https://github.com/neuralmagic/deepsparse).

It achieves 71.1% zero-shot accuracy on ImageNet.

## Usage

First, install DeepSparse with extensions for CLIP (the version specifier is quoted so the shell does not interpret the brackets or `>`):

```
pip install "deepsparse-nightly[clip]>=1.7.0.20231210"
```
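
Optionally, you can sanity-check the install; the one-liner below simply imports the `deepsparse.clip` module that the example further down relies on:

```
python -c "import deepsparse.clip; print('CLIP extras available')"
```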

Download some test images of a church, a dog, and elephants:

```
wget -O basilica.jpg https://raw.githubusercontent.com/neuralmagic/deepsparse/main/src/deepsparse/yolo/sample_images/basilica.jpg
wget -O buddy.jpeg https://raw.githubusercontent.com/neuralmagic/deepsparse/main/tests/deepsparse/pipelines/sample_images/buddy.jpeg
wget -O thailand.jpg https://raw.githubusercontent.com/neuralmagic/deepsparse/main/src/deepsparse/yolact/sample_images/thailand.jpg
```
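
The pipeline below expects this repository's quantized ONNX files, `visual.onnx` and `textual.onnx`, to be available locally. If you haven't downloaded them yet, one option is `huggingface_hub` — a minimal sketch, assuming the files sit at the root of this repository (the repo id below is a placeholder; substitute this model's id):

```python
from huggingface_hub import hf_hub_download

# Placeholder: replace with this model repository's id on the Hugging Face Hub.
repo_id = "<this-repo-id>"

visual_path = hf_hub_download(repo_id=repo_id, filename="visual.onnx")
text_path = hf_hub_download(repo_id=repo_id, filename="textual.onnx")
```

Pass the returned paths as `visual_model_path` and `text_model_path`, or copy the files into your working directory so the example below runs as written.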

Then create and run a zero-shot classification pipeline in Python:

```python
import numpy as np

from deepsparse import Pipeline
from deepsparse.clip import (
    CLIPTextInput,
    CLIPTextPipeline,
    CLIPVisualInput,
    CLIPZeroShotInput,
)


def new_process_inputs(self, inputs: CLIPTextInput):
    if not isinstance(inputs.text, list):
        inputs.text = [inputs.text]
    if not isinstance(inputs.text[0], str):
        return inputs.text
    tokens = [np.array(t).astype(np.int32) for t in self.tokenizer(inputs.text)]
    tokens = np.stack(tokens, axis=0)
    tokens_lengths = np.array(tokens.shape[0] * [tokens.shape[1] - 1])
    return [tokens, tokens_lengths]


# This overrides process_inputs globally on CLIPTextPipeline, so the zero-shot
# pipeline created below (which uses that class internally) picks up the edit.
CLIPTextPipeline.process_inputs = new_process_inputs

possible_classes = ["ice cream", "an elephant", "a dog", "a building", "a church"]
images = ["basilica.jpg", "buddy.jpeg", "thailand.jpg"]

pipeline = Pipeline.create(task="clip_zeroshot", visual_model_path="visual.onnx", text_model_path="textual.onnx")

pipeline_input = CLIPZeroShotInput(
    image=CLIPVisualInput(images=images),
    text=CLIPTextInput(text=possible_classes),
)

# text_scores contains one row of class scores per input image.
output = pipeline(pipeline_input).text_scores
for i in range(len(output)):
    prediction = possible_classes[np.argmax(output[i])]
    print(f"Image {images[i]} is a picture of {prediction}")

"""
Image basilica.jpg is a picture of a church
Image buddy.jpeg is a picture of a dog
Image thailand.jpg is a picture of an elephant
"""
```
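
The pipeline object can be reused for further inputs. For example, to classify one more image against the same labels (the file name here is hypothetical; use any local image), continuing the script above:

```python
# Hypothetical extra image path, reusing the pipeline and labels defined above.
extra_input = CLIPZeroShotInput(
    image=CLIPVisualInput(images=["my_photo.jpg"]),
    text=CLIPTextInput(text=possible_classes),
)
scores = pipeline(extra_input).text_scores
print(possible_classes[np.argmax(scores[0])])
```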