Update README.md #4
opened by bwang0911

README.md CHANGED
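This change swaps the English-only example sentences for a multilingual corpus, replaces the two sample image URLs with a single chef image, adds cross-modal and cross-lingual similarity printouts against an English query, and removes the transformers.js example.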
````diff
@@ -168,18 +168,22 @@ from transformers import AutoModel
 # Initialize the model
 model = AutoModel.from_pretrained("jinaai/jina-clip-v2", trust_remote_code=True)
 
-#
+# Corpus
 sentences = [
-    "
-    "
+    "طاهٍ يطبخ المعكرونة في المطبخ",  # Arabic
+    "厨师在厨房煮意大利面",  # Chinese
+    "Un chef qui cuisine des pâtes dans la cuisine",  # French
+    "Ein Koch, der in der Küche Pasta kocht",  # German
+    "Ένας σεφ μαγειρεύει ζυμαρικά στην κουζίνα",  # Greek
+    "एक शेफ रसोई में पास्ता पका रहा है",  # Hindi
+    "Uno chef che cucina la pasta in cucina",  # Italian
+    "シェフがキッチンでパスタを作っている",  # Japanese
+    "셰프가 주방에서 파스타를 요리하고 있다",  # Korean
 ]
 
-# Public image URLs
-image_urls = [
-    "https://i.pinimg.com/600x315/21/48/7e/21487e8e0970dd366dafaed6ab25d8d8.jpg",
-    "https://i.pinimg.com/736x/c9/f2/3e/c9f23e212529f13f19bad5602d84b78b.jpg",
-]
 
+# Public image URLs or PIL images
+image_urls = ["https://i.ibb.co/bRGGJxD/DALL-E-2024-11-20-13-44-46-A-highly-realistic-8-K-photographic-image-of-a-chef-cooking-pasta-in-a-mo.webp"]
 # Choose a matryoshka dimension, set to None to get the full 1024-dim vectors
 truncate_dim = 512
 
````
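The `truncate_dim` argument in the hunk above works because the README describes the vectors as matryoshka embeddings: the leading components carry most of the signal, so a shorter vector comes from slicing off the tail and re-normalizing. A minimal sketch of that operation, assuming the model returns float vectors; `truncate_and_renormalize` is an illustrative helper, not part of the model's API:

```python
import numpy as np

def truncate_and_renormalize(emb: np.ndarray, dim: int) -> np.ndarray:
    """Keep the first `dim` components, then rescale to unit length
    so dot products stay valid cosine similarities."""
    truncated = emb[..., :dim]
    return truncated / np.linalg.norm(truncated, axis=-1, keepdims=True)

full = np.random.randn(1024).astype(np.float32)  # stand-in for a full 1024-dim embedding
small = truncate_and_renormalize(full, 512)      # comparable 512-dim vector
print(small.shape)  # (512,)
```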
````diff
@@ -190,16 +194,23 @@ image_embeddings = model.encode_image(
 )  # also accepts PIL.Image, local filenames, dataURI
 
 # Encode query text
-query = "
-
+query = "A chef cooking pasta in the kitchen"  # English
+query_embeddings = model.encode_text(
     query, task="retrieval.query", truncate_dim=truncate_dim
 )
 
-#
-print(
-
-print(
-print(
+# Text-to-image similarity
+print("En -> Img: " + str(query_embeddings @ image_embeddings[0].T))
+# Text-to-text (cross-lingual) similarity
+print("En -> Ar: " + str(query_embeddings @ text_embeddings[0].T))
+print("En -> Zh: " + str(query_embeddings @ text_embeddings[1].T))
+print("En -> Fr: " + str(query_embeddings @ text_embeddings[2].T))
+print("En -> De: " + str(query_embeddings @ text_embeddings[3].T))
+print("En -> El: " + str(query_embeddings @ text_embeddings[4].T))
+print("En -> Hi: " + str(query_embeddings @ text_embeddings[5].T))
+print("En -> It: " + str(query_embeddings @ text_embeddings[6].T))
+print("En -> Ja: " + str(query_embeddings @ text_embeddings[7].T))
+print("En -> Ko: " + str(query_embeddings @ text_embeddings[8].T))
 ```
 
 or via sentence-transformers:
````
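The `@` products in the hunk above are plain dot products; they read as cosine similarities only if `encode_text` and `encode_image` return unit-length vectors, which is the usual CLIP convention but is an assumption here. An explicit cosine (a sketch; `cosine` is a local helper, not a model method) computes the same score regardless of normalization:

```python
import numpy as np

def cosine(a: np.ndarray, b: np.ndarray) -> float:
    """Cosine similarity without assuming unit-length inputs."""
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# e.g. cosine(query_embeddings, image_embeddings[0])
# in place of query_embeddings @ image_embeddings[0].T
```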
````diff
@@ -214,64 +225,26 @@ model = SentenceTransformer(
     "jinaai/jina-clip-v2", trust_remote_code=True, truncate_dim=truncate_dim
 )
 
-#
+# Corpus
 sentences = [
-    "
-    "
+    "طاهٍ يطبخ المعكرونة في المطبخ",  # Arabic
+    "厨师在厨房煮意大利面",  # Chinese
+    "Un chef qui cuisine des pâtes dans la cuisine",  # French
+    "Ein Koch, der in der Küche Pasta kocht",  # German
+    "Ένας σεφ μαγειρεύει ζυμαρικά στην κουζίνα",  # Greek
+    "एक शेफ रसोई में पास्ता पका रहा है",  # Hindi
+    "Uno chef che cucina la pasta in cucina",  # Italian
+    "シェフがキッチンでパスタを作っている",  # Japanese
+    "셰프가 주방에서 파스타를 요리하고 있다",  # Korean
 ]
 
-# Public image URLs
-image_urls = [
-    "https://i.pinimg.com/600x315/21/48/7e/21487e8e0970dd366dafaed6ab25d8d8.jpg",
-    "https://i.pinimg.com/736x/c9/f2/3e/c9f23e212529f13f19bad5602d84b78b.jpg",
-]
+# Public image URLs or PIL images
+image_urls = ["https://i.ibb.co/bRGGJxD/DALL-E-2024-11-20-13-44-46-A-highly-realistic-8-K-photographic-image-of-a-chef-cooking-pasta-in-a-mo.webp"]
 
 text_embeddings = model.encode(sentences)
 image_embeddings = model.encode(image_urls)
-query = "
-
-```
-
-JavaScript developers can use Jina CLIP via the [transformers.js](https://huggingface.co/docs/transformers.js) library. Note that to use this model, you need to install transformers.js [v3](https://github.com/xenova/transformers.js/tree/v3) from source using `npm install xenova/transformers.js#v3`.
-
-```js
-import { AutoTokenizer, CLIPTextModelWithProjection, AutoProcessor, CLIPVisionModelWithProjection, RawImage, cos_sim } from '@xenova/transformers';
-
-// Load tokenizer and text model
-const tokenizer = await AutoTokenizer.from_pretrained('jinaai/jina-clip-v2');
-const text_model = await CLIPTextModelWithProjection.from_pretrained('jinaai/jina-clip-v2');
-
-// Load processor and vision model
-const processor = await AutoProcessor.from_pretrained('Xenova/clip-vit-base-patch32');
-const vision_model = await CLIPVisionModelWithProjection.from_pretrained('jinaai/jina-clip-v2');
-
-// Run tokenization
-const texts = [
-    'A neural network walks into a bar and forgets why it came.',
-    'Why do programmers prefer dark mode? Because light attracts bugs.',
-];
-const text_inputs = tokenizer(texts, { padding: true, truncation: true });
-
-// Compute text embeddings
-const { text_embeds } = await text_model(text_inputs);
-
-// Read images and run processor
-const urls = [
-    'https://i.pinimg.com/600x315/21/48/7e/21487e8e0970dd366dafaed6ab25d8d8.jpg',
-    'https://i.pinimg.com/736x/c9/f2/3e/c9f23e212529f13f19bad5602d84b78b.jpg'
-];
-const image = await Promise.all(urls.map(url => RawImage.read(url)));
-const image_inputs = await processor(image);
-
-// Compute vision embeddings
-const { image_embeds } = await vision_model(image_inputs);
-
-// Compute similarities
-console.log(cos_sim(text_embeds[0].data, text_embeds[1].data)) // text embedding similarity
-console.log(cos_sim(text_embeds[0].data, image_embeds[0].data)) // text-image cross-modal similarity
-console.log(cos_sim(text_embeds[0].data, image_embeds[1].data)) // text-image cross-modal similarity
-console.log(cos_sim(text_embeds[1].data, image_embeds[0].data)) // text-image cross-modal similarity
-console.log(cos_sim(text_embeds[1].data, image_embeds[1].data)) // text-image cross-modal similarity
+query = "A chef cooking pasta in the kitchen"  # English
+query_embeddings = model.encode(query)
 ```
 
````
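Unlike the AutoModel example, the sentence-transformers snippet stops after encoding and never prints any scores. The same comparisons can be reproduced with `sentence_transformers.util.cos_sim`; a sketch, assuming the `query_embeddings`, `image_embeddings`, and `text_embeddings` variables from the block above:

```python
from sentence_transformers import util

# One cross-modal score, then one score per corpus sentence (Ar ... Ko),
# mirroring the print statements in the AutoModel example.
print("En -> Img:", util.cos_sim(query_embeddings, image_embeddings))
print("En -> corpus:", util.cos_sim(query_embeddings, text_embeddings))
```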