Files changed (1)
  1. README.md +40 -67
README.md CHANGED
@@ -168,18 +168,22 @@ from transformers import AutoModel
  # Initialize the model
  model = AutoModel.from_pretrained("jinaai/jina-clip-v2", trust_remote_code=True)

- # Sentences
+ # Corpus
  sentences = [
- "A neural network walks into a bar and forgets why it came.",
- "Why do programmers prefer dark mode? Because light attracts bugs.",
+ "طاهٍ يطبخ المعكرونة في المطبخ", # Arabic
+ "厨师在厨房煮意大利面", # Chinese
+ "Un chef qui cuisine des pâtes dans la cuisine", # French
+ "Ein Koch, der in der Küche Pasta kocht", # German
+ "Ένας σεφ μαγειρεύει ζυμαρικά στην κουζίνα", # Greek
+ "एक शेफ रसोई में पास्ता पका रहा है", # Hindi
+ "Uno chef che cucina la pasta in cucina", # Italian
+ "シェフがキッチンでパスタを作っている", # Japanese
+ "셰프가 주방에서 파스타를 요리하고 있다", # Korean
  ]

- # Public image URLs
- image_urls = [
- "https://i.pinimg.com/600x315/21/48/7e/21487e8e0970dd366dafaed6ab25d8d8.jpg",
- "https://i.pinimg.com/736x/c9/f2/3e/c9f23e212529f13f19bad5602d84b78b.jpg",
- ]

+ # Public image URLs or Pil
+ image_urls = ["https://i.ibb.co/bRGGJxD/DALL-E-2024-11-20-13-44-46-A-highly-realistic-8-K-photographic-image-of-a-chef-cooking-pasta-in-a-mo.webp"]
  # Choose a matryoshka dimension, set to None to get the full 1024-dim vectors
  truncate_dim = 512

@@ -190,16 +194,23 @@ image_embeddings = model.encode_image(
  ) # also accepts PIL.image, local filenames, dataURI

  # Encode query text
- query = "tell me a joke about AI"
- text_query_embeddings = model.encode_text(
+ query = "A chef cooking pasta in the kitchen" # English
+ query_embeddings = model.encode_text(
  query, task="retrieval.query", truncate_dim=truncate_dim
  )

- # Compute similarities
- print(text_query_embeddings @ text_embeddings[1].T) # text embedding similarity
- print(text_query_embeddings @ image_embeddings[0].T) # text-image cross-modal similarity
- print(image_embeddings[0] @ image_embeddings[1].T) # image-image cross-modal similarity
- print(image_embeddings[0] @ text_embeddings[0].T) # image-text cross-modal similarity
+ # text to image
+ print("En -> Img: " + str(query_embeddings @ image_embeddings[0].T))
+ # text to text
+ print("En -> Ar: " + str(query_embeddings @ text_embeddings[0].T))
+ print("En -> Zh: " + str(query_embeddings @ text_embeddings[1].T))
+ print("En -> Fr: " + str(query_embeddings @ text_embeddings[2].T))
+ print("En -> De: " + str(query_embeddings @ text_embeddings[3].T))
+ print("En -> Gr: " + str(query_embeddings @ text_embeddings[4].T))
+ print("En -> Hi: " + str(query_embeddings @ text_embeddings[5].T))
+ print("En -> It: " + str(query_embeddings @ text_embeddings[6].T))
+ print("En -> Jp: " + str(query_embeddings @ text_embeddings[7].T))
+ print("En -> Ko: " + str(query_embeddings @ text_embeddings[8].T))
  ```

  or via sentence-transformers:
@@ -214,64 +225,26 @@ model = SentenceTransformer(
  "jinaai/jina-clip-v2", trust_remote_code=True, truncate_dim=truncate_dim
  )

- # Sentences
+ # Corpus
  sentences = [
- "A neural network walks into a bar and forgets why it came.",
- "Why do programmers prefer dark mode? Because light attracts bugs.",
+ "طاهٍ يطبخ المعكرونة في المطبخ", # Arabic
+ "厨师在厨房煮意大利面", # Chinese
+ "Un chef qui cuisine des pâtes dans la cuisine", # French
+ "Ein Koch, der in der Küche Pasta kocht", # German
+ "Ένας σεφ μαγειρεύει ζυμαρικά στην κουζίνα", # Greek
+ "एक शेफ रसोई में पास्ता पका रहा है", # Hindi
+ "Uno chef che cucina la pasta in cucina", # Italian
+ "シェフがキッチンでパスタを作っている", # Japanese
+ "셰프가 주방에서 파스타를 요리하고 있다", # Korean
  ]

- # Public image URLs
- image_urls = [
- "https://i.pinimg.com/600x315/21/48/7e/21487e8e0970dd366dafaed6ab25d8d8.jpg",
- "https://i.pinimg.com/736x/c9/f2/3e/c9f23e212529f13f19bad5602d84b78b.jpg",
- ]
+ # Public image URLs or Pil
+ image_urls = ["https://i.ibb.co/bRGGJxD/DALL-E-2024-11-20-13-44-46-A-highly-realistic-8-K-photographic-image-of-a-chef-cooking-pasta-in-a-mo.webp"]

  text_embeddings = model.encode(sentences)
  image_embeddings = model.encode(image_urls)
- query = "tell me a joke about AI"
- text_query_embeddings = model.encode(query, prompt_name="retrieval.query")
- ```
-
- JavaScript developers can use Jina CLIP via the [transformers.js](https://huggingface.co/docs/transformers.js) library. Note that to use this model, you need to install transformers.js [v3](https://github.com/xenova/transformers.js/tree/v3) from source using `npm install xenova/transformers.js#v3`.
-
- ```js
- import { AutoTokenizer, CLIPTextModelWithProjection, AutoProcessor, CLIPVisionModelWithProjection, RawImage, cos_sim } from '@xenova/transformers';
-
- // Load tokenizer and text model
- const tokenizer = await AutoTokenizer.from_pretrained('jinaai/jina-clip-v2');
- const text_model = await CLIPTextModelWithProjection.from_pretrained('jinaai/jina-clip-v2');
-
- // Load processor and vision model
- const processor = await AutoProcessor.from_pretrained('Xenova/clip-vit-base-patch32');
- const vision_model = await CLIPVisionModelWithProjection.from_pretrained('jinaai/jina-clip-v2');
-
- // Run tokenization
- const texts = [
- 'A neural network walks into a bar and forgets why it came.',
- 'Why do programmers prefer dark mode? Because light attracts bugs.',
- ];
- const text_inputs = tokenizer(texts, { padding: true, truncation: true });
-
- // Compute text embeddings
- const { text_embeds } = await text_model(text_inputs);
-
- // Read images and run processor
- const urls = [
- 'https://i.pinimg.com/600x315/21/48/7e/21487e8e0970dd366dafaed6ab25d8d8.jpg',
- 'https://i.pinimg.com/736x/c9/f2/3e/c9f23e212529f13f19bad5602d84b78b.jpg'
- ];
- const image = await Promise.all(urls.map(url => RawImage.read(url)));
- const image_inputs = await processor(image);
-
- // Compute vision embeddings
- const { image_embeds } = await vision_model(image_inputs);
-
- // Compute similarities
- console.log(cos_sim(text_embeds[0].data, text_embeds[1].data)) // text embedding similarity
- console.log(cos_sim(text_embeds[0].data, image_embeds[0].data)) // text-image cross-modal similarity
- console.log(cos_sim(text_embeds[0].data, image_embeds[1].data)) // text-image cross-modal similarity
- console.log(cos_sim(text_embeds[1].data, image_embeds[0].data)) // text-image cross-modal similarity
- console.log(cos_sim(text_embeds[1].data, image_embeds[1].data)) // text-image cross-modal similarity
+ query = "A chef cooking pasta in the kitchen" # English
+ query_embeddings = model.encode(query)
  ```

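The hunks above omit the unchanged lines where the corpus and image are actually encoded (only the closing `) # also accepts PIL.image, local filenames, dataURI` is visible as context). For reference, below is a minimal sketch of how the updated `transformers` snippet fits together. The corpus is abridged to three languages here, and the `encode_text`/`encode_image` calls for the corpus and image, along with their exact arguments, are assumptions inferred from the surrounding context rather than lines shown in this diff.

```python
# Sketch of the updated example, assembled from the hunks above.
# NOTE: the corpus/image encoding calls and their arguments are assumed;
# only their closing parenthesis appears as unchanged context in the diff.
from transformers import AutoModel

# Initialize the model
model = AutoModel.from_pretrained("jinaai/jina-clip-v2", trust_remote_code=True)

# Corpus (abridged; the updated README lists nine languages)
sentences = [
    "Un chef qui cuisine des pâtes dans la cuisine",  # French
    "Ein Koch, der in der Küche Pasta kocht",  # German
    "シェフがキッチンでパスタを作っている",  # Japanese
]

# Public image URL from the updated example
# (PIL images, local filenames, and data URIs are also accepted)
image_urls = [
    "https://i.ibb.co/bRGGJxD/DALL-E-2024-11-20-13-44-46-A-highly-realistic-8-K-photographic-image-of-a-chef-cooking-pasta-in-a-mo.webp"
]

# Choose a matryoshka dimension; None keeps the full 1024-dim vectors
truncate_dim = 512

# Assumed encoding calls (hidden between the hunks)
text_embeddings = model.encode_text(sentences, truncate_dim=truncate_dim)
image_embeddings = model.encode_image(image_urls, truncate_dim=truncate_dim)

# Encode the English query with the retrieval task prompt
query = "A chef cooking pasta in the kitchen"  # English
query_embeddings = model.encode_text(
    query, task="retrieval.query", truncate_dim=truncate_dim
)

# As in the example, plain dot products serve as similarity scores
print("En -> Img:", query_embeddings @ image_embeddings[0].T)  # text-to-image
print("En -> Fr: ", query_embeddings @ text_embeddings[0].T)   # text-to-text
print("En -> De: ", query_embeddings @ text_embeddings[1].T)
print("En -> Jp: ", query_embeddings @ text_embeddings[2].T)
```

The sentence-transformers path in the last hunk mirrors this flow, using `model.encode(...)` for the sentences, the image URLs, and the query alike.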