Update README.md #4
opened by bwang0911

README.md CHANGED
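This change swaps the English-only example sentences for a multilingual corpus, replaces the two sample image URLs with a single chef image, adds cross-modal and cross-lingual similarity printouts against an English query, and removes the transformers.js example.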
````diff
@@ -168,18 +168,22 @@ from transformers import AutoModel
 # Initialize the model
 model = AutoModel.from_pretrained("jinaai/jina-clip-v2", trust_remote_code=True)
 
-#
+# Corpus
 sentences = [
-    "
-    "
+    "طاهٍ يطبخ المعكرونة في المطبخ",  # Arabic
+    "厨师在厨房煮意大利面",  # Chinese
+    "Un chef qui cuisine des pâtes dans la cuisine",  # French
+    "Ein Koch, der in der Küche Pasta kocht",  # German
+    "Ένας σεφ μαγειρεύει ζυμαρικά στην κουζίνα",  # Greek
+    "एक शेफ रसोई में पास्ता पका रहा है",  # Hindi
+    "Uno chef che cucina la pasta in cucina",  # Italian
+    "シェフがキッチンでパスタを作っている",  # Japanese
+    "셰프가 주방에서 파스타를 요리하고 있다",  # Korean
 ]
 
-# Public image URLs
-image_urls = [
-    "https://i.pinimg.com/600x315/21/48/7e/21487e8e0970dd366dafaed6ab25d8d8.jpg",
-    "https://i.pinimg.com/736x/c9/f2/3e/c9f23e212529f13f19bad5602d84b78b.jpg",
-]
 
+# Public image URLs or PIL images
+image_urls = ["https://i.ibb.co/bRGGJxD/DALL-E-2024-11-20-13-44-46-A-highly-realistic-8-K-photographic-image-of-a-chef-cooking-pasta-in-a-mo.webp"]
 # Choose a matryoshka dimension, set to None to get the full 1024-dim vectors
 truncate_dim = 512
 
````
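The `truncate_dim` argument in the hunk above works because the README describes the vectors as matryoshka embeddings: the leading components carry most of the signal, so a shorter vector comes from slicing off the tail and re-normalizing. A minimal sketch of that operation, assuming the model returns float vectors; `truncate_and_renormalize` is an illustrative helper, not part of the model's API:

```python
import numpy as np

def truncate_and_renormalize(emb: np.ndarray, dim: int) -> np.ndarray:
    """Keep the first `dim` components, then rescale to unit length
    so dot products stay valid cosine similarities."""
    truncated = emb[..., :dim]
    return truncated / np.linalg.norm(truncated, axis=-1, keepdims=True)

full = np.random.randn(1024).astype(np.float32)  # stand-in for a full 1024-dim embedding
small = truncate_and_renormalize(full, 512)      # comparable 512-dim vector
print(small.shape)  # (512,)
```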
````diff
@@ -190,16 +194,23 @@ image_embeddings = model.encode_image(
 )  # also accepts PIL.Image, local filenames, dataURI
 
 # Encode query text
-query = "
-
+query = "A chef cooking pasta in the kitchen"  # English
+query_embeddings = model.encode_text(
     query, task="retrieval.query", truncate_dim=truncate_dim
 )
 
-#
-print(
-
-print(
-print(
+# Text-to-image similarity
+print("En -> Img: " + str(query_embeddings @ image_embeddings[0].T))
+# Text-to-text (cross-lingual) similarity
+print("En -> Ar: " + str(query_embeddings @ text_embeddings[0].T))
+print("En -> Zh: " + str(query_embeddings @ text_embeddings[1].T))
+print("En -> Fr: " + str(query_embeddings @ text_embeddings[2].T))
+print("En -> De: " + str(query_embeddings @ text_embeddings[3].T))
+print("En -> El: " + str(query_embeddings @ text_embeddings[4].T))
+print("En -> Hi: " + str(query_embeddings @ text_embeddings[5].T))
+print("En -> It: " + str(query_embeddings @ text_embeddings[6].T))
+print("En -> Ja: " + str(query_embeddings @ text_embeddings[7].T))
+print("En -> Ko: " + str(query_embeddings @ text_embeddings[8].T))
 ```
 
 or via sentence-transformers:
````
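The `@` products in the hunk above are plain dot products; they read as cosine similarities only if `encode_text` and `encode_image` return unit-length vectors, which is the usual CLIP convention but is an assumption here. An explicit cosine (a sketch; `cosine` is a local helper, not a model method) computes the same score regardless of normalization:

```python
import numpy as np

def cosine(a: np.ndarray, b: np.ndarray) -> float:
    """Cosine similarity without assuming unit-length inputs."""
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# e.g. cosine(query_embeddings, image_embeddings[0])
# in place of query_embeddings @ image_embeddings[0].T
```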
````diff
@@ -214,64 +225,26 @@ model = SentenceTransformer(
     "jinaai/jina-clip-v2", trust_remote_code=True, truncate_dim=truncate_dim
 )
 
-#
+# Corpus
 sentences = [
-    "
-    "
+    "طاهٍ يطبخ المعكرونة في المطبخ",  # Arabic
+    "厨师在厨房煮意大利面",  # Chinese
+    "Un chef qui cuisine des pâtes dans la cuisine",  # French
+    "Ein Koch, der in der Küche Pasta kocht",  # German
+    "Ένας σεφ μαγειρεύει ζυμαρικά στην κουζίνα",  # Greek
+    "एक शेफ रसोई में पास्ता पका रहा है",  # Hindi
+    "Uno chef che cucina la pasta in cucina",  # Italian
+    "シェフがキッチンでパスタを作っている",  # Japanese
+    "셰프가 주방에서 파스타를 요리하고 있다",  # Korean
 ]
 
-# Public image URLs
-image_urls = [
-    "https://i.pinimg.com/600x315/21/48/7e/21487e8e0970dd366dafaed6ab25d8d8.jpg",
-    "https://i.pinimg.com/736x/c9/f2/3e/c9f23e212529f13f19bad5602d84b78b.jpg",
-]
+# Public image URLs or PIL images
+image_urls = ["https://i.ibb.co/bRGGJxD/DALL-E-2024-11-20-13-44-46-A-highly-realistic-8-K-photographic-image-of-a-chef-cooking-pasta-in-a-mo.webp"]
 
 text_embeddings = model.encode(sentences)
 image_embeddings = model.encode(image_urls)
-query = "
-
-```
-
-JavaScript developers can use Jina CLIP via the [transformers.js](https://huggingface.co/docs/transformers.js) library. Note that to use this model, you need to install transformers.js [v3](https://github.com/xenova/transformers.js/tree/v3) from source using `npm install xenova/transformers.js#v3`.
-
-```js
-import { AutoTokenizer, CLIPTextModelWithProjection, AutoProcessor, CLIPVisionModelWithProjection, RawImage, cos_sim } from '@xenova/transformers';
-
-// Load tokenizer and text model
-const tokenizer = await AutoTokenizer.from_pretrained('jinaai/jina-clip-v2');
-const text_model = await CLIPTextModelWithProjection.from_pretrained('jinaai/jina-clip-v2');
-
-// Load processor and vision model
-const processor = await AutoProcessor.from_pretrained('Xenova/clip-vit-base-patch32');
-const vision_model = await CLIPVisionModelWithProjection.from_pretrained('jinaai/jina-clip-v2');
-
-// Run tokenization
-const texts = [
-    'A neural network walks into a bar and forgets why it came.',
-    'Why do programmers prefer dark mode? Because light attracts bugs.',
-];
-const text_inputs = tokenizer(texts, { padding: true, truncation: true });
-
-// Compute text embeddings
-const { text_embeds } = await text_model(text_inputs);
-
-// Read images and run processor
-const urls = [
-    'https://i.pinimg.com/600x315/21/48/7e/21487e8e0970dd366dafaed6ab25d8d8.jpg',
-    'https://i.pinimg.com/736x/c9/f2/3e/c9f23e212529f13f19bad5602d84b78b.jpg'
-];
-const image = await Promise.all(urls.map(url => RawImage.read(url)));
-const image_inputs = await processor(image);
-
-// Compute vision embeddings
-const { image_embeds } = await vision_model(image_inputs);
-
-// Compute similarities
-console.log(cos_sim(text_embeds[0].data, text_embeds[1].data)) // text embedding similarity
-console.log(cos_sim(text_embeds[0].data, image_embeds[0].data)) // text-image cross-modal similarity
-console.log(cos_sim(text_embeds[0].data, image_embeds[1].data)) // text-image cross-modal similarity
-console.log(cos_sim(text_embeds[1].data, image_embeds[0].data)) // text-image cross-modal similarity
-console.log(cos_sim(text_embeds[1].data, image_embeds[1].data)) // text-image cross-modal similarity
+query = "A chef cooking pasta in the kitchen"  # English
+query_embeddings = model.encode(query)
 ```
 
````
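Unlike the AutoModel example, the sentence-transformers snippet stops after encoding and never prints any scores. The same comparisons can be reproduced with `sentence_transformers.util.cos_sim`; a sketch, assuming the `query_embeddings`, `image_embeddings`, and `text_embeddings` variables from the block above:

```python
from sentence_transformers import util

# One cross-modal score, then one score per corpus sentence (Ar ... Ko),
# mirroring the print statements in the AutoModel example.
print("En -> Img:", util.cos_sim(query_embeddings, image_embeddings))
print("En -> corpus:", util.cos_sim(query_embeddings, text_embeddings))
```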