It can encode both text and image.

Details are coming soon.
## Usage
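The snippet below loads the model with `sentence-transformers`, patches its `tokenize` and `forward` so that images can be passed alongside text, and then embeds two queries against a mixed list of text and image documents: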
```python
import functools
from io import BytesIO
from typing import Dict

import numpy as np
import PIL.Image  # `import PIL` alone does not load the Image submodule
import torch
from sentence_transformers import SentenceTransformer
from transformers import SiglipImageProcessor


def jasper_vl_forward(self, features: dict[str, torch.Tensor], **kwargs) -> dict[str, torch.Tensor]:
    # Forward the text inputs, plus pixel_values when images are present.
    trans_features = {"input_ids": features["input_ids"], "attention_mask": features["attention_mask"]}
    if "pixel_values" in features:
        trans_features["pixel_values"] = features["pixel_values"]
    sentence_embedding = self.auto_model(**trans_features, **kwargs)["sentence_embedding"]
    features.update({"sentence_embedding": sentence_embedding})
    return features


def jasper_vl_tokenize(self, texts: list[Dict] | list[str]) -> dict[str, torch.Tensor]:
    # Each image is expanded into a fixed number of placeholder tokens in the text,
    # while the actual pixels are collected for the image processor.
    img_start_token = "<|jasper_img_start|>"
    img_token = "<|jasper_img_token|>"
    img_end_token = "<|jasper_img_end|>"
    num_img_tokens = 300

    def process_text_item(item):
        if isinstance(item, str):
            return item, []
        text, images = "", []
        for sub_item in item:
            if sub_item["type"] == "text":
                text += sub_item["content"]
            elif sub_item["type"] == "image_bytes":
                text += img_start_token + img_token * num_img_tokens + img_end_token
                images.append(PIL.Image.open(BytesIO(sub_item["content"])).convert("RGB"))
            elif sub_item["type"] == "image_path":
                text += img_start_token + img_token * num_img_tokens + img_end_token
                images.append(PIL.Image.open(sub_item["content"]).convert("RGB"))
            else:
                raise ValueError(f"unknown data type {sub_item['type']}")
        return text, images

    all_texts, all_images = [], []
    for item in texts:
        text, images = process_text_item(item)
        all_texts.append(text)
        all_images.extend(images)
    ipt = self.tokenizer(all_texts, padding="longest", truncation=True, max_length=1024, return_tensors="pt")
    if all_images:
        ipt["pixel_values"] = self.processor(images=all_images, return_tensors="pt")["pixel_values"]
        # For demonstration, `use_gpu` is read from the enclosing module scope;
        # adapt this to your own environment.
        if use_gpu:
            ipt["pixel_values"] = ipt["pixel_values"].bfloat16()
    return ipt


DOC1 = """
Blue light is scattered in all directions by the tiny molecules of air in Earth's atmosphere.
Blue is scattered more than other colors because it travels as shorter, smaller waves. This is why we see a blue sky most of the time.
Closer to the horizon, the sky fades to a lighter blue or white.
"""
DOC2 = """
When choosing colors, you can consider the following factors:
Color theory: Understand how colors work together and how they can evoke different reactions.
Color psychology: Consider how colors affect emotions, behaviors, and responses.
Brand identity: Colors can convey meaning and information about a brand.
Mood: Consider the mood you want to create. For example, brighter colors can feel cheerful, while cooler colors can be calming.
Space: Consider the size of the space and the amount of natural light it receives. Dark colors can make a room feel smaller, while light colors can make it feel larger.
Color wheel: Use the color wheel to identify primary, secondary, and tertiary colors.
Color combinations: Decide how to best complement your preferred color with others.
Color palette: Limit your color palette to a main color and one or two additional colors.
60-30-10 rule: Use a primary color 60% of the time, a secondary color 30% of the time, and an accent color 10% of the time
"""
prompt_dict = {
    "s2p_query": "Instruct: Given a web search query, retrieve relevant passages that answer the query.\nQuery: ",
    "s2s_query": "Instruct: Retrieve semantically similar text.\nQuery: "
}

if __name__ == "__main__":
    # Load the model.
    use_gpu = False
    model_name = "infgrad/jasper_en_vision_language_v1"
    model = SentenceTransformer(
        model_name,
        trust_remote_code=True,
        device="cpu",  # set use_gpu = True and device="cuda" for GPU inference
        model_kwargs={
            "torch_dtype": torch.bfloat16 if use_gpu else torch.float32,
            "attn_implementation": "sdpa"
        },
        # vector_dim must be one of 12288, 1024, 512, or 256; 1024 is recommended.
        # Set is_text_encoder to True if you do not encode images.
        config_kwargs={"is_text_encoder": False, "vector_dim": 1024},
        tokenizer_kwargs={"padding_side": "right"}
    )
    # The jasper model cannot be used with SentenceTransformer as-is,
    # so patch its image processor, tokenize, and forward.
    model.processor = SiglipImageProcessor.from_pretrained(model_name)
    model.tokenize = functools.partial(jasper_vl_tokenize, model)
    model._first_module().forward = functools.partial(jasper_vl_forward, model._first_module())
    model.max_seq_length = 1024
    # Data: two queries and a mixed list of text and image documents.
    q_list = [
        "Why the sky is blue?",
        "how to choose suitable color",
    ]
    doc_list = [
        DOC1,
        [{"type": "image_path", "content": "./assets/img1.png"}, {"type": "text", "content": "Hope this image helps!"}],
        DOC2,
        [{"type": "image_path", "content": "./assets/img2.png"}],
    ]
    q_vecs = model.encode([prompt_dict["s2p_query"] + text for text in q_list], normalize_embeddings=True)
    doc_vecs = model.encode(doc_list, normalize_embeddings=True)
    print(np.matmul(q_vecs, doc_vecs.T))
    # The output is:
    # [[0.777521   0.75944513 0.24291277 0.2187205 ]
    #  [0.32261407 0.30536035 0.74208796 0.5484469 ]]
```
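Since the embeddings are encoded with `normalize_embeddings=True`, the dot products above are cosine similarities. As a minimal follow-up sketch (not part of the original example), the best-matching document per query is just the argmax of each row:

```python
# Minimal sketch reusing q_list, q_vecs, and doc_vecs from the example above.
# With normalized embeddings, dot product equals cosine similarity.
scores = np.matmul(q_vecs, doc_vecs.T)
for query, row in zip(q_list, scores):
    best = int(np.argmax(row))
    print(f"{query!r} -> doc #{best} (score={row[best]:.4f})")
```

With the sample output above, this selects DOC1 for the sky question and DOC2 for the color question.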
## License
**This model should not be used for any commercial purpose!**