It can encode both text and image.

Details are coming soon.
## Usage
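The snippet below loads the model with `sentence-transformers`, patches its `tokenize` and `forward` so that images can be passed alongside text, and then embeds two queries against a mixed list of text and image documents: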
```python
import functools
from io import BytesIO
from typing import Dict

import numpy as np
import PIL.Image  # `import PIL` alone does not load the Image submodule
import torch
from sentence_transformers import SentenceTransformer
from transformers import SiglipImageProcessor


def jasper_vl_forward(self, features: dict[str, torch.Tensor], **kwargs) -> dict[str, torch.Tensor]:
    # Forward the text inputs, plus pixel_values when images are present.
    trans_features = {"input_ids": features["input_ids"], "attention_mask": features["attention_mask"]}
    if "pixel_values" in features:
        trans_features["pixel_values"] = features["pixel_values"]
    sentence_embedding = self.auto_model(**trans_features, **kwargs)["sentence_embedding"]
    features.update({"sentence_embedding": sentence_embedding})
    return features


def jasper_vl_tokenize(self, texts: list[Dict] | list[str]) -> dict[str, torch.Tensor]:
    # Each image is expanded into a fixed number of placeholder tokens in the text,
    # while the actual pixels are collected for the image processor.
    img_start_token = "<|jasper_img_start|>"
    img_token = "<|jasper_img_token|>"
    img_end_token = "<|jasper_img_end|>"
    num_img_tokens = 300

    def process_text_item(item):
        if isinstance(item, str):
            return item, []
        text, images = "", []
        for sub_item in item:
            if sub_item["type"] == "text":
                text += sub_item["content"]
            elif sub_item["type"] == "image_bytes":
                text += img_start_token + img_token * num_img_tokens + img_end_token
                images.append(PIL.Image.open(BytesIO(sub_item["content"])).convert("RGB"))
            elif sub_item["type"] == "image_path":
                text += img_start_token + img_token * num_img_tokens + img_end_token
                images.append(PIL.Image.open(sub_item["content"]).convert("RGB"))
            else:
                raise ValueError(f"unknown data type {sub_item['type']}")
        return text, images

    all_texts, all_images = [], []
    for item in texts:
        text, images = process_text_item(item)
        all_texts.append(text)
        all_images.extend(images)
    ipt = self.tokenizer(all_texts, padding="longest", truncation=True, max_length=1024, return_tensors="pt")
    if all_images:
        ipt["pixel_values"] = self.processor(images=all_images, return_tensors="pt")["pixel_values"]
        # For demonstration, `use_gpu` is read from the enclosing module scope;
        # adapt this to your own environment.
        if use_gpu:
            ipt["pixel_values"] = ipt["pixel_values"].bfloat16()
    return ipt


DOC1 = """
Blue light is scattered in all directions by the tiny molecules of air in Earth's atmosphere.
Blue is scattered more than other colors because it travels as shorter, smaller waves. This is why we see a blue sky most of the time.
Closer to the horizon, the sky fades to a lighter blue or white.
"""
DOC2 = """
When choosing colors, you can consider the following factors:
Color theory: Understand how colors work together and how they can evoke different reactions.
Color psychology: Consider how colors affect emotions, behaviors, and responses.
Brand identity: Colors can convey meaning and information about a brand.
Mood: Consider the mood you want to create. For example, brighter colors can feel cheerful, while cooler colors can be calming.
Space: Consider the size of the space and the amount of natural light it receives. Dark colors can make a room feel smaller, while light colors can make it feel larger.
Color wheel: Use the color wheel to identify primary, secondary, and tertiary colors.
Color combinations: Decide how to best complement your preferred color with others.
Color palette: Limit your color palette to a main color and one or two additional colors.
60-30-10 rule: Use a primary color 60% of the time, a secondary color 30% of the time, and an accent color 10% of the time
"""
prompt_dict = {
    "s2p_query": "Instruct: Given a web search query, retrieve relevant passages that answer the query.\nQuery: ",
    "s2s_query": "Instruct: Retrieve semantically similar text.\nQuery: "
}

if __name__ == "__main__":
    # Load the model.
    use_gpu = False
    model_name = "infgrad/jasper_en_vision_language_v1"
    model = SentenceTransformer(
        model_name,
        trust_remote_code=True,
        device="cpu",  # set use_gpu = True and device="cuda" for GPU inference
        model_kwargs={
            "torch_dtype": torch.bfloat16 if use_gpu else torch.float32,
            "attn_implementation": "sdpa"
        },
        # vector_dim must be one of 12288, 1024, 512, or 256; 1024 is recommended.
        # Set is_text_encoder to True if you do not encode images.
        config_kwargs={"is_text_encoder": False, "vector_dim": 1024},
        tokenizer_kwargs={"padding_side": "right"}
    )
    # The jasper model cannot be used with SentenceTransformer as-is,
    # so patch its image processor, tokenize, and forward.
    model.processor = SiglipImageProcessor.from_pretrained(model_name)
    model.tokenize = functools.partial(jasper_vl_tokenize, model)
    model._first_module().forward = functools.partial(jasper_vl_forward, model._first_module())
    model.max_seq_length = 1024
    # Data: two queries and a mixed list of text and image documents.
    q_list = [
        "Why the sky is blue?",
        "how to choose suitable color",
    ]
    doc_list = [
        DOC1,
        [{"type": "image_path", "content": "./assets/img1.png"}, {"type": "text", "content": "Hope this image helps!"}],
        DOC2,
        [{"type": "image_path", "content": "./assets/img2.png"}],
    ]
    q_vecs = model.encode([prompt_dict["s2p_query"] + text for text in q_list], normalize_embeddings=True)
    doc_vecs = model.encode(doc_list, normalize_embeddings=True)
    print(np.matmul(q_vecs, doc_vecs.T))
    # The output is:
    # [[0.777521   0.75944513 0.24291277 0.2187205 ]
    #  [0.32261407 0.30536035 0.74208796 0.5484469 ]]
```
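Since the embeddings are encoded with `normalize_embeddings=True`, the dot products above are cosine similarities. As a minimal follow-up sketch (not part of the original example), the best-matching document per query is just the argmax of each row:

```python
# Minimal sketch reusing q_list, q_vecs, and doc_vecs from the example above.
# With normalized embeddings, dot product equals cosine similarity.
scores = np.matmul(q_vecs, doc_vecs.T)
for query, row in zip(q_list, scores):
    best = int(np.argmax(row))
    print(f"{query!r} -> doc #{best} (score={row[best]:.4f})")
```

With the sample output above, this selects DOC1 for the sky question and DOC2 for the color question.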
## License
**This model should not be used for any commercial purpose!**