infgrad committed on
Commit
f0ba2ef
·
verified ·
1 Parent(s): afce9e0

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +127 -0
README.md CHANGED
@@ -8979,5 +8979,132 @@ It can encode both text and image.
8979
 
8980
  Details are coming soon.
8981
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8982
  ## License
8983
  **This model should not be used for any commercial purpose!**
 
8979
 
8980
  Details are coming soon.
8981
 
8982
+ ## Usage
8983
+ ```python
8984
+
8985
+ import functools
8986
+ import PIL
8987
+ import numpy as np
8988
+ import torch
8989
+ from typing import Dict
8990
+ from io import BytesIO
8991
+ from transformers import SiglipImageProcessor
8992
+ from sentence_transformers import SentenceTransformer
8993
+
8994
+
8995
+ def jasper_vl_forward(self, features: dict[str, torch.Tensor], **kwargs) -> dict[str, torch.Tensor]:
8996
+ trans_features = {"input_ids": features["input_ids"], "attention_mask": features["attention_mask"]}
8997
+ if "pixel_values" in features:
8998
+ trans_features["pixel_values"] = features["pixel_values"]
8999
+ sentence_embedding = self.auto_model(**trans_features, **kwargs)["sentence_embedding"]
9000
+ features.update({"sentence_embedding": sentence_embedding})
9001
+ return features
9002
+
9003
+
9004
+ def jasper_vl_tokenize(self, texts: list[Dict] | list[str]) -> dict[str, torch.Tensor]:
9005
+ img_start_token = "<|jasper_img_start|>"
9006
+ img_token = "<|jasper_img_token|>"
9007
+ img_end_token = "<|jasper_img_end|>"
9008
+ num_img_tokens = 300
9009
+
9010
+ def process_text_item(item):
9011
+ if isinstance(item, str):
9012
+ return item, []
9013
+ text, images = "", []
9014
+ for sub_item in item:
9015
+ if sub_item["type"] == "text":
9016
+ text += sub_item["content"]
9017
+ elif sub_item["type"] == "image_bytes":
9018
+ text += img_start_token + img_token * num_img_tokens + img_end_token
9019
+ images.append(PIL.Image.open(BytesIO(sub_item["content"])).convert("RGB"))
9020
+ elif sub_item["type"] == "image_path":
9021
+ text += img_start_token + img_token * num_img_tokens + img_end_token
9022
+ images.append(PIL.Image.open(sub_item["content"]).convert("RGB"))
9023
+ else:
9024
+ raise ValueError(f"unknown data type {sub_item['type']}")
9025
+ return text, images
9026
+
9027
+ all_texts, all_images = [], []
9028
+ for item in texts:
9029
+ text, images = process_text_item(item)
9030
+ all_texts.append(text)
9031
+ all_images.extend(images)
9032
+ ipt = self.tokenizer(all_texts, padding="longest", truncation=True, max_length=1024, return_tensors="pt")
9033
+ if all_images:
9034
+ ipt["pixel_values"] = self.processor(
9035
+ images=all_images,
9036
+ return_tensors="pt"
9037
+ )["pixel_values"]
9038
+ # For demonstration purposes this relies on the external variable `use_gpu`; adapt the code to your own environment.
9039
+ if use_gpu:
9040
+ ipt["pixel_values"] = ipt["pixel_values"].bfloat16()
9041
+ return ipt
9042
+
9043
+
9044
+ DOC1 = """
9045
+ Blue light is scattered in all directions by the tiny molecules of air in Earth's atmosphere.
9046
+ Blue is scattered more than other colors because it travels as shorter, smaller waves. This is why we see a blue sky most of the time.
9047
+ Closer to the horizon, the sky fades to a lighter blue or white.
9048
+ """
9049
+ DOC2 = """
9050
+ When choosing colors, you can consider the following factors:
9051
+ Color theory: Understand how colors work together and how they can evoke different reactions.
9052
+ Color psychology: Consider how colors affect emotions, behaviors, and responses.
9053
+ Brand identity: Colors can convey meaning and information about a brand.
9054
+ Mood: Consider the mood you want to create. For example, brighter colors can feel cheerful, while cooler colors can be calming.
9055
+ Space: Consider the size of the space and the amount of natural light it receives. Dark colors can make a room feel smaller, while light colors can make it feel larger.
9056
+ Color wheel: Use the color wheel to identify primary, secondary, and tertiary colors.
9057
+ Color combinations: Decide how to best complement your preferred color with others.
9058
+ Color palette: Limit your color palette to a main color and one or two additional colors.
9059
+ 60-30-10 rule: Use a primary color 60% of the time, a secondary color 30% of the time, and an accent color 10% of the time
9060
+ """
9061
+ prompt_dict = {
9062
+ "s2p_query": "Instruct: Given a web search query, retrieve relevant passages that answer the query.\nQuery: ",
9063
+ "s2s_query": "Instruct: Retrieve semantically similar text.\nQuery: "
9064
+ }
9065
+ if __name__ == "__main__":
9066
+ # load model
9067
+ use_gpu = False
9068
+ model_name = "infgrad/jasper_en_vision_language_v1"
9069
+ model = SentenceTransformer(
9070
+ model_name,
9071
+ trust_remote_code=True,
9072
+ device="cpu",
9073
+ model_kwargs={
9074
+ "torch_dtype": torch.bfloat16 if use_gpu else torch.float32,
9075
+ "attn_implementation": "sdpa"
9076
+ },
9077
+ # vector_dim must be one of 12288, 1024, 512, 256
9078
+ ## 1024 is recommended
9079
+ # set is_text_encoder to True if you do not encode images
9080
+ config_kwargs={"is_text_encoder": False, "vector_dim": 1024},
9081
+ tokenizer_kwargs={"padding_side": "right"}
9082
+ )
9083
+ # The Jasper model cannot be used directly with SentenceTransformer, so a few modifications are patched in below
9084
+ model.processor = SiglipImageProcessor.from_pretrained(model_name)
9085
+ model.tokenize = functools.partial(jasper_vl_tokenize, model)
9086
+ model._first_module().forward = functools.partial(jasper_vl_forward, model._first_module())
9087
+ model.max_seq_length = 1024
9088
+ # data
9089
+ q_list = [
9090
+ "Why the sky is blue?",
9091
+ "how to choose suitable color",
9092
+ ]
9093
+ doc_list = [
9094
+ DOC1,
9095
+ [{"type": "image_path", "content": "./assets/img1.png"}, {"type": "text", "content": "Hope this image helps!"}],
9096
+ DOC2,
9097
+ [{"type": "image_path", "content": "./assets/img2.png"}],
9098
+
9099
+ ]
9100
+ q_vecs = model.encode([prompt_dict["s2p_query"] + text for text in q_list], normalize_embeddings=True)
9101
+ doc_vecs = model.encode(doc_list, normalize_embeddings=True)
9102
+ print(np.matmul(q_vecs, doc_vecs.T))
9103
+ # the output is:
9104
+ # [[0.777521 0.75944513 0.24291277 0.2187205]
9105
+ # [0.32261407 0.30536035 0.74208796 0.5484469]]
9106
+
9107
+
9108
+ ```
9109
  ## License
9110
  **This model should not be used for any commercial purpose!**