align data format in demo with model training
README.md CHANGED
@@ -69,38 +69,38 @@ model.eval()
 
 # Image + Text -> Text
 image = Image.open(requests.get('https://github.com/haon-chen/mmE5/blob/main/figures/example.jpg?raw=true', stream=True).raw)
-inputs = processor(text='<|image|><|begin_of_text|>Represent the given image with the following question: What is in the image', images=[image], return_tensors="pt").to("cuda")
+inputs = processor(text='<|image|><|begin_of_text|>Represent the given image with the following question: What is in the image\n', images=[image], return_tensors="pt").to("cuda")
 qry_output = last_pooling(model(**inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], inputs['attention_mask'])
 
 string = 'A cat and a dog'
 text_inputs = processor(text=string, return_tensors="pt").to("cuda")
 tgt_output = last_pooling(model(**text_inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], text_inputs['attention_mask'])
 print(string, '=', compute_similarity(qry_output, tgt_output))
-## A cat and a dog = tensor([[0.
+## A cat and a dog = tensor([[0.4219]], device='cuda:0', dtype=torch.bfloat16)
 
 string = 'A cat and a tiger'
 text_inputs = processor(text=string, return_tensors="pt").to("cuda")
 tgt_output = last_pooling(model(**text_inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], text_inputs['attention_mask'])
 print(string, '=', compute_similarity(qry_output, tgt_output))
-## A cat and a tiger = tensor([[0.
+## A cat and a tiger = tensor([[0.3184]], device='cuda:0', dtype=torch.bfloat16)
 
 # Text -> Image
-inputs = processor(text='Find me an everyday image that matches the given caption: A cat and a dog
+inputs = processor(text='Find me an everyday image that matches the given caption: A cat and a dog.\n', return_tensors="pt").to("cuda")
 qry_output = last_pooling(model(**inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], inputs['attention_mask'])
 
-string = '<|image|><|begin_of_text|>Represent the given image
+string = '<|image|><|begin_of_text|>Represent the given image.\n'
 tgt_inputs = processor(text=string, images=[image], return_tensors="pt").to("cuda")
 tgt_output = last_pooling(model(**tgt_inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], tgt_inputs['attention_mask'])
 print(string, '=', compute_similarity(qry_output, tgt_output))
-## <|image|><|begin_of_text|>Represent the given image. = tensor([[0.
+## <|image|><|begin_of_text|>Represent the given image. = tensor([[0.4414]], device='cuda:0', dtype=torch.bfloat16)
 
-inputs = processor(text='Find me an everyday image that matches the given caption: A cat and a tiger
+inputs = processor(text='Find me an everyday image that matches the given caption: A cat and a tiger.\n', return_tensors="pt").to("cuda")
 qry_output = last_pooling(model(**inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], inputs['attention_mask'])
-string = '<|image|><|begin_of_text|>Represent the given image
+string = '<|image|><|begin_of_text|>Represent the given image.\n'
 tgt_inputs = processor(text=string, images=[image], return_tensors="pt").to("cuda")
 tgt_output = last_pooling(model(**tgt_inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], tgt_inputs['attention_mask'])
 print(string, '=', compute_similarity(qry_output, tgt_output))
-## <|image|><|begin_of_text|>Represent the given image. = tensor([[0.
+## <|image|><|begin_of_text|>Represent the given image. = tensor([[0.3730]], device='cuda:0', dtype=torch.bfloat16)
 ```
 
 ### Sentence Transformers
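The demo in this hunk relies on two helpers, `last_pooling` and `compute_similarity`, which are defined earlier in the README (above the lines shown here). For readers looking only at this diff, the sketch below shows what such helpers are assumed to do: take the hidden state of each sequence's last non-padding token as the embedding, L2-normalize it, and score query/target pairs with a dot product. The exact names and behavior here are assumptions; see the full README for the actual definitions.

```python
import torch
import torch.nn.functional as F

def last_pooling(last_hidden_state, attention_mask, normalize=True):
    # Assumed helper: position of the last non-padding token in each sequence.
    sequence_lengths = attention_mask.sum(dim=1) - 1
    batch_size = last_hidden_state.shape[0]
    # Use that token's hidden state as the sequence embedding.
    reps = last_hidden_state[torch.arange(batch_size, device=last_hidden_state.device), sequence_lengths]
    if normalize:
        # L2-normalize so the dot product below behaves like cosine similarity.
        reps = F.normalize(reps, p=2, dim=-1)
    return reps

def compute_similarity(q_reps, p_reps):
    # Assumed helper: query-by-target similarity matrix (cosine similarity for normalized reps).
    return torch.matmul(q_reps, p_reps.transpose(0, 1))
```

Last-token pooling is a natural fit for a decoder-only backbone: with causal attention, only the final token has attended to the entire prompt (including the image tokens), so its hidden state serves as the sequence summary, which is also why the prompt format fed to the processor must match the format used during training.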