Haon-Chen committed on
Commit
e990ec5
·
1 Parent(s): 03bac29

align data format in demo with model training

Browse files
Files changed (1) hide show
  1. README.md +9 -9
README.md CHANGED
@@ -69,38 +69,38 @@ model.eval()
69
 
70
  # Image + Text -> Text
71
  image = Image.open(requests.get('https://github.com/haon-chen/mmE5/blob/main/figures/example.jpg?raw=true', stream=True).raw)
72
- inputs = processor(text='<|image|><|begin_of_text|>Represent the given image with the following question: What is in the image', images=[image], return_tensors="pt").to("cuda")
73
  qry_output = last_pooling(model(**inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], inputs['attention_mask'])
74
 
75
  string = 'A cat and a dog'
76
  text_inputs = processor(text=string, return_tensors="pt").to("cuda")
77
  tgt_output = last_pooling(model(**text_inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], text_inputs['attention_mask'])
78
  print(string, '=', compute_similarity(qry_output, tgt_output))
79
- ## A cat and a dog = tensor([[0.3945]], device='cuda:0', dtype=torch.bfloat16)
80
 
81
  string = 'A cat and a tiger'
82
  text_inputs = processor(text=string, return_tensors="pt").to("cuda")
83
  tgt_output = last_pooling(model(**text_inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], text_inputs['attention_mask'])
84
  print(string, '=', compute_similarity(qry_output, tgt_output))
85
- ## A cat and a tiger = tensor([[0.3105]], device='cuda:0', dtype=torch.bfloat16)
86
 
87
  # Text -> Image
88
- inputs = processor(text='Find me an everyday image that matches the given caption: A cat and a dog.', return_tensors="pt").to("cuda")
89
  qry_output = last_pooling(model(**inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], inputs['attention_mask'])
90
 
91
- string = '<|image|><|begin_of_text|>Represent the given image.'
92
  tgt_inputs = processor(text=string, images=[image], return_tensors="pt").to("cuda")
93
  tgt_output = last_pooling(model(**tgt_inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], tgt_inputs['attention_mask'])
94
  print(string, '=', compute_similarity(qry_output, tgt_output))
95
- ## <|image|><|begin_of_text|>Represent the given image. = tensor([[0.4141]], device='cuda:0', dtype=torch.bfloat16)
96
 
97
- inputs = processor(text='Find me an everyday image that matches the given caption: A cat and a tiger.', return_tensors="pt").to("cuda")
98
  qry_output = last_pooling(model(**inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], inputs['attention_mask'])
99
- string = '<|image|><|begin_of_text|>Represent the given image.'
100
  tgt_inputs = processor(text=string, images=[image], return_tensors="pt").to("cuda")
101
  tgt_output = last_pooling(model(**tgt_inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], tgt_inputs['attention_mask'])
102
  print(string, '=', compute_similarity(qry_output, tgt_output))
103
- ## <|image|><|begin_of_text|>Represent the given image. = tensor([[0.3770]], device='cuda:0', dtype=torch.bfloat16)
104
  ```
105
 
106
  ### Sentence Transformers
 
69
 
70
  # Image + Text -> Text
71
  image = Image.open(requests.get('https://github.com/haon-chen/mmE5/blob/main/figures/example.jpg?raw=true', stream=True).raw)
72
+ inputs = processor(text='<|image|><|begin_of_text|>Represent the given image with the following question: What is in the image\n', images=[image], return_tensors="pt").to("cuda")
73
  qry_output = last_pooling(model(**inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], inputs['attention_mask'])
74
 
75
  string = 'A cat and a dog'
76
  text_inputs = processor(text=string, return_tensors="pt").to("cuda")
77
  tgt_output = last_pooling(model(**text_inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], text_inputs['attention_mask'])
78
  print(string, '=', compute_similarity(qry_output, tgt_output))
79
+ ## A cat and a dog = tensor([[0.4219]], device='cuda:0', dtype=torch.bfloat16)
80
 
81
  string = 'A cat and a tiger'
82
  text_inputs = processor(text=string, return_tensors="pt").to("cuda")
83
  tgt_output = last_pooling(model(**text_inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], text_inputs['attention_mask'])
84
  print(string, '=', compute_similarity(qry_output, tgt_output))
85
+ ## A cat and a tiger = tensor([[0.3184]], device='cuda:0', dtype=torch.bfloat16)
86
 
87
  # Text -> Image
88
+ inputs = processor(text='Find me an everyday image that matches the given caption: A cat and a dog.\n', return_tensors="pt").to("cuda")
89
  qry_output = last_pooling(model(**inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], inputs['attention_mask'])
90
 
91
+ string = '<|image|><|begin_of_text|>Represent the given image.\n'
92
  tgt_inputs = processor(text=string, images=[image], return_tensors="pt").to("cuda")
93
  tgt_output = last_pooling(model(**tgt_inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], tgt_inputs['attention_mask'])
94
  print(string, '=', compute_similarity(qry_output, tgt_output))
95
+ ## <|image|><|begin_of_text|>Represent the given image. = tensor([[0.4414]], device='cuda:0', dtype=torch.bfloat16)
96
 
97
+ inputs = processor(text='Find me an everyday image that matches the given caption: A cat and a tiger.\n', return_tensors="pt").to("cuda")
98
  qry_output = last_pooling(model(**inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], inputs['attention_mask'])
99
+ string = '<|image|><|begin_of_text|>Represent the given image.\n'
100
  tgt_inputs = processor(text=string, images=[image], return_tensors="pt").to("cuda")
101
  tgt_output = last_pooling(model(**tgt_inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], tgt_inputs['attention_mask'])
102
  print(string, '=', compute_similarity(qry_output, tgt_output))
103
+ ## <|image|><|begin_of_text|>Represent the given image. = tensor([[0.3730]], device='cuda:0', dtype=torch.bfloat16)
104
  ```
105
 
106
  ### Sentence Transformers