JUNJIE99 committed (verified)
Commit: db7d194
Parent(s): 8067c48

Upload folder using huggingface_hub

Files changed (2):
  1. demo_test.py +24 -3
  2. modeling_llavanext_for_embedding.py +71 -0
demo_test.py CHANGED
@@ -11,13 +11,34 @@
 # print(outputs)
 
 
+import torch
 from transformers import LlavaNextProcessor, AutoModel
 
 model = AutoModel.from_pretrained("/share/junjie/code/VISTA2/240920mllmemb/llm_dense_retriever/MMRet-release/MMRet-MLLM", trust_remote_code=True).cuda()
+model = model.eval()
 processor = LlavaNextProcessor.from_pretrained("/share/junjie/code/VISTA2/240920mllmemb/llm_dense_retriever/MMRet-release/MMRet-MLLM")
 
-texts = "find a image of a dog"
+texts = "[INST] \n <instruct> <query> find a image of a dog \n [/INST]"
 
 inputs = processor(texts, return_tensors="pt").to("cuda")
-outputs = model(**inputs)
-print(outputs)
+outputs = model(**inputs)[:, -1, :]
+embeddings = torch.nn.functional.normalize(outputs, dim=-1)
+
+print(embeddings)
+
+
+
+from transformers import LlavaNextProcessor, AutoModel
+import torch
+
+model_name = "/share/junjie/code/VISTA2/240920mllmemb/llm_dense_retriever/MMRet-release/MMRet-MLLM"
+model = AutoModel.from_pretrained(model_name, trust_remote_code=True).cuda()
+model = model.eval()
+model.set_processor(model_name)
+inputs = model.data_process(text="find a image of a dog", q_or_c="query")
+
+model_output = model(**inputs, output_hidden_states=True)
+embeddings = model_output[:, -1, :]
+embeddings = torch.nn.functional.normalize(embeddings, dim=-1)
+
+print(embeddings)
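
For reference, a hedged usage sketch (not part of this commit): a composed image+text query built only on the set_processor/data_process interface added above, with an explicit task instruction as recommended by the warning in prepare_text_input(). The image path "./dog.jpg" and the modification text are illustrative assumptions.

# Usage sketch (assumption, not from the commit): composed image+text query.
import torch
from transformers import AutoModel

model_name = "/share/junjie/code/VISTA2/240920mllmemb/llm_dense_retriever/MMRet-release/MMRet-MLLM"
model = AutoModel.from_pretrained(model_name, trust_remote_code=True).cuda().eval()
model.set_processor(model_name)

# Task instruction for composed image retrieval, taken from the example string
# in prepare_text_input(); any MMEB-style instruction could be used instead.
task_instruction = (
    "Retrieve the target image that best meets the combined criteria by using "
    "both the provided image and the image retrieval instructions: "
)

# "./dog.jpg" and the modification text below are hypothetical placeholders.
query_inputs = model.data_process(
    images="./dog.jpg",
    text="make the dog run on the beach",
    q_or_c="query",
    task_instruction=task_instruction,
)
query_emb = model(**query_inputs, output_hidden_states=True)[:, -1, :]
query_emb = torch.nn.functional.normalize(query_emb, dim=-1)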
modeling_llavanext_for_embedding.py CHANGED
@@ -257,3 +257,74 @@ class LLaVANextForEmbedding(LlavaNextForConditionalGeneration):
 
         return outputs
 
+    def set_processor(self, model_name):
+        self.processor = LlavaNextProcessor.from_pretrained(model_name)
+    def prepare_text_input(self, image=None, text=None, q_or_c=None, task_instruction=None):
+        task_instruction_example_cir = "Retrieve the target image that best meets the combined criteria by using both the provided image and the image retrieval instructions: "
+
+        assert q_or_c in ["query", "candidate", "q", "c"]
+
+        if "q" in q_or_c:
+            if task_instruction is None:
+                text_input = "[INST] \n <instruct> <query>"
+                print(f"""
+                Warning: For optimal performance, MMRet-MLLM requires the task instruction to be specified in the query.
+                For example, for the composed image retrieval task, you might use a specific instruction like: {task_instruction_example_cir}.
+                Instructions for other tasks can be referenced in the MMEB benchmark.
+                """)
+            elif task_instruction is not None:
+                text_input = f"[INST] \n <instruct> {task_instruction} <query> "
+
+            if text is not None:
+                text_input = f"{text_input} {text} \n"
+            if image is not None:
+                text_input = f"{text_input} <image>"
+
+            text_input = f"{text_input} [/INST]"
+        else:
+            text_input = "[INST] "
+            if text is not None:
+                text_input = f"{text_input} {text} \n"
+            if image is not None:
+                text_input = f"{text_input} <image>"
+            text_input = f"{text_input} [/INST]"
+
+        return text_input
+
+    def data_process(self, images=None, text=None, q_or_c=None, task_instruction=None):
+        if images is not None:
+            _is_list = isinstance(images, list)
+        elif text is not None:
+            _is_list = isinstance(text, list)
+        else:
+            raise ValueError("images and text cannot be both None.")
+
+        assert q_or_c in ["query", "candidate", "q", "c"]
+
+        if not _is_list :
+            text_input = self.prepare_text_input(images, text, q_or_c, task_instruction)
+            text_input = [text_input]
+
+            print(text_input)
+
+            if images is not None:
+                images = Image.open(images).resize((512,512)).convert("RGB")
+                images = [images]
+                inputs = self.processor(images=images, text=text_input, return_tensors="pt", padding=True)
+            else:
+                inputs = self.processor(text=text_input, return_tensors="pt", padding=True)
+
+        else:
+            text_input = [self.prepare_text_input(_image, _text, q_or_c, task_instruction) for _image, _text in zip(images, text)]
+
+            print(text_input)
+
+            if images is not None:
+                images = [Image.open(_image).resize((512,512)).convert("RGB") for _image in images]
+                inputs = self.processor(images=images, text=text_input, return_tensors="pt", padding=True)
+            else:
+                inputs = self.processor(text=text_input, return_tensors="pt", padding=True)
+
+        inputs = inputs.to(self.device)
+
+        return inputs
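
A matching candidate-side sketch, again an assumption rather than part of the commit: candidates are encoded one at a time through the non-list path of data_process() with q_or_c="candidate", then scored against the normalized query embedding (query_emb from the sketch above) by a dot product, which equals cosine similarity for L2-normalized vectors. The candidate strings are illustrative.

# Usage sketch (assumption, not from the commit): encode text candidates and
# rank them against a previously computed, L2-normalized query embedding.
import torch

candidate_texts = ["A photo of a dog.", "A photo of a cat."]  # illustrative
candidate_embs = []
for cand in candidate_texts:
    cand_inputs = model.data_process(text=cand, q_or_c="candidate")
    emb = model(**cand_inputs, output_hidden_states=True)[:, -1, :]
    candidate_embs.append(torch.nn.functional.normalize(emb, dim=-1))
candidate_embs = torch.cat(candidate_embs, dim=0)

# Dot product of normalized embeddings = cosine similarity; higher is better.
scores = query_emb @ candidate_embs.T
print(scores)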