intfloat
/

mmE5-mllama-11b-instruct

@@ -1,38 +1,43 @@
 from typing import Any, Dict, Optional, List
 import torch
 from PIL import Image
 from transformers import AutoProcessor, MllamaForConditionalGeneration
 from sentence_transformers.models import Transformer as BaseTransformer
 class MultiModalTransformer(BaseTransformer):
     def __init__(
-        self,
-        model_name_or_path: str,
-        cache_dir: Optional[str] = None,
-        tokenizer_args: Optional[Dict[str, Any]] = None,
-        **kwargs,
     ):
         """Initialize the base transformer, load the processor, and force right padding.

         Args:
             model_name_or_path: Identifier or path passed to both the base
                 class initializer and ``AutoProcessor.from_pretrained``.
             cache_dir: Cache directory handed to the processor loader only;
                 it is not forwarded to the base class initializer here.
             tokenizer_args: Extra keyword arguments for the processor
                 loader. ``None`` is treated as an empty dict.
             **kwargs: Forwarded unchanged to the base class initializer.
         """
         super().__init__(model_name_or_path, **kwargs)
         if tokenizer_args is None:
             tokenizer_args = {}
-        # Initialize processor and set padding side
         self.processor = AutoProcessor.from_pretrained(
             model_name_or_path, cache_dir=cache_dir, **tokenizer_args
         )
-        # Configure model settings
-        config = self.auto_model.config
         # Only touch use_cache when the config actually exposes that attribute.
-        if hasattr(config, 'use_cache'):
-            config.use_cache = False
         # The same padding side is written to the processor's tokenizer, the
         # model config, and the model object so all three agree.
-        padding_side = "right"
-        self.processor.tokenizer.padding_side = padding_side
-        config.padding_side = padding_side
-        self.auto_model.padding_side = padding_side
     def forward(
-        self, features: Dict[str, torch.Tensor], **kwargs
     ) -> Dict[str, torch.Tensor]:
         # Process inputs through the model
         outputs = self.auto_model(
@@ -41,12 +46,12 @@ class MultiModalTransformer(BaseTransformer):
             output_hidden_states=True,
             **kwargs
         )
         # Apply last pooling and normalization
         last_hidden_state = outputs.hidden_states[-1]
         attention_mask = features["attention_mask"]
         sentence_embedding = self._last_pooling(last_hidden_state, attention_mask)
         features.update({"sentence_embedding": sentence_embedding})
         return features
@@ -57,11 +62,11 @@ class MultiModalTransformer(BaseTransformer):
         reps = last_hidden_state[torch.arange(batch_size, device=last_hidden_state.device), sequence_lengths]
         return torch.nn.functional.normalize(reps, p=2, dim=-1)
-    def tokenize(self, texts: List[Dict] | List[str]) -> Dict[str, torch.Tensor]:
         def process_text_item(item):
             if isinstance(item, str):
                 return item, []
             text, images = "", []
             for sub_item in item:
                 if sub_item["type"] == "text":
@@ -101,5 +106,5 @@ class MultiModalTransformer(BaseTransformer):
                 max_length=self.max_seq_length,
                 return_tensors="pt"
             )
-        return inputs

+from io import BytesIO
 from typing import Any, Dict, Optional, List
 import torch
 from PIL import Image
 from transformers import AutoProcessor, MllamaForConditionalGeneration
 from sentence_transformers.models import Transformer as BaseTransformer
 class MultiModalTransformer(BaseTransformer):
     def __init__(
         self,
         model_name_or_path: str,
         cache_dir: Optional[str] = None,
         tokenizer_args: Optional[Dict[str, Any]] = None,
         **kwargs,
     ) -> None:
         """Initialize the base transformer and load the multimodal processor.

         Args:
             model_name_or_path: Identifier or path passed to both the base
                 class initializer and ``AutoProcessor.from_pretrained``.
             cache_dir: Cache directory used for the processor and forwarded
                 to the base class initializer.
             tokenizer_args: Extra keyword arguments for the processor
                 loader. ``None`` is treated as an empty dict.
             **kwargs: Forwarded unchanged to the base class initializer.
         """
         # cache_dir is a named parameter here, so it never ends up in **kwargs;
         # forward it explicitly, otherwise the base class (and the
         # _load_model hook it calls) falls back to its default cache_dir.
         super().__init__(model_name_or_path, cache_dir=cache_dir, **kwargs)
         if tokenizer_args is None:
             tokenizer_args = {}
         # Initialize processor
         self.processor = AutoProcessor.from_pretrained(
             model_name_or_path, cache_dir=cache_dir, **tokenizer_args
         )
+    def _load_model(
+            self,
+            model_name_or_path: str,
+            config,
+            cache_dir: str,
+            backend: str,
+            is_peft_model: bool,
+            **model_args,
+    ) -> None:
+        self.auto_model = MllamaForConditionalGeneration.from_pretrained(
+            model_name_or_path, torch_dtype=torch.bfloat16, cache_dir=cache_dir, **model_args
+        )
     def forward(
+            self, features: Dict[str, torch.Tensor], **kwargs
     ) -> Dict[str, torch.Tensor]:
         # Process inputs through the model
         outputs = self.auto_model(
             output_hidden_states=True,
             **kwargs
         )
         # Apply last pooling and normalization
         last_hidden_state = outputs.hidden_states[-1]
         attention_mask = features["attention_mask"]
         sentence_embedding = self._last_pooling(last_hidden_state, attention_mask)
         features.update({"sentence_embedding": sentence_embedding})
         return features
         reps = last_hidden_state[torch.arange(batch_size, device=last_hidden_state.device), sequence_lengths]
         return torch.nn.functional.normalize(reps, p=2, dim=-1)
+    def tokenize(self, texts: List[List[Dict]] | List[str]) -> Dict[str, torch.Tensor]:
         def process_text_item(item):
             if isinstance(item, str):
                 return item, []
             text, images = "", []
             for sub_item in item:
                 if sub_item["type"] == "text":
                 max_length=self.max_seq_length,
                 return_tensors="pt"
             )
+        return inputs