jinaai
/

jina-clip-v2

Model card Files Files and versions Community

gmastrapas committed on Nov 19

Commit

5695d43

•

1 Parent(s): 3103208

feat: add option assume_text_inputs in sentence transformers

Browse files

Files changed (1) hide show

custom_st.py +37 -19

custom_st.py CHANGED Viewed

@@ -22,6 +22,7 @@ class Transformer(nn.Module):
         model_args: Optional[Dict[str, Any]] = None,
         tokenizer_args: Optional[Dict[str, Any]] = None,
         image_processor_args: Optional[Dict[str, Any]] = None,
         cache_dir: Optional[str] = None,
         backend: Literal['torch', 'onnx', 'openvino'] = 'torch',
         **_,
@@ -56,6 +57,8 @@ class Transformer(nn.Module):
             image_processor_args (Dict[str, Any], optional): Additional image processor
                 configuration parameters to be passed to the Hugging Face Transformers
                 image processor
             cache_dir (str, optional): The Hugging Face Hub cache directory
             backend (str, optional): Computational backend, only 'torch' is supported
@@ -119,6 +122,7 @@ class Transformer(nn.Module):
             cache_dir=cache_dir,
             **image_processor_kwargs,
         )
         # No max_seq_length set. Try to infer from model
         if max_seq_length is None:
@@ -151,26 +155,40 @@ class Transformer(nn.Module):
         _images = []
         _texts = []
         _image_or_text_descriptors = []
-        for sample in texts:
-            if isinstance(sample, str):
-                if sample.startswith('http'):
-                    response = requests.get(sample)
-                    _images.append(Image.open(BytesIO(response.content)).convert('RGB'))
-                    _image_or_text_descriptors.append(0)
-                elif sample.startswith('data:image/'):
-                    _images.append(self._decode_data_image(sample).convert('RGB'))
-                    _image_or_text_descriptors.append(0)
-                else:
-                    try:
-                        _images.append(Image.open(sample).convert('RGB'))
                         _image_or_text_descriptors.append(0)
-                    except Exception as e:
-                        _ = str(e)
-                        _texts.append(sample)
-                        _image_or_text_descriptors.append(1)
-            elif isinstance(sample, Image.Image):
-                _images.append(sample.convert('RGB'))
-                _image_or_text_descriptors.append(0)
         encoding = {}
         if len(_texts):

         model_args: Optional[Dict[str, Any]] = None,
         tokenizer_args: Optional[Dict[str, Any]] = None,
         image_processor_args: Optional[Dict[str, Any]] = None,
+        assume_text_inputs: bool = False,
         cache_dir: Optional[str] = None,
         backend: Literal['torch', 'onnx', 'openvino'] = 'torch',
         **_,
             image_processor_args (Dict[str, Any], optional): Additional image processor
                 configuration parameters to be passed to the Hugging Face Transformers
                 image processor
+            assume_text_inputs (bool, optional): If set to `True`, all inputs are
+                treated as texts. Defaults to `False`
             cache_dir (str, optional): The Hugging Face Hub cache directory
             backend (str, optional): Computational backend, only 'torch' is supported
             cache_dir=cache_dir,
             **image_processor_kwargs,
         )
+        self.assume_text_inputs = assume_text_inputs
         # No max_seq_length set. Try to infer from model
         if max_seq_length is None:
         _images = []
         _texts = []
         _image_or_text_descriptors = []
+        if self.assume_text_inputs:
+            for sample in texts:
+                if isinstance(sample, str):
+                    _texts.append(sample)
+                    _image_or_text_descriptors.append(1)
+        else:
+            for sample in texts:
+                if isinstance(sample, str):
+                    if sample.startswith('http'):
+                        try:
+                            response = requests.get(sample)
+                            _images.append(
+                                Image.open(BytesIO(response.content)).convert('RGB')
+                            )
+                            _image_or_text_descriptors.append(0)
+                        except Exception as e:
+                            _ = str(e)
+                            _texts.append(sample)
+                            _image_or_text_descriptors.append(1)
+                    elif sample.startswith('data:image/'):
+                        _images.append(self._decode_data_image(sample).convert('RGB'))
                         _image_or_text_descriptors.append(0)
+                    else:
+                        try:
+                            _images.append(Image.open(sample).convert('RGB'))
+                            _image_or_text_descriptors.append(0)
+                        except Exception as e:
+                            _ = str(e)
+                            _texts.append(sample)
+                            _image_or_text_descriptors.append(1)
+                elif isinstance(sample, Image.Image):
+                    _images.append(sample.convert('RGB'))
+                    _image_or_text_descriptors.append(0)
         encoding = {}
         if len(_texts):