visheratin
commited on
Commit
·
304a0a4
1
Parent(s):
cde656c
Update model files
Browse files- processing_llava.py +7 -2
processing_llava.py
CHANGED
@@ -30,6 +30,7 @@ from transformers.tokenization_utils_base import (
|
|
30 |
from transformers.utils import TensorType
|
31 |
import torch
|
32 |
from open_clip.transform import PreprocessCfg, image_transform_v2
|
|
|
33 |
|
34 |
|
35 |
class OpenCLIPImageProcessor:
|
@@ -67,6 +68,7 @@ class LlavaProcessor:
|
|
67 |
TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]
|
68 |
] = None,
|
69 |
images: ImageInput = None,
|
|
|
70 |
padding: Union[bool, str, PaddingStrategy] = False,
|
71 |
truncation: Union[bool, str, TruncationStrategy] = None,
|
72 |
max_length=None,
|
@@ -76,8 +78,11 @@ class LlavaProcessor:
|
|
76 |
pixel_values = self.image_processor(images, return_tensors=return_tensors)[
|
77 |
"pixel_values"
|
78 |
]
|
|
|
|
|
|
|
79 |
else:
|
80 |
-
|
81 |
text_inputs = self.tokenizer(
|
82 |
text,
|
83 |
return_tensors=return_tensors,
|
@@ -86,7 +91,7 @@ class LlavaProcessor:
|
|
86 |
max_length=max_length,
|
87 |
)
|
88 |
|
89 |
-
return BatchFeature(data={**text_inputs, "
|
90 |
|
91 |
def batch_decode(self, *args, **kwargs):
|
92 |
return self.tokenizer.batch_decode(*args, **kwargs)
|
|
|
30 |
from transformers.utils import TensorType
|
31 |
import torch
|
32 |
from open_clip.transform import PreprocessCfg, image_transform_v2
|
33 |
+
from modeling_llava import LlavaForConditionalGeneration
|
34 |
|
35 |
|
36 |
class OpenCLIPImageProcessor:
|
|
|
68 |
TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]
|
69 |
] = None,
|
70 |
images: ImageInput = None,
|
71 |
+
model: LlavaForConditionalGeneration = None,
|
72 |
padding: Union[bool, str, PaddingStrategy] = False,
|
73 |
truncation: Union[bool, str, TruncationStrategy] = None,
|
74 |
max_length=None,
|
|
|
78 |
pixel_values = self.image_processor(images, return_tensors=return_tensors)[
|
79 |
"pixel_values"
|
80 |
]
|
81 |
+
pixel_values = pixel_values.to(model.device)
|
82 |
+
image_outputs = model.vision_model(pixel_values)
|
83 |
+
image_features = model.multi_modal_projector(image_outputs)
|
84 |
else:
|
85 |
+
image_features = None
|
86 |
text_inputs = self.tokenizer(
|
87 |
text,
|
88 |
return_tensors=return_tensors,
|
|
|
91 |
max_length=max_length,
|
92 |
)
|
93 |
|
94 |
+
return BatchFeature(data={**text_inputs, "image_features": image_features})
|
95 |
|
96 |
def batch_decode(self, *args, **kwargs):
|
97 |
return self.tokenizer.batch_decode(*args, **kwargs)
|