doberst committed
Commit b3d87a1
1 Parent(s): 05d4e81

Delete ov_mllama_generator_class.py

Files changed (1)
  1. ov_mllama_generator_class.py +0 -518
ov_mllama_generator_class.py DELETED
@@ -1,518 +0,0 @@
1
-
2
- """ Core wrapper/patching class for mllama-11b on OpenVINO - excludes all conversion components and is intended for inference only.
3
-
4
- -- Generation loop flows through GenerationMixin; the torch and transformers dependencies will still need to be removed
5
- """
6
-
7
- from pathlib import Path
8
- from transformers import AutoConfig, GenerationConfig
9
-
10
- from typing import Optional, Union, List, Tuple, Dict
11
- from transformers.generation import GenerationMixin
12
- from transformers.modeling_outputs import ModelOutput
13
- import openvino.runtime.opset13 as ops
14
- import openvino as ov
15
- import torch
16
- import numpy as np
17
- from dataclasses import dataclass
18
- from openvino.runtime.passes import Manager, MatcherPass, WrapType, Matcher
19
- import time
20
-
21
- core = ov.Core()
22
-
23
- LANGUAGE_MODEL = "llm_int4_asym_r10_gs64_max_activation_variance_scale_all_layers.xml"
24
- IMAGE_ENCODER = "openvino_vision_encoder_int8.xml"
25
-
26
- @dataclass
27
- class MLlamaOutputWithPast(ModelOutput):
28
- loss: Optional[torch.FloatTensor] = None
29
- logits: torch.FloatTensor = None
30
- past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
31
- hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
32
- attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
33
- cross_attn_key_values: Optional[List[torch.FloatTensor]] = None
34
-
35
-
36
- class InsertSlice(MatcherPass):
37
- def __init__(self):
38
- MatcherPass.__init__(self)
39
- self.model_changed = False
40
-
41
- param = WrapType("opset10.Result")
42
-
43
- def callback(matcher: Matcher) -> bool:
44
- root = matcher.get_match_root()
45
- if root is None:
46
- return False
47
- if len(root.get_output_partial_shape(0)) == 3:
48
- parent = root.input_value(0).get_node()
49
- grand_parent = parent.input_value(0).get_node()
50
-
51
- grand_parent_output = parent.input(0).get_source_output()
52
- consumers = grand_parent_output.get_target_inputs()
53
- start = np.array([0, -1, 0], dtype=np.int32)
54
- stop = np.array([1, -2, grand_parent_output.get_partial_shape()[-1].get_length()], dtype=np.int32)
55
- step = np.array([1, -1, 1], dtype=np.int32)
56
- axes = np.array([0, 1, 2], dtype=np.int32)
57
- slice = ops.slice(grand_parent, start, stop, step, axes, name="inserted_slice")
58
- for consumer in consumers:
59
- consumer.replace_source_output(slice.output(0))
60
- self.model_changed = True
61
- # Use new operation for additional matching
62
- self.register_new_node(slice)
63
- print("applied slice for lm head")
64
-
65
- return True
66
-
67
- self.register_matcher(Matcher(param, "InsertSlice"), callback)
68
-
69
-
70
- STR_TO_OV_TYPE = {
71
- "boolean": ov.Type.boolean,
72
- "f16": ov.Type.f16,
73
- "f32": ov.Type.f32,
74
- "f64": ov.Type.f64,
75
- "i8": ov.Type.i8,
76
- "i16": ov.Type.i16,
77
- "i32": ov.Type.i32,
78
- "i64": ov.Type.i64,
79
- "u8": ov.Type.u8,
80
- "u16": ov.Type.u16,
81
- "u32": ov.Type.u32,
82
- "u64": ov.Type.u64,
83
- "bf16": ov.Type.bf16,
84
- }
85
-
86
-
87
- class OVMLlamaForConditionalGeneration(GenerationMixin):
88
- def __init__(
89
- self,
90
- model_dir: Union[str, Path],
91
- device: str = "CPU",
92
- ov_config: Optional[Dict[str, str]] = None,
93
- language_model_name=None,
94
- image_encoder_name=None,
95
- slice_lm_head=True,
96
- use_remote_tensors=True,
97
- dynamic_shape=False,
98
- ):
99
- model_dir = Path(model_dir)
100
- self.config = AutoConfig.from_pretrained(model_dir)
101
- self.generation_config = GenerationConfig.from_pretrained(model_dir)
102
- self.main_input_name = "input_ids"
103
- self.device = torch.device("cpu")
104
- self._device = device
105
- self.ov_config = ov_config
106
- self.num_pkv = 2
107
- self._supports_cache_class = False
108
- self.next_beam_idx = None
109
- self._past_length = None
110
- if language_model_name:
111
- self.model = core.read_model(model_dir / language_model_name)
112
- else:
113
- self.model = core.read_model(model_dir / LANGUAGE_MODEL)
114
- if image_encoder_name:
115
- self.vision_model = core.read_model(model_dir / image_encoder_name)
116
- else:
117
- self.vision_model = core.read_model(model_dir / IMAGE_ENCODER)
118
- if not dynamic_shape:
119
- self.reshape_vision_model()
120
- self.update_pkv_precision()
121
- if slice_lm_head:
122
- self.slice_lm_head()
123
- self.input_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.inputs)}
124
- self.output_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.outputs)}
125
- self.lm_cross_attn_inputs = [key for key in self.input_names if "cross_attn_key_values" in key]
126
- compiled_model = core.compile_model(self.model, device, ov_config)
127
- self.request = compiled_model.create_infer_request()
128
- self.cross_attn_outputs = [key.get_any_name() for key in self.vision_model.outputs if "cross_attn_key_values" in key.get_any_name()]
129
- compiled_vision_model = core.compile_model(self.vision_model, device, ov_config)
130
- self.vision_request = compiled_vision_model.create_infer_request()
131
- self.use_remote_tensors = use_remote_tensors and self._device == "GPU"
132
- if self.use_remote_tensors:
133
- self.prepare_remote_tensors()
134
- self.next_beam_idx = None
135
- self.num_patches = (self.config.vision_config.image_size // self.config.vision_config.patch_size) ** 2 + 1
136
- self._past_length = 0
137
- self.llm_infer_time = []
138
- self.vision_encoder_infer_time = []
139
-
140
- def _get_past_length(self, past_key_values=None):
141
- if past_key_values is None:
142
- return 0
143
- return self._past_length
144
-
145
- def reshape_vision_model(self):
146
- self.vision_model.reshape(
147
- {
148
- 0: ov.PartialShape([1, 1, 4, 3, self.config.vision_config.image_size, self.config.vision_config.image_size]),
149
- 1: ov.PartialShape([1, 1]),
150
- 2: ov.PartialShape([1, 1, 4]),
151
- }
152
- )
153
-
154
- def update_pkv_precision(self, force_fp32=False):
155
- pkv_precision = ov.Type.f32
156
- if not force_fp32:
157
- device = self._device.upper()
158
- try:
159
- if "INFERENCE_PRECISION_HINT" in core.get_property(device, "SUPPORTED_PROPERTIES"):
160
- pkv_precision = core.get_property(device, "INFERENCE_PRECISION_HINT")
161
- except RuntimeError: # use default precision when get_property fails, e.g. when device is "AUTO:GPU"
162
- pass
163
-
164
- # ov_config["INFERENCE_PRECISION_HINT"] may override the preferred precision
165
- if self.ov_config:
166
- inference_precision_hint = self.ov_config.get("INFERENCE_PRECISION_HINT", "")
167
- if inference_precision_hint in STR_TO_OV_TYPE:
168
- pkv_precision = STR_TO_OV_TYPE[inference_precision_hint]
169
-
170
- ppp = ov.preprocess.PrePostProcessor(self.model)
171
- for key in self.model.inputs:
172
- if "cross_attn_key_values" in key.get_any_name() and pkv_precision != key.get_element_type():
173
- ppp.input(key.get_any_name()).tensor().set_element_type(pkv_precision)
174
-
175
- self.model = ppp.build()
176
-
177
- ppp_v = ov.preprocess.PrePostProcessor(self.vision_model)
178
- for key in self.vision_model.outputs:
179
- if "cross_attn_key_values" in key.get_any_name() and pkv_precision != key.get_element_type():
180
- ppp_v.output(key.get_any_name()).tensor().set_element_type(pkv_precision)
181
- self.vision_model = ppp_v.build()
182
- self._pkv_precision = pkv_precision
183
-
184
- def slice_lm_head(self):
185
- manager = Manager()
186
- manager.register_pass(InsertSlice())
187
- manager.run_passes(self.model)
188
- self.model.validate_nodes_and_infer_types()
189
-
190
- def forward(
191
- self,
192
- input_ids: torch.LongTensor = None,
193
- pixel_values: Optional[torch.FloatTensor] = None,
194
- aspect_ratio_mask: Optional[List[List[int]]] = None,
195
- aspect_ratio_ids: Optional[torch.Tensor] = None,
196
- attention_mask: Optional[List[List[List[int]]]] = None,
197
- cross_attention_mask: Optional[torch.Tensor] = None,
198
- cross_attention_states: Optional[torch.Tensor] = None,
199
- position_ids: Optional[torch.LongTensor] = None,
200
- past_key_values: Optional[List[torch.FloatTensor]] = None,
201
- inputs_embeds: Optional[torch.FloatTensor] = None,
202
- labels: Optional[torch.LongTensor] = None,
203
- use_cache: Optional[bool] = None,
204
- output_attentions: Optional[bool] = None,
205
- output_hidden_states: Optional[bool] = None,
206
- return_dict: Optional[bool] = None,
207
- cache_position: Optional[torch.LongTensor] = None,
208
- cross_attn_key_values: Optional[List[torch.Tensor]] = None,
209
- num_logits_to_keep: int = 0,
210
- ) -> Union[Tuple, MLlamaOutputWithPast]:
211
- r"""
212
- Args:
213
- labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
214
- Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
215
- config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
216
- (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
217
-
218
- num_logits_to_keep (`int`, *optional*):
219
- Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
220
- `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
221
- token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
222
-
223
-
224
- """
225
-
226
- if (input_ids is None) ^ (inputs_embeds is not None):
227
- raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one")
228
-
229
- if pixel_values is not None and inputs_embeds is not None:
230
- raise ValueError("You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one")
231
-
232
- if pixel_values is not None and cross_attention_states is not None:
233
- raise ValueError("`pixel_values` and `cross_attention_states` cannot be provided simultaneously")
234
-
235
- if pixel_values is not None:
236
- if aspect_ratio_ids is None:
237
- raise ValueError("`aspect_ratio_ids` must be provided if `pixel_values` is provided")
238
- # get vision tokens from vision model
239
- cross_attn_key_values = self.visual_encoder(pixel_values, aspect_ratio_ids, aspect_ratio_mask)
240
- cross_attention_mask, full_text_row_masked_out_mask = self._prepare_cross_attention_mask(
241
- cross_attention_mask,
242
- past_key_values=past_key_values,
243
- num_vision_tokens=self.num_patches,
244
- cross_attention_layers=cross_attn_key_values if past_key_values is not None else None,
245
- cross_attention_states=((),),
246
- device=self.device,
247
- dtype=torch.float32,
248
- )
249
-
250
- if cross_attention_mask is not None and cache_position is not None:
251
- cross_attention_mask = cross_attention_mask[:, :, cache_position]
252
- full_text_row_masked_out_mask = full_text_row_masked_out_mask[:, :, cache_position]
253
-
254
- return self.language_model(
255
- input_ids=input_ids,
256
- attention_mask=attention_mask,
257
- position_ids=position_ids,
258
- cross_attention_mask=cross_attention_mask,
259
- full_text_row_masked_out_mask=full_text_row_masked_out_mask,
260
- past_key_values=past_key_values,
261
- cache_position=cache_position,
262
- cross_attention_key_values=cross_attn_key_values,
263
- )
264
-
265
- def language_model(
266
- self,
267
- input_ids,
268
- attention_mask,
269
- position_ids,
270
- cross_attention_mask,
271
- full_text_row_masked_out_mask,
272
- past_key_values,
273
- cache_position,
274
- cross_attention_key_values,
275
- ):
276
- model_inputs = {
277
- "input_ids": ov.Tensor(np.array(input_ids)),
278
- "attention_mask": ov.Tensor(np.array(attention_mask)),
279
- "position_ids": ov.Tensor(np.array(position_ids)),
280
- "cross_attention_mask": ov.Tensor(np.array(cross_attention_mask)),
281
- "full_text_row_masked_out_mask": ov.Tensor(np.array(full_text_row_masked_out_mask)),
282
- "cache_position": ov.Tensor(np.array(cache_position)),
283
- }
284
-
285
- if past_key_values is None:
286
- self.request.reset_state()
287
- self.next_beam_idx = np.arange(input_ids.shape[0], dtype=int)
288
- self._past_length = 0
289
- self.llm_infer_time = []
290
-
291
- if not self.use_remote_tensors:
292
- model_inputs.update(dict(zip(self.lm_cross_attn_inputs, cross_attention_key_values)))
293
- if "beam_idx" in self.input_names:
294
- model_inputs["beam_idx"] = self.next_beam_idx if self.next_beam_idx is not None else np.arange(input_ids.shape[0], dtype=int)
295
-
296
- start = time.perf_counter()
297
- self.request.start_async(model_inputs, share_inputs=True)
298
- self.request.wait()
299
- end = time.perf_counter()
300
- self.llm_infer_time.append(end - start)
301
- logits = torch.from_numpy(self.request.get_tensor("logits").data)
302
- past_key_values = ((),)
303
- self._past_length += input_ids.shape[1]
304
- out = MLlamaOutputWithPast(logits=logits, past_key_values=past_key_values, cross_attn_key_values=cross_attention_key_values)
305
- return out
306
-
307
- def can_generate(self):
308
- """Returns True, confirming that this model can generate via `GenerationMixin.generate()`."""
309
- return True
310
-
311
- def __call__(self, *args, **kwargs) -> MLlamaOutputWithPast:
312
- return self.forward(
313
- *args,
314
- **kwargs,
315
- )
316
-
317
- def _reorder_cache(self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) -> Tuple[Tuple[torch.Tensor]]:
318
- """
319
- This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
320
- [`~PreTrainedModel.beam_sample`] is called.
321
- This is required to match `past_key_values` with the correct beam_idx at every generation step.
322
- """
323
- self.next_beam_idx = np.array(beam_idx) # save beam_idx to be used as an input in the next iteration
324
- return past_key_values
325
-
326
- def prepare_inputs_for_generation(
327
- self,
328
- input_ids=None,
329
- inputs_embeds=None,
330
- attention_mask=None,
331
- position_ids=None,
332
- pixel_values=None,
333
- aspect_ratio_ids=None,
334
- aspect_ratio_mask=None,
335
- cross_attention_mask=None,
336
- past_key_values=None,
337
- use_cache=False,
338
- cache_position=None,
339
- cross_attn_key_values=None,
340
- num_logits_to_keep=None,
341
- **kwargs,
342
- ):
343
- # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
344
- # Exception 1: when passing inputs_embeds, input_ids may be missing entries
345
- # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
346
- if past_key_values is not None:
347
- if inputs_embeds is not None: # Exception 1
348
- input_ids = input_ids[:, -cache_position.shape[0] :]
349
- elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2)
350
- input_ids = input_ids[:, cache_position]
351
-
352
- if attention_mask is not None and position_ids is None:
353
- # create position_ids on the fly for batch generation
354
- position_ids = attention_mask.long().cumsum(-1) - 1
355
- position_ids.masked_fill_(attention_mask == 0, 1)
356
- if past_key_values:
357
- position_ids = position_ids[:, -input_ids.shape[1] :]
358
-
359
- # inputs_embeds is not consumed by the OV language model, so only input_ids is forwarded
360
- model_inputs = {"input_ids": input_ids, "inputs_embeds": None}
361
-
362
- if num_logits_to_keep is not None:
363
- model_inputs["num_logits_to_keep"] = num_logits_to_keep
364
-
365
- model_inputs.update(
366
- {
367
- "position_ids": position_ids,
368
- "cache_position": cache_position,
369
- "past_key_values": past_key_values,
370
- "use_cache": use_cache,
371
- "attention_mask": attention_mask,
372
- "cross_attention_mask": cross_attention_mask,
373
- "cross_attn_key_values": cross_attn_key_values,
374
- }
375
- )
376
-
377
- # If we're in pre-fill or cacheless decoding step, then we need pixel_values and aspect ratios
378
- # to compute image hidden states, otherwise they are cached within each cross attn layer
379
- if (input_ids == self.config.image_token_index).any():
380
- model_inputs["pixel_values"] = pixel_values
381
- model_inputs["aspect_ratio_ids"] = aspect_ratio_ids
382
- model_inputs["aspect_ratio_mask"] = aspect_ratio_mask
383
-
384
- return model_inputs
385
-
386
- def _update_model_kwargs_for_generation(self, outputs, model_kwargs, is_encoder_decoder, **kwargs):
387
- cross_attention_mask_prev = model_kwargs.get("cross_attention_mask", None)
388
- model_kwargs = super()._update_model_kwargs_for_generation(
389
- outputs=outputs,
390
- model_kwargs=model_kwargs,
391
- is_encoder_decoder=is_encoder_decoder,
392
- **kwargs,
393
- )
394
-
395
- # add cross-attn mask for new token
396
- if cross_attention_mask_prev is not None:
397
- model_kwargs["cross_attention_mask"] = torch.cat([cross_attention_mask_prev, cross_attention_mask_prev[:, -1:, ...]], dim=1)
398
- model_kwargs["cross_attn_key_values"] = outputs.cross_attn_key_values
399
- return model_kwargs
400
-
401
- def _prepare_cross_attention_mask(
402
- self,
403
- cross_attention_mask: torch.Tensor,
404
- past_key_values: Tuple,
405
- num_vision_tokens: int,
406
- cross_attention_states: torch.Tensor,
407
- cross_attention_layers: List[int],
408
- device: str,
409
- dtype: str,
410
- ) -> Tuple[torch.Tensor, torch.Tensor]:
411
- if cross_attention_mask is None:
412
- # should we raise error or prepare a full attn mask with all ones?
413
- return None, None
414
- else:
415
- # reshape so it can be used by attn module
416
- batch_size, text_total_length, *_ = cross_attention_mask.shape
417
- cross_attention_mask = cross_attention_mask.repeat_interleave(num_vision_tokens, dim=3)
418
- cross_attention_mask = cross_attention_mask.view(batch_size, text_total_length, -1)
419
- cross_attention_mask = cross_attention_mask.unsqueeze(1)
420
-
421
- # invert the mask
422
- inverted_cross_attn_mask = (1.0 - cross_attention_mask).to(dtype)
423
- cross_attention_mask = inverted_cross_attn_mask.masked_fill(inverted_cross_attn_mask.to(torch.bool), torch.finfo(dtype).min)
424
-
425
- # apply full-row bias, which returns a 4D tensor of shape [B, H, S1, 1] where the value is 0 if a full row in the cross attn mask's
426
- # last dimension contains negative infinity values, otherwise it's 1
427
- negative_inf_value = torch.finfo(dtype).min
428
- full_text_row_masked_out_mask = (cross_attention_mask != negative_inf_value).any(dim=-1).type_as(cross_attention_mask)[..., None]
429
- cross_attention_mask *= full_text_row_masked_out_mask
430
-
431
- # In case we receive a new image but already have previous cross-attention key/values in cache,
432
- # then we need to extend the attention-mask and add previous images' lengths
433
- if past_key_values is not None and cross_attention_states is not None and cross_attention_layers is not None:
434
- # make an all-zeros mask for the cross-attn mask covering the previously cached hidden_states,
435
- # i.e. extend current cross-attn-mask on image-seq-length dimension to account for past_seen_tokens
436
- past_cross_attn_kv_length = cross_attention_layers[0].shape[-2]
437
- past_cross_attn_mask = torch.zeros((*cross_attention_mask.shape[:-1], past_cross_attn_kv_length), dtype=dtype, device=device)
438
- # concatenate both on image-seq-length dimension
439
- cross_attention_mask = torch.cat([past_cross_attn_mask, cross_attention_mask], dim=-1)
440
-
441
- return cross_attention_mask, full_text_row_masked_out_mask
442
-
443
- def visual_encoder(self, pixel_values, aspect_ratio_ids, aspect_ratio_mask):
444
- if pixel_values is not None:
445
- if aspect_ratio_ids is None:
446
- raise ValueError("`aspect_ratio_ids` must be provided if `pixel_values` is provided")
447
- self.vision_encoder_infer_time = []
448
- start = time.perf_counter()
449
- # get vision tokens from vision model
450
- self.vision_request.start_async([pixel_values, aspect_ratio_ids, aspect_ratio_mask], share_inputs=True)
451
- self.vision_request.wait()
452
- end = time.perf_counter()
453
- cross_attn_key_values = [self.vision_request.get_tensor(name) for name in self.cross_attn_outputs]
454
- self.vision_encoder_infer_time.append(end - start)
455
- return cross_attn_key_values
456
-
457
- def prepare_vision_outputs(self, pixel_values, aspect_ratio_ids, aspect_ratio_mask, cross_attention_mask=None, past_key_values=None, cache_position=None):
458
- cross_attn_key_values = self.visual_encoder(pixel_values, aspect_ratio_ids, aspect_ratio_mask)
459
- cross_attn_key_values = [v.data for v in cross_attn_key_values]
460
- cross_attention_mask, full_text_row_masked_out_mask = self._prepare_cross_attention_mask(
461
- cross_attention_mask,
462
- past_key_values=past_key_values,
463
- num_vision_tokens=self.num_patches,
464
- cross_attention_layers=cross_attn_key_values if past_key_values is not None else None,
465
- cross_attention_states=1,
466
- device=self.device,
467
- dtype=torch.float32,
468
- )
469
-
470
- if cross_attention_mask is not None and cache_position is not None:
471
- cross_attention_mask = cross_attention_mask[:, :, cache_position]
472
- full_text_row_masked_out_mask = full_text_row_masked_out_mask[:, :, cache_position]
473
-
474
- return {
475
- "cross_attention_mask": cross_attention_mask,
476
- "full_text_row_masked_out_mask": full_text_row_masked_out_mask,
477
- "past_key_values": past_key_values,
478
- "cache_position": cache_position,
479
- "cross_attention_key_values": cross_attn_key_values,
480
- }
481
-
482
- def prepare_llm_inputs(
483
- self,
484
- input_ids,
485
- attention_mask,
486
- position_ids,
487
- cross_attention_mask,
488
- full_text_row_masked_out_mask,
489
- past_key_values,
490
- cache_position,
491
- cross_attention_key_values,
492
- ):
493
- model_inputs = {
494
- "input_ids": input_ids,
495
- "attention_mask": attention_mask,
496
- "position_ids": position_ids,
497
- "cross_attention_mask": cross_attention_mask,
498
- "full_text_row_masked_out_mask": full_text_row_masked_out_mask,
499
- "cache_position": cache_position,
500
- }
501
-
502
- if past_key_values is None:
503
- self.request.reset_state()
504
- self.next_beam_idx = np.arange(input_ids.shape[0], dtype=int)
505
- self._past_length = 0
506
-
507
- model_inputs.update(dict(zip(self.lm_cross_attn_inputs, cross_attention_key_values)))
508
- if "beam_idx" in self.input_names:
509
- model_inputs["beam_idx"] = self.next_beam_idx if self.next_beam_idx is not None else np.arange(input_ids.shape[0], dtype=int)
510
-
511
- return model_inputs
512
-
513
- def prepare_remote_tensors(self):
514
- context = core.get_default_context("GPU")
515
- for idx, name in enumerate(self.lm_cross_attn_inputs):
516
- remote_tensor = context.create_tensor(ov.Type.f16, ov.Shape([1, 32, 6404, 128]), {})
517
- self.vision_request.set_tensor(self.cross_attn_outputs[idx], remote_tensor)
518
- self.request.set_tensor(name, remote_tensor)
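
Since the deleted wrapper routes generation through transformers' GenerationMixin, a minimal usage sketch would look like the following. This is illustrative only: the model directory name, image path, and prompt string are placeholders, and it assumes the directory contains the converted OpenVINO XML files referenced above alongside the Hugging Face processor/config files.

from pathlib import Path

from PIL import Image
from transformers import AutoProcessor

from ov_mllama_generator_class import OVMLlamaForConditionalGeneration

# placeholder: directory holding the OV XML files plus processor/config files
model_dir = Path("mllama-11b-ov")
processor = AutoProcessor.from_pretrained(model_dir)
model = OVMLlamaForConditionalGeneration(model_dir, device="CPU")

image = Image.open("sample.jpg")  # placeholder image
prompt = "<|image|><|begin_of_text|>Describe this image."  # placeholder prompt format
inputs = processor(images=image, text=prompt, return_tensors="pt")

# generate() is inherited from GenerationMixin and drives the wrapper's
# prepare_inputs_for_generation() / forward() defined above
output_ids = model.generate(**inputs, max_new_tokens=64, do_sample=False)
print(processor.decode(output_ids[0], skip_special_tokens=True))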