Delete ov_mllama_generator_script.py
Browse files
ov_mllama_generator_script.py
DELETED
@@ -1,51 +0,0 @@
|
|
1 |
-
|
2 |
-
"""Main inference generation for mLlama-3.2-11B compressed and packaged as OV model.

-- accompanying generator_class file - ov_mllama_generator_class.py
-- dependencies imported here: requests, openvino, Pillow, transformers, numpy
   (the processor is asked for PyTorch tensors via return_tensors="pt")

The script loads the OpenVINO language model and vision encoder from a local
directory, downloads one sample image, asks a single question about it,
streams the generated answer to stdout and finally prints timing figures
collected by the model wrapper.
"""

import numpy as np
import openvino as ov
import requests
from PIL import Image
from transformers import AutoProcessor, TextStreamer

from ov_mllama_generator_class import OVMLlamaForConditionalGeneration

# Upstream model identifier; kept for reference, not referenced again below.
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"

# NOTE(review): machine-specific absolute path - adjust to the local model
# repository before running on another machine.
model_dir = "C:\\Users\\darre\\llmware_data\\model_repo\\llama-11b-vision-instruct-ov"

# OpenVINO runtime handle; not referenced again in this script.
core = ov.Core()

# File names of the two OpenVINO IR models handed to the model wrapper.
language_model_name = "llm_int4_asym_r10_gs64_max_activation_variance_scale_all_layers.xml"
vision_encoder_name = "openvino_vision_encoder_int8.xml"
device = "CPU"

ov_model = OVMLlamaForConditionalGeneration(
    model_dir,
    device=device,
    language_model_name=language_model_name,
    image_encoder_name=vision_encoder_name,
)

processor = AutoProcessor.from_pretrained(model_dir)

question = "What is unusual on this image?"

# Single-turn chat: one image placeholder followed by the text question.
messages = [
    {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": question}]},
]
text = processor.tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

url = "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11"

# Bound the wait with a timeout and fail loudly on an HTTP error, so that an
# error page is never handed to the image decoder and a stalled connection
# cannot hang the script indefinitely.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
raw_image = Image.open(response.raw)

inputs = processor(text=text, images=[raw_image], return_tensors="pt")
streamer = TextStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
print(f"Question: {question}")

# Greedy decoding: sampling is off, so temperature/top_p are explicitly unset.
output = ov_model.generate(
    **inputs,
    do_sample=False,
    max_new_tokens=100,
    temperature=None,
    top_p=None,
    streamer=streamer,
)

# Timing lists are indexed per inference call; values are scaled by 1000 for
# display in milliseconds.
vision_encoder_ms = ov_model.vision_encoder_infer_time[0] * 1000
print(f"Visual encoder time {vision_encoder_ms:.2f} ms")

first_token_ms = ov_model.llm_infer_time[0] * 1000
later_token_times = ov_model.llm_infer_time[1:]

# len() rather than truthiness: the container type is defined by the wrapper
# class and may not support a plain boolean test.
if len(later_token_times) > 0:
    # The figure labelled "Second token latency" is the mean over every
    # inference call after the first one.
    later_tokens_ms = np.mean(np.array(later_token_times)) * 1000
    print(f"First token latency {first_token_ms:.2f}ms, Second token latency {later_tokens_ms:.2f}ms")
else:
    # Only one inference call was recorded; averaging an empty slice would
    # print "nan" and emit a RuntimeWarning, so report the first token only.
    print(f"First token latency {first_token_ms:.2f}ms")
|
50 |
-
|
51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|