doberst committed on
Commit
eff599d
1 Parent(s): b3d87a1

Delete ov_mllama_generator_script.py

Browse files
Files changed (1) hide show
  1. ov_mllama_generator_script.py +0 -51
ov_mllama_generator_script.py DELETED
@@ -1,51 +0,0 @@
1
-
2
- """ Main inference generation for mLlama-3.2-11B compressed and packaged as OV model
3
-
4
- -- accompanying generator_class file - ov_mllama_generator_class.py
5
-
6
- -- dependencies: transformers and torch
7
-
8
- """
9
-
10
- import requests
11
- import openvino as ov
12
-
13
- from PIL import Image
14
- from transformers import TextStreamer, AutoProcessor
15
- import numpy as np
16
-
17
- from ov_mllama_generator_class import OVMLlamaForConditionalGeneration
18
-
19
- model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
20
- model_dir = "C:\\Users\\darre\\llmware_data\\model_repo\\llama-11b-vision-instruct-ov"
21
-
22
- core = ov.Core()
23
-
24
- language_model_name = "llm_int4_asym_r10_gs64_max_activation_variance_scale_all_layers.xml"
25
- vision_encoder_name = "openvino_vision_encoder_int8.xml"
26
- device="CPU"
27
-
28
- ov_model = OVMLlamaForConditionalGeneration(model_dir, device=device,
29
- language_model_name=language_model_name,
30
- image_encoder_name=vision_encoder_name)
31
-
32
- processor = AutoProcessor.from_pretrained(model_dir)
33
-
34
- question = "What is unusual on this image?"
35
-
36
- messages = [
37
- {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": question}]},
38
- ]
39
- text = processor.tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
40
- url = "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11"
41
- raw_image = Image.open(requests.get(url, stream=True).raw)
42
-
43
- inputs = processor(text=text, images=[raw_image], return_tensors="pt")
44
- streamer = TextStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
45
- print(f"Question: {question}")
46
-
47
- output = ov_model.generate(**inputs, do_sample=False, max_new_tokens=100, temperature=None, top_p=None, streamer=streamer)
48
- print(f"Visual encoder time {ov_model.vision_encoder_infer_time[0] * 1000 :.2f} ms")
49
- print(f"First token latency {ov_model.llm_infer_time[0] * 1000 :.2f}ms, Second token latency {np.mean(np.array(ov_model.llm_infer_time[1:])) * 1000:.2f}ms")
50
-
51
-