Commit 28e0d96 by lym0302 (parent: 2e6c958): "cuda->cpu"

The commit makes VideoLLaMA2 inference fall back to CPU when CUDA is unavailable: a module-level `DEVICE` switch is introduced and threaded through model loading and tensor placement, replacing the previous unconditional `.cuda()` calls.
third_party/VideoLLaMA2/videollama2/__init__.py
CHANGED
```diff
@@ -10,11 +10,12 @@ from .model import load_pretrained_model
 from .mm_utils import process_image, process_video, tokenizer_multimodal_token, get_model_name_from_path, KeywordsStoppingCriteria, process_audio_file
 from .constants import NUM_FRAMES, DEFAULT_IMAGE_TOKEN, DEFAULT_VIDEO_TOKEN, MODAL_INDEX_MAP, DEFAULT_AUDIO_TOKEN
 
+DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
 
 def model_init(model_path=None, **kwargs):
     model_path = "DAMO-NLP-SG/VideoLLaMA2-7B" if model_path is None else model_path
     model_name = get_model_name_from_path(model_path)
-    tokenizer, model, processor, context_len = load_pretrained_model(model_path, None, model_name, **kwargs)
+    tokenizer, model, processor, context_len = load_pretrained_model(model_path, None, model_name, device=DEVICE, **kwargs)
 
     if tokenizer.pad_token is None and tokenizer.unk_token is not None:
         tokenizer.pad_token = tokenizer.unk_token
```
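The new module-level switch relies on `torch.cuda.is_available()`, so the hunk assumes `torch` is already imported near the top of `__init__.py` (the import is not part of this diff). A minimal, self-contained sketch of the same fallback pattern; the `Linear` module and random tensor are illustrative, not part of the commit:

```python
import torch

# Same fallback the commit introduces: prefer CUDA when a GPU is visible.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Illustrative only: any module/tensor can be placed with .to(DEVICE),
# which is what threading device=DEVICE into load_pretrained_model is for.
model = torch.nn.Linear(8, 8).to(DEVICE)
x = torch.randn(1, 8, device=DEVICE)
y = model(x)  # runs on whichever device was selected
```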
```diff
@@ -60,9 +61,15 @@ def mm_infer(image_or_video, instruct, model, tokenizer, modal='video', **kwargs):
         tensor = None
     else:
         if isinstance(image_or_video, dict):
-            tensor = {k: v.half().cuda() for k, v in image_or_video.items()}
+            if DEVICE == "cuda":
+                tensor = {k: v.half().cuda() for k, v in image_or_video.items()}
+            else:
+                tensor = {k: v.half().cpu() for k, v in image_or_video.items()}
         else:
-            tensor = image_or_video.half().cuda()
+            if DEVICE == "cuda":
+                tensor = image_or_video.half().cuda()
+            else:
+                tensor = image_or_video.half().cpu()
         tensor = [(tensor, modal)]
 
     # 2. text preprocess (tag process & generate prompt).
```
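Since `Tensor.cuda()` and `Tensor.cpu()` are shorthand for `.to('cuda')` and `.to('cpu')`, the four-way branch above could be collapsed with `.to(DEVICE)`. A sketch of that equivalent refactor (a suggestion, not what the commit ships; `image_or_video` and `modal` are the surrounding `mm_infer` locals):

```python
# Branch-free equivalent of the hunk above; behavior is unchanged.
# Note the commit keeps .half() on the CPU path too, even though
# fp16 kernel coverage on CPU is limited.
if isinstance(image_or_video, dict):
    tensor = {k: v.half().to(DEVICE) for k, v in image_or_video.items()}
else:
    tensor = image_or_video.half().to(DEVICE)
tensor = [(tensor, modal)]
```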
```diff
@@ -88,8 +95,12 @@ def mm_infer(image_or_video, instruct, model, tokenizer, modal='video', **kwargs):
     message = system_message + message
     prompt = tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=True)
 
-    input_ids = tokenizer_multimodal_token(prompt, tokenizer, modal_token, return_tensors='pt').unsqueeze(0).long().cuda()
-    attention_masks = input_ids.ne(tokenizer.pad_token_id).long().cuda()
+    if DEVICE == "cuda":
+        input_ids = tokenizer_multimodal_token(prompt, tokenizer, modal_token, return_tensors='pt').unsqueeze(0).long().cuda()
+        attention_masks = input_ids.ne(tokenizer.pad_token_id).long().cuda()
+    else:
+        input_ids = tokenizer_multimodal_token(prompt, tokenizer, modal_token, return_tensors='pt').unsqueeze(0).long().cpu()
+        attention_masks = input_ids.ne(tokenizer.pad_token_id).long().cpu()
 
     # 3. generate response according to visual signals and prompts.
     keywords = [tokenizer.eos_token]
```
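The same collapse applies to the token tensors. One detail worth noting: `input_ids.ne(...)` already returns a tensor on `input_ids`' device, so the trailing `.cuda()`/`.cpu()` on `attention_masks` is effectively a no-op. A branch-free sketch (refactor suggestion, not the committed code):

```python
# Equivalent to the hunk above without device branching.
input_ids = tokenizer_multimodal_token(
    prompt, tokenizer, modal_token, return_tensors='pt'
).unsqueeze(0).long().to(DEVICE)
# ne() yields a mask on input_ids' device already; no extra move needed.
attention_masks = input_ids.ne(tokenizer.pad_token_id).long()
```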
third_party/VideoLLaMA2/videollama2/model/__init__.py
CHANGED
```diff
@@ -51,6 +51,7 @@ VLLMConfigs = {
 }
 
 
+
 def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto", device="cuda", use_flash_attn=False, **kwargs):
 
     if 'token' in kwargs:
```
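With the `__init__.py` change above, `model_init` now forwards `device=DEVICE` into this signature, whose default remains `device="cuda"`. A hedged sketch of calling it directly on a CPU-only host; the `model_name` value is illustrative (it would normally come from `get_model_name_from_path`):

```python
from videollama2.model import load_pretrained_model

# Positional arguments mirror the signature above; kwargs are illustrative.
tokenizer, model, processor, context_len = load_pretrained_model(
    "DAMO-NLP-SG/VideoLLaMA2-7B",  # model_path (the model_init default)
    None,                          # model_base
    "VideoLLaMA2-7B",              # model_name, normally via get_model_name_from_path
    device="cpu",                  # override the "cuda" default
)
```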
```diff
@@ -209,6 +210,6 @@ def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto", device="cuda", use_flash_attn=False, **kwargs):
     if hasattr(model.config, "mm_audio_tower"):
         nname = model.config.mm_audio_tower.split("/")[-1]
         model.config.mm_audio_tower = os.path.join(model_path, nname)
-
+
 
     return tokenizer, model, processor, context_len
```

The changed lines in both hunks of this file render empty in the source view, so these edits appear to be whitespace-only.