lym0302 committed · Commit 28e0d96 · 1 Parent(s): 2e6c958
third_party/VideoLLaMA2/videollama2/__init__.py CHANGED
@@ -10,11 +10,12 @@ from .model import load_pretrained_model
 from .mm_utils import process_image, process_video, tokenizer_multimodal_token, get_model_name_from_path, KeywordsStoppingCriteria, process_audio_file
 from .constants import NUM_FRAMES, DEFAULT_IMAGE_TOKEN, DEFAULT_VIDEO_TOKEN, MODAL_INDEX_MAP, DEFAULT_AUDIO_TOKEN
 
+DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
 
 def model_init(model_path=None, **kwargs):
     model_path = "DAMO-NLP-SG/VideoLLaMA2-7B" if model_path is None else model_path
     model_name = get_model_name_from_path(model_path)
-    tokenizer, model, processor, context_len = load_pretrained_model(model_path, None, model_name, **kwargs)
+    tokenizer, model, processor, context_len = load_pretrained_model(model_path, None, model_name, device=DEVICE, **kwargs)
 
     if tokenizer.pad_token is None and tokenizer.unk_token is not None:
         tokenizer.pad_token = tokenizer.unk_token
@@ -60,9 +61,15 @@ def mm_infer(image_or_video, instruct, model, tokenizer, modal='video', **kwargs
         tensor = None
     else:
         if isinstance(image_or_video, dict):
-            tensor = {k: v.half().cuda() for k, v in image_or_video.items()}
+            if DEVICE == "cuda":
+                tensor = {k: v.half().cuda() for k, v in image_or_video.items()}
+            else:
+                tensor = {k: v.half().cpu() for k, v in image_or_video.items()}
         else:
-            tensor = image_or_video.half().cuda()
+            if DEVICE == "cuda":
+                tensor = image_or_video.half().cuda()
+            else:
+                tensor = image_or_video.half().cpu()
         tensor = [(tensor, modal)]
 
     # 2. text preprocess (tag process & generate prompt).
@@ -88,8 +95,12 @@ def mm_infer(image_or_video, instruct, model, tokenizer, modal='video', **kwargs
     message = system_message + message
     prompt = tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=True)
 
-    input_ids = tokenizer_multimodal_token(prompt, tokenizer, modal_token, return_tensors='pt').unsqueeze(0).long().cuda()
-    attention_masks = input_ids.ne(tokenizer.pad_token_id).long().cuda()
+    if DEVICE == "cuda":
+        input_ids = tokenizer_multimodal_token(prompt, tokenizer, modal_token, return_tensors='pt').unsqueeze(0).long().cuda()
+        attention_masks = input_ids.ne(tokenizer.pad_token_id).long().cuda()
+    else:
+        input_ids = tokenizer_multimodal_token(prompt, tokenizer, modal_token, return_tensors='pt').unsqueeze(0).long().cpu()
+        attention_masks = input_ids.ne(tokenizer.pad_token_id).long().cpu()
 
     # 3. generate response according to visual signals and prompts.
     keywords = [tokenizer.eos_token]
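
Review note on the device plumbing above: every added `if DEVICE == "cuda": ... else: ...` pair does the same thing as a single `.to(DEVICE)`, which is a no-op when the tensor already lives on the target device. Below is a minimal sketch of the collapsed form, assuming `import torch` appears earlier in the module (the hunk starts at source line 10, so the top-of-file imports are not visible here; the added `DEVICE` line already depends on it). `prepare_visual` is a hypothetical helper name, not part of the repo:

    import torch

    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

    def prepare_visual(image_or_video, device=DEVICE):
        # .to() is a no-op when the tensor already lives on `device`, so one
        # path covers both cases instead of branching on DEVICE three times.
        # Caveat: many PyTorch CPU kernels have no float16 implementation, so
        # the patch's .half().cpu() path may fail at generation time; keeping
        # float32 on CPU is a common workaround (a deviation from this patch).
        dtype = torch.float16 if device == 'cuda' else torch.float32
        if isinstance(image_or_video, dict):
            return {k: v.to(device, dtype=dtype) for k, v in image_or_video.items()}
        return image_or_video.to(device, dtype=dtype)

The prompt tensors collapse the same way: `input_ids = tokenizer_multimodal_token(...).unsqueeze(0).long().to(DEVICE)`, after which the trailing `.cuda()`/`.cpu()` on `attention_masks` is redundant, since `input_ids.ne(...)` already lands on the same device as `input_ids`.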
third_party/VideoLLaMA2/videollama2/model/__init__.py CHANGED
@@ -51,6 +51,7 @@ VLLMConfigs = {
 }
 
 
+
 def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto", device="cuda", use_flash_attn=False, **kwargs):
 
     if 'token' in kwargs:
@@ -209,6 +210,6 @@ def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, l
     if hasattr(model.config, "mm_audio_tower"):
         nname = model.config.mm_audio_tower.split("/")[-1]
         model.config.mm_audio_tower = os.path.join(model_path, nname)
-
+
 
     return tokenizer, model, processor, context_len
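
Apart from the whitespace-only tweaks in this file, the interaction worth flagging sits between the two files: `load_pretrained_model` keeps its `device="cuda"` default, while `model_init` now pins `device=DEVICE` before forwarding `**kwargs`. A caller who also passes `device` through `model_init` will therefore hit a `TypeError` (duplicate `device` keyword at the `load_pretrained_model` call). A minimal guard, sketched as it would sit in videollama2/__init__.py where these names are already in scope; `kwargs.setdefault` is the only change to the patched function:

    def model_init(model_path=None, **kwargs):
        model_path = "DAMO-NLP-SG/VideoLLaMA2-7B" if model_path is None else model_path
        model_name = get_model_name_from_path(model_path)
        # Let an explicit caller-supplied device win; fall back to the
        # module-level DEVICE ('cuda' when available, else 'cpu') otherwise.
        kwargs.setdefault('device', DEVICE)
        tokenizer, model, processor, context_len = load_pretrained_model(
            model_path, None, model_name, **kwargs)
        ...  # pad_token fallback and the rest of the function as in the patch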