lym0302 committed · Commit 28e0d96 · 1 Parent(s): 2e6c958
third_party/VideoLLaMA2/videollama2/__init__.py CHANGED
@@ -10,11 +10,12 @@ from .model import load_pretrained_model
 from .mm_utils import process_image, process_video, tokenizer_multimodal_token, get_model_name_from_path, KeywordsStoppingCriteria, process_audio_file
 from .constants import NUM_FRAMES, DEFAULT_IMAGE_TOKEN, DEFAULT_VIDEO_TOKEN, MODAL_INDEX_MAP, DEFAULT_AUDIO_TOKEN
 
+DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
 
 def model_init(model_path=None, **kwargs):
     model_path = "DAMO-NLP-SG/VideoLLaMA2-7B" if model_path is None else model_path
     model_name = get_model_name_from_path(model_path)
-    tokenizer, model, processor, context_len = load_pretrained_model(model_path, None, model_name, **kwargs)
+    tokenizer, model, processor, context_len = load_pretrained_model(model_path, None, model_name, device=DEVICE, **kwargs)
 
     if tokenizer.pad_token is None and tokenizer.unk_token is not None:
         tokenizer.pad_token = tokenizer.unk_token
@@ -60,9 +61,15 @@ def mm_infer(image_or_video, instruct, model, tokenizer, modal='video', **kwargs
         tensor = None
     else:
         if isinstance(image_or_video, dict):
-            tensor = {k: v.half().cuda() for k, v in image_or_video.items()}
+            if DEVICE == "cuda":
+                tensor = {k: v.half().cuda() for k, v in image_or_video.items()}
+            else:
+                tensor = {k: v.half().cpu() for k, v in image_or_video.items()}
         else:
-            tensor = image_or_video.half().cuda()
+            if DEVICE == "cuda":
+                tensor = image_or_video.half().cuda()
+            else:
+                tensor = image_or_video.half().cpu()
         tensor = [(tensor, modal)]
 
     # 2. text preprocess (tag process & generate prompt).
@@ -88,8 +95,12 @@ def mm_infer(image_or_video, instruct, model, tokenizer, modal='video', **kwargs
     message = system_message + message
     prompt = tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=True)
 
-    input_ids = tokenizer_multimodal_token(prompt, tokenizer, modal_token, return_tensors='pt').unsqueeze(0).long().cuda()
-    attention_masks = input_ids.ne(tokenizer.pad_token_id).long().cuda()
+    if DEVICE == "cuda":
+        input_ids = tokenizer_multimodal_token(prompt, tokenizer, modal_token, return_tensors='pt').unsqueeze(0).long().cuda()
+        attention_masks = input_ids.ne(tokenizer.pad_token_id).long().cuda()
+    else:
+        input_ids = tokenizer_multimodal_token(prompt, tokenizer, modal_token, return_tensors='pt').unsqueeze(0).long().cpu()
+        attention_masks = input_ids.ne(tokenizer.pad_token_id).long().cpu()
 
     # 3. generate response according to visual signals and prompts.
     keywords = [tokenizer.eos_token]
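
Review note on the device plumbing above: every added `if DEVICE == "cuda": ... else: ...` pair does the same thing as a single `.to(DEVICE)`, which is a no-op when the tensor already lives on the target device. Below is a minimal sketch of the collapsed form, assuming `import torch` appears earlier in the module (the hunk starts at source line 10, so the top-of-file imports are not visible here; the added `DEVICE` line already depends on it). `prepare_visual` is a hypothetical helper name, not part of the repo:

    import torch

    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

    def prepare_visual(image_or_video, device=DEVICE):
        # .to() is a no-op when the tensor already lives on `device`, so one
        # path covers both cases instead of branching on DEVICE three times.
        # Caveat: many PyTorch CPU kernels have no float16 implementation, so
        # the patch's .half().cpu() path may fail at generation time; keeping
        # float32 on CPU is a common workaround (a deviation from this patch).
        dtype = torch.float16 if device == 'cuda' else torch.float32
        if isinstance(image_or_video, dict):
            return {k: v.to(device, dtype=dtype) for k, v in image_or_video.items()}
        return image_or_video.to(device, dtype=dtype)

The prompt tensors collapse the same way: `input_ids = tokenizer_multimodal_token(...).unsqueeze(0).long().to(DEVICE)`, after which the trailing `.cuda()`/`.cpu()` on `attention_masks` is redundant, since `input_ids.ne(...)` already lands on the same device as `input_ids`.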
third_party/VideoLLaMA2/videollama2/model/__init__.py CHANGED
@@ -51,6 +51,7 @@ VLLMConfigs = {
 }
 
 
+
 def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto", device="cuda", use_flash_attn=False, **kwargs):
 
     if 'token' in kwargs:
@@ -209,6 +210,6 @@ def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, l
     if hasattr(model.config, "mm_audio_tower"):
         nname = model.config.mm_audio_tower.split("/")[-1]
         model.config.mm_audio_tower = os.path.join(model_path, nname)
-
+
 
     return tokenizer, model, processor, context_len
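
Apart from the whitespace-only tweaks in this file, the interaction worth flagging sits between the two files: `load_pretrained_model` keeps its `device="cuda"` default, while `model_init` now pins `device=DEVICE` before forwarding `**kwargs`. A caller who also passes `device` through `model_init` will therefore hit a `TypeError` (duplicate `device` keyword at the `load_pretrained_model` call). A minimal guard, sketched as it would sit in videollama2/__init__.py where these names are already in scope; `kwargs.setdefault` is the only change to the patched function:

    def model_init(model_path=None, **kwargs):
        model_path = "DAMO-NLP-SG/VideoLLaMA2-7B" if model_path is None else model_path
        model_name = get_model_name_from_path(model_path)
        # Let an explicit caller-supplied device win; fall back to the
        # module-level DEVICE ('cuda' when available, else 'cpu') otherwise.
        kwargs.setdefault('device', DEVICE)
        tokenizer, model, processor, context_len = load_pretrained_model(
            model_path, None, model_name, **kwargs)
        ...  # pad_token fallback and the rest of the function as in the patch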