Spaces:
Running
Running
File size: 2,698 Bytes
2e6c958 4083b70 2e6c958 4083b70 2e6c958 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
# coding=utf-8
# Step 2: judge whether a video's audio track contains voice-over.
from third_party.VideoLLaMA2.videollama2 import model_init, mm_infer
import logging
class Step02:
    """Query a VideoLLaMA2 model about a video clip.

    The model, processor and tokenizer are loaded once in ``__init__`` via
    ``model_init`` and reused for every call.  Two queries are exposed:

    * ``run_step0`` sends a fixed "generate audio step-by-step" prompt and
      returns the raw model output.
    * ``run_step2`` asks an A/B question about whether the audio contains
      voice-over and returns the parsed answer as a ``bool``.
    """

    # Marker that precedes the final answer when ``step2_mode == "cot"``.
    _CONCLUSION_TAG = "<CONCLUSION>"

    def __init__(self, model_path, step2_mode: str):
        """Load the model and remember how step-2 output should be parsed.

        Args:
            model_path: Forwarded unchanged to ``model_init``.
            step2_mode: Output-parsing mode for ``run_step2``.  ``"cot"``
                means the answer letter is read from the text following the
                last ``<CONCLUSION>`` tag; any other value means the whole
                model output is compared against ``"A"``.
        """
        self.modal = "video"
        self.log = logging.getLogger(self.__class__.__name__)
        self.log.setLevel(logging.INFO)
        self.model, self.processor, self.tokenizer = model_init(model_path)
        # Pick the preprocessing callable registered for the "video" modality.
        self.preprocess = self.processor[self.modal]
        self.step2_mode = step2_mode

    def _infer(self, media_path, question: str, va: bool):
        """Preprocess ``media_path`` and run one greedy inference pass.

        Args:
            media_path: Path handed to the video preprocessor.
            question: Prompt text passed to ``mm_infer``.
            va: Forwarded to the preprocessor as its ``va`` keyword
                (presumably "video + audio" -- TODO confirm against the
                VideoLLaMA2 processor).

        Returns:
            Whatever ``mm_infer`` returns for this input.
        """
        media_tensor = self.preprocess(media_path, va=va)
        return mm_infer(
            media_tensor,
            question,
            model=self.model,
            tokenizer=self.tokenizer,
            modal=self.modal,
            do_sample=False,
        )

    @staticmethod
    def _answer_is_yes(output: str, step2_mode: str) -> bool:
        """Return True when the model output selects option ``A``.

        In ``"cot"`` mode only the text after the last ``<CONCLUSION>`` tag
        is considered; leading whitespace is skipped and the first remaining
        character is compared with ``"A"``.  This avoids the ``IndexError``
        and the off-by-one misreads that a fixed character index produces
        when the tag is followed by zero or several whitespace characters.

        In every other mode the whole output must equal ``"A"``.
        """
        if step2_mode != "cot":
            return output == "A"
        conclusion = output.split(Step02._CONCLUSION_TAG)[-1].lstrip()
        # Slicing (rather than indexing) yields "" for an empty conclusion.
        return conclusion[:1] == "A"

    def run_step0(self, video_path, modal_type='v'):
        """Ask the model to describe audio generation for a video.

        Args:
            video_path: Path of the video to preprocess (with ``va=False``).
            modal_type: Currently unused; kept so existing callers that pass
                it keep working.

        Returns:
            The raw output of ``mm_infer``.
        """
        question = "Generate high-quality audio from video step-by-step."
        self.log.info("######################################################################################################")
        self.log.info("Generate high-quality audio from video step-by-step...")
        return self._infer(video_path, question, va=False)

    def run_step2(self, video_audio_path, modal_type='av'):
        """Decide whether the audio of a video contains voice-over.

        Args:
            video_audio_path: Path of the video (with audio) to preprocess
                (with ``va=True``).
            modal_type: Currently unused; kept so existing callers that pass
                it keep working.

        Returns:
            ``True`` if the model chose option A ("Yes"), ``False`` otherwise.
        """
        question = "Given a video and its corresponding audio, determine whether the audio contains voice-over? Options: A. Yes, B. No. Choose A or B."
        raw_output = self._infer(video_audio_path, question, va=True)
        # Diagnostic only; emitted through the logger instead of stdout.
        self.log.debug("Raw step-2 model output: %r", raw_output)
        has_voice_over = self._answer_is_yes(raw_output, self.step2_mode)
        if has_voice_over:
            self.log.info(f"The video generated by Step1 ({video_audio_path}) contains voice-over.")
        else:
            self.log.info(f"The video generated by Step1 ({video_audio_path}) does not contain voice-over.")
        self.log.info("Finish Step2 successfully.\n")
        return has_voice_over
|