# coding=utf-8
# Step 2: judge whether a generated video's audio contains voice-over

from third_party.VideoLLaMA2.videollama2 import model_init, mm_infer
import logging

    
class Step02:
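    """Voice-over judge built on a VideoLLaMA2 checkpoint.

    Loads the model once at construction; `step2_mode` controls how answers
    are parsed (in "cot" mode the option letter follows a "<CONCLUSION>" tag).
    """
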
    def __init__(self, model_path, step2_mode):
        self.modal = "video"
        self.log = logging.getLogger(self.__class__.__name__)
        self.log.setLevel(logging.INFO)
        self.model, self.processor, self.tokenizer = model_init(model_path)
        self.preprocess = self.processor[self.modal]
        self.step2_mode = step2_mode

    def run_step0(self, video_path, modal_type='v'):
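        """Prompt the model to reason step-by-step about audio for the video.

        The video is preprocessed without its audio track (va=False); the
        model's raw text answer is returned. `modal_type` is currently unused.
        """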
        question = "Generate high-quality audio from video step-by-step."
        # modal_type ('a' / 'v' / 'av') is reserved for disabling the unused
        # vision/audio tower before inference; it is currently not used here.

        self.log.info("#" * 100)
        self.log.info("Generate high-quality audio from video step-by-step...")
        audio_video_tensor = self.preprocess(video_path, va=False)
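        # Greedy decoding (do_sample=False) keeps the answer deterministic.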
        output = mm_infer(
            audio_video_tensor,
            question,
            model=self.model,
            tokenizer=self.tokenizer,
            modal=self.modal,
            do_sample=False,
        )

        return output


    def run_step2(self, video_audio_path, modal_type='av'):
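        """Return True if the model judges the clip's audio to contain voice-over.

        The clip is preprocessed with its audio track (va=True) and the model
        answers a binary A/B question. In "cot" mode the option letter is
        first extracted from the text after the "<CONCLUSION>" tag.
        `modal_type` is accepted for API symmetry but currently unused.
        """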
        question = "Given a video and its corresponding audio, determine whether the audio contains voice-over? Options: A. Yes, B. No. Choose A or B."
        audio_video_tensor = self.preprocess(video_audio_path, va=True)
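        # va=True loads the audio track together with the frames so the model
        # can judge the soundtrack; greedy decoding again for a stable answer.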
        output = mm_infer(
            audio_video_tensor,
            question,
            model=self.model,
            tokenizer=self.tokenizer,
            modal=self.modal,
            do_sample=False,
        )
        
        if self.step2_mode == "cot":
            # The CoT answer ends with "<CONCLUSION>" followed by the option
            # letter; strip whitespace and keep only that letter.
            output = output.split("<CONCLUSION>")[-1].strip()[:1]
        self.log.debug("Step2 raw answer: %s", output)
        output = (output == "A")

        if output:
            self.log.info(f"The video generated by Step1 ({video_audio_path}) contains voice-over.")
        else:
            self.log.info(f"The video generated by Step1 ({video_audio_path}) does not contain voice-over.")
        self.log.info("Finished Step2 successfully.\n")
        return output
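

# Minimal usage sketch, assuming a local VideoLLaMA2 checkpoint; the paths
# below ("checkpoints/videollama2", "outputs/sample_with_audio.mp4") are
# hypothetical placeholders, not files shipped with this repo.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    judge = Step02(model_path="checkpoints/videollama2", step2_mode="cot")
    has_voice_over = judge.run_step2("outputs/sample_with_audio.mp4")
    print(f"Voice-over detected: {has_voice_over}")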