lym0302 committed
Commit 2e6c958 · 1 Parent(s): 90a9098
Files changed (2):
  1. pipeline/pipeline.py +8 -7
  2. pipeline/step02.py +73 -0
pipeline/pipeline.py CHANGED

@@ -1,28 +1,29 @@
 # coding=utf-8
 
-from .step0 import Step0
+# from .step0 import Step0
 from .step1 import Step1
-from .step2 import Step2
+# from .step2 import Step2
 from .step3 import Step3
 from .step4 import Step4
+from .step02 import Step02
 import logging
 import re
 import os
 
 class Pipeline:
     def __init__(self, step0_model_dir, step1_mode, step2_model_dir, step2_mode, step3_mode):
-        self.step0 = Step0(step0_model_dir)
+        # self.step0 = Step0(step0_model_dir)
+        self.step02 = Step02(step0_model_dir, step2_mode)
         self.step1 = Step1(step1_mode)
-        self.step2 = Step2(step2_model_dir, step2_mode)
+        # self.step2 = Step2(step2_model_dir, step2_mode)
         self.step3 = Step3(model_type=step3_mode)
         self.step4 = Step4()
-        self.step_processors = [self.step1, self.step2, self.step3, self.step4]
         self.log = logging.getLogger(self.__class__.__name__)
         self.log.setLevel(logging.INFO)
 
 
     def run(self, video_input, output_dir, mode='s4', postp_mode='rep', prompt='', negative_prompt='', duration=10, seed=42):
-        step0_resp = self.step0.run(video_input)
+        step0_resp = self.step02.run_step0(video_input)
         step0_resp_list = re.findall(r'(Step\d:.*?)(?=Step\d:|$)', step0_resp, re.DOTALL)
         step_infos = [step_info.strip().split("\n")[0] for step_info in step0_resp_list]
         step3_temp_dir = os.path.join(output_dir, "remove_vo")
@@ -36,7 +37,7 @@ class Pipeline:
             step_results["step1_video_path"] = step1_video_path
 
         elif step_info == 'Step2: Given a video and its generated audio, determine whether the audio contains voice-over.':
-            is_vo = self.step2.run(str(step_results["step1_video_path"]))
+            is_vo = self.step02.run_step2(str(step_results["step1_video_path"]))
            step_results["is_vo"] = is_vo
             if not step_results["is_vo"]: # not voice-over
                 step_results["temp_final_audio_path"] = step_results["step1_audio_path"]
pipeline/step02.py ADDED

@@ -0,0 +1,73 @@
+# coding=utf-8
+# judge voice-over
+
+from third_party.VideoLLaMA2.videollama2 import model_init, mm_infer
+import logging
+
+
+class Step02:
+    def __init__(self, model_path, step2_mode):
+        self.modal = "video"
+        self.log = logging.getLogger(self.__class__.__name__)
+        self.log.setLevel(logging.INFO)
+        self.model, self.processor, self.tokenizer = model_init(model_path)
+        self.preprocess = self.processor[self.modal]
+        self.step2_mode = step2_mode
+
+    def run_step0(self, video_path, modal_type='v'):
+        question = "Generate high-quality audio from video step-by-step."
+        if modal_type == "a":
+            self.model.model.vision_tower = None
+        elif modal_type == "v":
+            self.model.model.audio_tower = None
+        elif modal_type == "av":
+            pass
+        else:
+            raise NotImplementedError
+
+        self.log.info("######################################################################################################")
+        self.log.info("Generate high-quality audio from video step-by-step...")
+        audio_video_tensor = self.preprocess(video_path, va=False)
+        output = mm_infer(
+            audio_video_tensor,
+            question,
+            model=self.model,
+            tokenizer=self.tokenizer,
+            modal=self.modal,
+            do_sample=False,
+        )
+
+        return output
+
+
+    def run_step2(self, video_audio_path, modal_type='av'):
+        question = "Given a video and its corresponding audio, determine whether the audio contains voice-over? Options: A. Yes, B. No. Choose A or B."
+        if modal_type == "a":
+            self.model.model.vision_tower = None
+        elif modal_type == "v":
+            self.model.model.audio_tower = None
+        elif modal_type == "av":
+            pass
+        else:
+            raise NotImplementedError
+        audio_video_tensor = self.preprocess(video_audio_path, va=True)
+        output = mm_infer(
+            audio_video_tensor,
+            question,
+            model=self.model,
+            tokenizer=self.tokenizer,
+            modal=self.modal,
+            do_sample=False,
+        )
+
+        if self.step2_mode == "cot":
+            output = output.split("<CONCLUSION>")[-1][1]  # keep the answer letter that follows the <CONCLUSION> tag
+        self.log.info(f"Step2 answer: {output}")
+        output = (output == "A")
+
+        if output:
+            self.log.info(f"The video generated by Step1 ({video_audio_path}) contains voice-over.")
+        else:
+            self.log.info(f"The video generated by Step1 ({video_audio_path}) does not contain voice-over.")
+        self.log.info("Finish Step2 successfully.\n")
+        return output
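For reference, a standalone sketch of how the new Step02 class can be exercised outside the Pipeline, assuming a local VideoLLaMA2-style checkpoint and sample clips; the checkpoint path and file names are illustrative assumptions, not part of this commit:

```python
# Standalone sketch for Step02 (checkpoint path and clip names are hypothetical).
from pipeline.step02 import Step02

step02 = Step02(model_path="ckpts/videollama2-step02", step2_mode="cot")

# Step 0: ask the model for the step-by-step audio-generation plan (video-only input).
plan = step02.run_step0("examples/demo.mp4")
print(plan)

# Step 2: after Step1 has muxed generated audio into the clip, check for voice-over (audio+video input).
is_vo = step02.run_step2("outputs/demo/step1_with_audio.mp4")
print("voice-over detected:", is_vo)
```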