Spaces:
Running
Running
lym0302
committed on
Commit
·
2e6c958
1
Parent(s):
90a9098
step02
Browse files
- pipeline/pipeline.py +8 -7
- pipeline/step02.py +73 -0
pipeline/pipeline.py
CHANGED
@@ -1,28 +1,29 @@
|
|
1 |
# coding=utf-8
|
2 |
|
3 |
-
from .step0 import Step0
|
4 |
from .step1 import Step1
|
5 |
-
from .step2 import Step2
|
6 |
from .step3 import Step3
|
7 |
from .step4 import Step4
|
|
|
8 |
import logging
|
9 |
import re
|
10 |
import os
|
11 |
|
12 |
class Pipeline:
|
13 |
def __init__(self, step0_model_dir, step1_mode, step2_model_dir, step2_mode, step3_mode):
|
14 |
-
self.step0 = Step0(step0_model_dir)
|
|
|
15 |
self.step1 = Step1(step1_mode)
|
16 |
-
self.step2 = Step2(step2_model_dir, step2_mode)
|
17 |
self.step3 = Step3(model_type=step3_mode)
|
18 |
self.step4 = Step4()
|
19 |
-
self.step_processors = [self.step1, self.step2, self.step3, self.step4]
|
20 |
self.log = logging.getLogger(self.__class__.__name__)
|
21 |
self.log.setLevel(logging.INFO)
|
22 |
|
23 |
|
24 |
def run(self, video_input, output_dir, mode='s4', postp_mode='rep', prompt='', negative_prompt='', duration=10, seed=42):
|
25 |
-
step0_resp = self.
|
26 |
step0_resp_list = re.findall(r'(Step\d:.*?)(?=Step\d:|$)', step0_resp, re.DOTALL)
|
27 |
step_infos = [step_info.strip().split("\n")[0] for step_info in step0_resp_list]
|
28 |
step3_temp_dir = os.path.join(output_dir, "remove_vo")
|
@@ -36,7 +37,7 @@ class Pipeline:
|
|
36 |
step_results["step1_video_path"] = step1_video_path
|
37 |
|
38 |
elif step_info == 'Step2: Given a video and its generated audio, determine whether the audio contains voice-over.':
|
39 |
-
is_vo = self.
|
40 |
step_results["is_vo"] = is_vo
|
41 |
if not step_results["is_vo"]: # not voice-over
|
42 |
step_results["temp_final_audio_path"] = step_results["step1_audio_path"]
|
|
|
1 |
# coding=utf-8
|
2 |
|
3 |
+
# from .step0 import Step0
|
4 |
from .step1 import Step1
|
5 |
+
# from .step2 import Step2
|
6 |
from .step3 import Step3
|
7 |
from .step4 import Step4
|
8 |
+
from .step02 import Step02
|
9 |
import logging
|
10 |
import re
|
11 |
import os
|
12 |
|
13 |
class Pipeline:
|
14 |
def __init__(self, step0_model_dir, step1_mode, step2_model_dir, step2_mode, step3_mode):
|
15 |
+
# self.step0 = Step0(step0_model_dir)
|
16 |
+
self.step02 = Step02(step0_model_dir, step2_mode)
|
17 |
self.step1 = Step1(step1_mode)
|
18 |
+
# self.step2 = Step2(step2_model_dir, step2_mode)
|
19 |
self.step3 = Step3(model_type=step3_mode)
|
20 |
self.step4 = Step4()
|
|
|
21 |
self.log = logging.getLogger(self.__class__.__name__)
|
22 |
self.log.setLevel(logging.INFO)
|
23 |
|
24 |
|
25 |
def run(self, video_input, output_dir, mode='s4', postp_mode='rep', prompt='', negative_prompt='', duration=10, seed=42):
|
26 |
+
step0_resp = self.step02.run_step0(video_input)
|
27 |
step0_resp_list = re.findall(r'(Step\d:.*?)(?=Step\d:|$)', step0_resp, re.DOTALL)
|
28 |
step_infos = [step_info.strip().split("\n")[0] for step_info in step0_resp_list]
|
29 |
step3_temp_dir = os.path.join(output_dir, "remove_vo")
|
|
|
37 |
step_results["step1_video_path"] = step1_video_path
|
38 |
|
39 |
elif step_info == 'Step2: Given a video and its generated audio, determine whether the audio contains voice-over.':
|
40 |
+
is_vo = self.step02.run_step2(str(step_results["step1_video_path"]))
|
41 |
step_results["is_vo"] = is_vo
|
42 |
if not step_results["is_vo"]: # not voice-over
|
43 |
step_results["temp_final_audio_path"] = step_results["step1_audio_path"]
|
pipeline/step02.py
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# coding=utf-8
|
2 |
+
# Step0 (step-by-step plan) and Step2 (voice-over judgement) on one shared VideoLLaMA2 model
|
3 |
+
|
4 |
+
from third_party.VideoLLaMA2.videollama2 import model_init, mm_infer
|
5 |
+
import logging
|
6 |
+
|
7 |
+
|
8 |
+
class Step02:
    """Run pipeline Step0 and Step2 on a single shared VideoLLaMA2 model.

    ``run_step0`` asks the model for a step-by-step audio generation plan for
    a video; ``run_step2`` asks whether the audio of a video contains
    voice-over. Both go through the same ``model_init`` result, so any
    per-call change to the model must be undone before the call returns.
    """

    def __init__(self, model_path, step2_mode):
        """Load the model, processor and tokenizer from ``model_path``.

        Args:
            model_path: Passed unchanged to ``model_init``.
            step2_mode: Answer format of Step2; ``"cot"`` means the answer
                follows a ``<CONCLUSION>`` tag in the model output.
        """
        self.modal = "video"
        self.log = logging.getLogger(self.__class__.__name__)
        self.log.setLevel(logging.INFO)
        self.model, self.processor, self.tokenizer = model_init(model_path)
        self.preprocess = self.processor[self.modal]
        self.step2_mode = step2_mode

    @staticmethod
    def _tower_to_disable(modal_type):
        """Return the encoder attribute to switch off for ``modal_type``.

        Raises:
            NotImplementedError: If ``modal_type`` is not "a", "v" or "av".
        """
        if modal_type == "a":
            return "vision_tower"
        if modal_type == "v":
            return "audio_tower"
        if modal_type == "av":
            return None
        raise NotImplementedError(f"Unsupported modal_type: {modal_type!r}")

    @staticmethod
    def _is_voice_over_answer(output, step2_mode):
        """Return True when the model answer selects option "A" (voice-over).

        In "cot" mode the answer is the first non-whitespace character after
        the last ``<CONCLUSION>`` tag. A missing or empty conclusion yields
        False instead of raising IndexError.
        """
        if step2_mode == "cot":
            output = output.split("<CONCLUSION>")[-1].strip()[:1]
        return output == "A"

    def _infer(self, media_path, question, disabled_tower, va):
        """Preprocess ``media_path`` and run one deterministic inference.

        ``disabled_tower`` names an attribute of ``self.model.model`` that is
        set to None only for the duration of this call. It is restored
        afterwards because the model is shared between Step0 and Step2, and
        Step2 in "av" mode still needs the tower that Step0 in "v" mode
        switches off.
        """
        inner_model = self.model.model
        saved_tower = None
        if disabled_tower is not None:
            saved_tower = getattr(inner_model, disabled_tower, None)
            setattr(inner_model, disabled_tower, None)
        try:
            audio_video_tensor = self.preprocess(media_path, va=va)
            return mm_infer(
                audio_video_tensor,
                question,
                model=self.model,
                tokenizer=self.tokenizer,
                modal=self.modal,
                do_sample=False,
            )
        finally:
            if disabled_tower is not None:
                setattr(inner_model, disabled_tower, saved_tower)

    def run_step0(self, video_path, modal_type='v'):
        """Ask the model for a step-by-step audio generation plan.

        Args:
            video_path: Video handed to the preprocessor (``va=False``).
            modal_type: "a", "v" or "av"; selects which encoder tower is
                disabled while this call runs.

        Returns:
            The raw output of ``mm_infer``.

        Raises:
            NotImplementedError: If ``modal_type`` is not supported.
        """
        question = "Generate high-quality audio from video step-by-step."
        # Validate before logging so an invalid modal_type fails immediately.
        disabled_tower = self._tower_to_disable(modal_type)

        self.log.info("######################################################################################################")
        self.log.info("Generate high-quality audio from video step-by-step...")
        return self._infer(video_path, question, disabled_tower, va=False)

    def run_step2(self, video_audio_path, modal_type='av'):
        """Decide whether the audio of ``video_audio_path`` has voice-over.

        Args:
            video_audio_path: Video with audio handed to the preprocessor
                (``va=True``).
            modal_type: "a", "v" or "av"; selects which encoder tower is
                disabled while this call runs.

        Returns:
            True if the model chose option "A" (voice-over), else False.

        Raises:
            NotImplementedError: If ``modal_type`` is not supported.
        """
        question = "Given a video and its corresponding audio, determine whether the audio contains voice-over? Options: A. Yes, B. No. Choose A or B."
        disabled_tower = self._tower_to_disable(modal_type)
        raw_output = self._infer(video_audio_path, question, disabled_tower, va=True)

        self.log.debug("Step2 raw model output: %r", raw_output)
        output = self._is_voice_over_answer(raw_output, self.step2_mode)

        if output:
            self.log.info("The video generated by Step1 (%s) contains voice-over.", video_audio_path)
        else:
            self.log.info("The video generated by Step1 (%s) does not contain voice-over.", video_audio_path)
        self.log.info("Finish Step2 successfully.\n")
        return output
|