clean up frontend
- .gitignore +3 -0
- app.py +40 -32
- pipeline.py +9 -32
.gitignore
ADDED
@@ -0,0 +1,3 @@
+*.aac
+*.wav
+*.pyc
app.py
CHANGED
@@ -15,6 +15,9 @@ from pipeline import translation_hdr, translation_url, LANG
 async def process_video_translation(
     input_video, speaker, progress=gr.Progress(track_tqdm=True)
 ):
+    if input_video is None:
+        gr.Info("Please upload a video file", duration=2)
+        return
     total_stages = 6
     output_video = f"{input_video.split('.')[0]}_translated.mp4"
     with tqdm(total=total_stages, desc="Processing video translation") as pbar:
@@ -24,8 +27,8 @@ async def process_video_translation(
 
         # transcribe audio
         pbar.set_description("Transcribing audio")
-        pbar.update(1)
         sentences = transcribe_and_preprocess_audio(output_audio_path)
+        pbar.update(1)
 
         # translate to twi
         pbar.set_description("Translating to Twi")
@@ -62,8 +65,12 @@ async def process_video_translation(
     return output_video
 
 
+app_theme = gr.themes.Ocean(
+    text_size="lg",
+    spacing_size="lg",
+)
 with gr.Blocks(
-    theme=
+    theme=app_theme,
     title="Video Dubbing Interface",
 ) as demo:
     with gr.Row(variant="default"):
@@ -74,62 +81,63 @@ with gr.Blocks(
             gr.Image(
                 "logo_2.jpeg",
                 show_label=False,
-
-                height=150,
+                height=200,
                 show_download_button=False,
                 show_fullscreen_button=False,
                 container=False,
+                show_share_button=False,
             )
         with gr.Column(
-            scale=
+            scale=6,
+            variant="default",
         ):
-            gr.
+            gr.HTML(
+                """
+                <h1 style="font-size: 4em; font-weight: bold; margin-top: 0.5em; margin-left:3em">
+                    Video Dubbing Interface
+                </h1>
+
+                """,
+            )
         with gr.Column(
             scale=1,
             min_width=0,
         ):
             gr.Image(
-                "
+                "NLPGhana_logo_2.png",
                 show_label=False,
-
-                height=150,
+                height=200,
                 show_download_button=False,
                 show_fullscreen_button=False,
                 container=False,
+                show_share_button=False,
             )
+    gr.HTML("<hr style='margin-top: 0.5em;'>")
 
-
-    with gr.Row():
-        input_video = gr.Video(label="Input Video", sources=["upload"])
-        input_speaker = gr.Radio(
-            label="Select Speaker",
-            choices=["male", "female"],
-            value="female",
-            min_width=50,
-            container=True,
-        )
-        output_video = gr.Video(label="Processed Video")
+    gr.HTML("<div style='height: 20px;'></div>")
 
+    # main interface components
     with gr.Row():
-
-
-
+        with gr.Column():
+            input_video = gr.Video(label="Input Video", sources=["upload"], height=400)
+            input_speaker = gr.Radio(
+                label="Select Speaker",
+                choices=["male", "female"],
+                value="female",
+                min_width=50,
+                container=True,
+                show_label=True,
+            )
+            submit = gr.Button("Process Video", scale=1)
+        output_video = gr.Video(label="Processed Video", height=400)
     submit.click(
         process_video_translation,
         inputs=[input_video, input_speaker],
        outputs=output_video,
     )
 
-    # # Define the Gradio interface
-    # interface = gr.Interface(
-    #     fn=process_video_translation,  # Function to process the video
-    #     inputs=gr.Video(label="Input Video"),  # Video file input
-    #     outputs=gr.Video(label="Processed Video"),  # Video file output
-    #     title="Video Processing Interface",
-    #     description="Upload a video, and the processed video will be returned.",
-    #     theme="light",
-    # )
+    gr.HTML("<div style='height: 10px;'></div>")
 
 # Launch the interface
 demo.launch(debug=True)
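Taken together, the app.py changes replace the commented-out gr.Interface draft with a themed gr.Blocks layout: an Ocean theme, a header row with two logos and an HTML title, an upload guard via gr.Info, and a two-column input/output area wired to process_video_translation. Below is a minimal, self-contained sketch of that layout pattern, not the Space's own file: it assumes a recent Gradio release where gr.themes.Ocean and gr.Info(duration=...) are available, and dummy_translate plus the logo filenames are placeholders for illustration.

# Minimal sketch of the Blocks layout this commit moves to (assumes recent Gradio).
# `dummy_translate` and the logo filenames are illustrative placeholders.
import gradio as gr


def dummy_translate(input_video, speaker):
    # Same guard the commit adds: warn and bail out when nothing was uploaded.
    if input_video is None:
        gr.Info("Please upload a video file", duration=2)
        return None
    return input_video  # a real handler would return the dubbed video path


app_theme = gr.themes.Ocean(text_size="lg", spacing_size="lg")

with gr.Blocks(theme=app_theme, title="Video Dubbing Interface") as demo:
    # Header: logo | title | logo
    with gr.Row(variant="default"):
        with gr.Column(scale=1, min_width=0):
            gr.Image("logo_2.jpeg", show_label=False, height=200, container=False,
                     show_download_button=False, show_fullscreen_button=False,
                     show_share_button=False)
        with gr.Column(scale=6):
            gr.HTML("<h1 style='text-align: center;'>Video Dubbing Interface</h1>")
        with gr.Column(scale=1, min_width=0):
            gr.Image("NLPGhana_logo_2.png", show_label=False, height=200, container=False,
                     show_download_button=False, show_fullscreen_button=False,
                     show_share_button=False)
    gr.HTML("<hr style='margin-top: 0.5em;'>")

    # Main area: upload + speaker choice on the left, dubbed video on the right.
    with gr.Row():
        with gr.Column():
            input_video = gr.Video(label="Input Video", sources=["upload"], height=400)
            input_speaker = gr.Radio(label="Select Speaker",
                                     choices=["male", "female"], value="female")
            submit = gr.Button("Process Video")
        output_video = gr.Video(label="Processed Video", height=400)

    submit.click(dummy_translate, inputs=[input_video, input_speaker], outputs=output_video)

if __name__ == "__main__":
    demo.launch(debug=True)

Keeping the guard inside the click handler, rather than disabling the button, mirrors how the commit surfaces the missing-upload case inside process_video_translation itself.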
pipeline.py
CHANGED
@@ -1,8 +1,3 @@
-# %%
-
-# %load_ext autoreload
-# %autoreload 2
-
 from transformers import pipeline
 import re
 from num2words import num2words
@@ -15,6 +10,7 @@ import os
 from dotenv import load_dotenv
 import requests
 import ffmpeg
+import torch
 
 
 # load khaya token from environment
@@ -34,6 +30,9 @@ translation_hdr = {
 
 LANG = "tw"
 
+# Check if GPU is available
+pipe_device = 0 if torch.cuda.is_available() else -1
+
 
 def replace_numbers_with_words(text):
     def replace(match):
@@ -119,9 +118,6 @@ async def tts_main(khaya_translations, speaker, list_of_output_chunks):
         await f
 
 
-# %%
-
-# filename = "CoolVision-Uzbekistan.mov"
 output_path = "/Users/lawrenceadu-gyamfi/Documents/PERSONAL/GHANANLP/PROJECTS/SAINT/Examples/test_pipeline"
 input_video = "test_input_video.mov"
 input_audio = "input_audio.aac"
@@ -130,9 +126,6 @@ output_video = "test_output_video.mp4"
 filename_with_path = f"{output_path}/{input_video}"
 
 
-# %%
-# only need to run this once
-# !ffmpeg -i {output_path}/{input_video} -vn -acodec copy {output_path}/{input_audio} -y
 def extract_audio_from_video(input_video):
     if input_video:
         output_audio_path = f"separated_audio.aac"
@@ -149,11 +142,11 @@ def extract_audio_from_video(input_video):
         raise e
 
 
-# %%
-# ASR pipeline
 def transcribe_and_preprocess_audio(input_audio):
     asr = pipeline(
-        "automatic-speech-recognition",
+        "automatic-speech-recognition",
+        model="openai/whisper-large-v3",
+        device=pipe_device,
     )
     pipeline_whisper_output = asr(
         f"{input_audio}",
@@ -169,8 +162,6 @@ def transcribe_and_preprocess_audio(input_audio):
     return sentences
 
 
-# %%
-# combine the audio files
 def combine_audio_streams(list_of_output_chunks, output_audio):
     input_streams = [ffmpeg.input(chunk) for chunk in list_of_output_chunks]
     concatenated = ffmpeg.concat(*input_streams, v=0, a=1).output(f"{output_audio}")
@@ -182,12 +173,10 @@ def combine_audio_streams(list_of_output_chunks, output_audio):
         print(e.stderr.decode())
 
 
-# %%
-# combine the audio and video
 def create_combined_output(input_video, output_audio, output_video):
     try:
         video = ffmpeg.input(f"{input_video}")
-        audio = ffmpeg.input(f"{output_audio}")
+        audio = ffmpeg.input(f"{output_audio}")
         (
             ffmpeg.output(
                 video["v"],
@@ -200,9 +189,7 @@ def create_combined_output(input_video, output_audio, output_video):
         return output_video
     except ffmpeg.Error as e:
         print(e.stderr.decode())
-
-
-# %%
+        raise e
 
 
 async def process_video_translation(input_video, output_video):
@@ -241,13 +228,3 @@ async def process_video_translation(input_video, output_video):
     print("Video translation completed")
 
     return output_video
-
-
-# %%
-# test_input_video = "../Examples/test_pipeline/test_input_video.mov"
-# test_output_video = "test_output_video.mp4"
-
-
-# await process_video_translation(test_input_video, test_output_video)
-
-# %%
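On the pipeline.py side, the commit mostly strips leftover notebook cells (the # %% markers and commented-out experiments) and makes two functional changes: the Whisper ASR pipeline is now placed on a GPU when one is available, and create_combined_output re-raises ffmpeg errors instead of only printing them. The sketch below illustrates both patterns under stated assumptions, not the Space's code: the helper names, return_timestamps=True, vcodec="copy", and capture_stderr=True are additions for the example and do not appear in the diff.

# Sketch of the device-selection and error-propagation patterns added here.
# Helper names and the extra kwargs (return_timestamps, vcodec, capture_stderr)
# are illustrative assumptions, not taken from the diff.
import ffmpeg
import torch
from transformers import pipeline

# device=0 puts the pipeline on the first CUDA GPU; device=-1 keeps it on CPU.
pipe_device = 0 if torch.cuda.is_available() else -1


def transcribe(input_audio: str):
    asr = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-large-v3",
        device=pipe_device,
    )
    # return_timestamps=True lets Whisper chunk long audio into timestamped segments.
    return asr(input_audio, return_timestamps=True)


def mux_audio_into_video(input_video: str, dubbed_audio: str, output_video: str) -> str:
    """Swap the video's audio track for the dubbed one; surface failures to the caller."""
    try:
        video = ffmpeg.input(input_video)
        audio = ffmpeg.input(dubbed_audio)
        ffmpeg.output(
            video["v"],
            audio["a"],
            output_video,
            vcodec="copy",  # keep the original video stream untouched
        ).run(overwrite_output=True, capture_stderr=True)
        return output_video
    except ffmpeg.Error as e:
        print(e.stderr.decode())
        raise  # re-raise so the Gradio handler sees the failure instead of a silent None

Re-raising matters in a Gradio app: a swallowed ffmpeg error would otherwise show up only as a missing output video with no message to the user.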