clean up frontend
- .gitignore +3 -0
- app.py +40 -32
- pipeline.py +9 -32
.gitignore
ADDED
@@ -0,0 +1,3 @@
+*.aac
+*.wav
+*.pyc
app.py
CHANGED
@@ -15,6 +15,9 @@ from pipeline import translation_hdr, translation_url, LANG
 async def process_video_translation(
     input_video, speaker, progress=gr.Progress(track_tqdm=True)
 ):
+    if input_video is None:
+        gr.Info("Please upload a video file", duration=2)
+        return
     total_stages = 6
     output_video = f"{input_video.split('.')[0]}_translated.mp4"
     with tqdm(total=total_stages, desc="Processing video translation") as pbar:
@@ -24,8 +27,8 @@ async def process_video_translation(
 
         # transcribe audio
         pbar.set_description("Transcribing audio")
-        pbar.update(1)
         sentences = transcribe_and_preprocess_audio(output_audio_path)
+        pbar.update(1)
 
         # translate to twi
         pbar.set_description("Translating to Twi")
@@ -62,8 +65,12 @@ async def process_video_translation(
     return output_video
 
 
+app_theme = gr.themes.Ocean(
+    text_size="lg",
+    spacing_size="lg",
+)
 with gr.Blocks(
-    theme=
+    theme=app_theme,
     title="Video Dubbing Interface",
 ) as demo:
     with gr.Row(variant="default"):
@@ -74,62 +81,63 @@ with gr.Blocks(
             gr.Image(
                 "logo_2.jpeg",
                 show_label=False,
-
-                height=150,
+                height=200,
                 show_download_button=False,
                 show_fullscreen_button=False,
                 container=False,
+                show_share_button=False,
             )
         with gr.Column(
-            scale=
+            scale=6,
+            variant="default",
         ):
-            gr.
+            gr.HTML(
+                """
+                <h1 style="font-size: 4em; font-weight: bold; margin-top: 0.5em; margin-left:3em">
+                    Video Dubbing Interface
+                </h1>
+
+                """,
+            )
         with gr.Column(
             scale=1,
             min_width=0,
         ):
             gr.Image(
-                "
+                "NLPGhana_logo_2.png",
                 show_label=False,
-
-                height=150,
+                height=200,
                 show_download_button=False,
                 show_fullscreen_button=False,
                 container=False,
+                show_share_button=False,
             )
+    gr.HTML("<hr style='margin-top: 0.5em;'>")
 
-
-    with gr.Row():
-        input_video = gr.Video(label="Input Video", sources=["upload"])
-        input_speaker = gr.Radio(
-            label="Select Speaker",
-            choices=["male", "female"],
-            value="female",
-            min_width=50,
-            container=True,
-        )
-        output_video = gr.Video(label="Processed Video")
+    gr.HTML("<div style='height: 20px;'></div>")
 
+    # main interface components
     with gr.Row():
-
-
-
+        with gr.Column():
+            input_video = gr.Video(label="Input Video", sources=["upload"], height=400)
+            input_speaker = gr.Radio(
+                label="Select Speaker",
+                choices=["male", "female"],
+                value="female",
+                min_width=50,
+                container=True,
+                show_label=True,
+            )
+            submit = gr.Button("Process Video", scale=1)
+        output_video = gr.Video(label="Processed Video", height=400)
     submit.click(
         process_video_translation,
         inputs=[input_video, input_speaker],
        outputs=output_video,
     )
 
-    # # Define the Gradio interface
-    # interface = gr.Interface(
-    #     fn=process_video_translation,  # Function to process the video
-    #     inputs=gr.Video(label="Input Video"),  # Video file input
-    #     outputs=gr.Video(label="Processed Video"),  # Video file output
-    #     title="Video Processing Interface",
-    #     description="Upload a video, and the processed video will be returned.",
-    #     theme="light",
-    # )
+    gr.HTML("<div style='height: 10px;'></div>")
 
 # Launch the interface
 demo.launch(debug=True)
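Taken together, the app.py changes replace the commented-out gr.Interface draft with a themed gr.Blocks layout: an Ocean theme, a header row with two logos and an HTML title, an upload guard via gr.Info, and a two-column input/output area wired to process_video_translation. Below is a minimal, self-contained sketch of that layout pattern, not the Space's own file: it assumes a recent Gradio release where gr.themes.Ocean and gr.Info(duration=...) are available, and dummy_translate plus the logo filenames are placeholders for illustration.

# Minimal sketch of the Blocks layout this commit moves to (assumes recent Gradio).
# `dummy_translate` and the logo filenames are illustrative placeholders.
import gradio as gr


def dummy_translate(input_video, speaker):
    # Same guard the commit adds: warn and bail out when nothing was uploaded.
    if input_video is None:
        gr.Info("Please upload a video file", duration=2)
        return None
    return input_video  # a real handler would return the dubbed video path


app_theme = gr.themes.Ocean(text_size="lg", spacing_size="lg")

with gr.Blocks(theme=app_theme, title="Video Dubbing Interface") as demo:
    # Header: logo | title | logo
    with gr.Row(variant="default"):
        with gr.Column(scale=1, min_width=0):
            gr.Image("logo_2.jpeg", show_label=False, height=200, container=False,
                     show_download_button=False, show_fullscreen_button=False,
                     show_share_button=False)
        with gr.Column(scale=6):
            gr.HTML("<h1 style='text-align: center;'>Video Dubbing Interface</h1>")
        with gr.Column(scale=1, min_width=0):
            gr.Image("NLPGhana_logo_2.png", show_label=False, height=200, container=False,
                     show_download_button=False, show_fullscreen_button=False,
                     show_share_button=False)
    gr.HTML("<hr style='margin-top: 0.5em;'>")

    # Main area: upload + speaker choice on the left, dubbed video on the right.
    with gr.Row():
        with gr.Column():
            input_video = gr.Video(label="Input Video", sources=["upload"], height=400)
            input_speaker = gr.Radio(label="Select Speaker",
                                     choices=["male", "female"], value="female")
            submit = gr.Button("Process Video")
        output_video = gr.Video(label="Processed Video", height=400)

    submit.click(dummy_translate, inputs=[input_video, input_speaker], outputs=output_video)

if __name__ == "__main__":
    demo.launch(debug=True)

Keeping the guard inside the click handler, rather than disabling the button, mirrors how the commit surfaces the missing-upload case inside process_video_translation itself.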
pipeline.py
CHANGED
@@ -1,8 +1,3 @@
-# %%
-
-# %load_ext autoreload
-# %autoreload 2
-
 from transformers import pipeline
 import re
 from num2words import num2words
@@ -15,6 +10,7 @@ import os
 from dotenv import load_dotenv
 import requests
 import ffmpeg
+import torch
 
 
 # load khaya token from environment
@@ -34,6 +30,9 @@ translation_hdr = {
 
 LANG = "tw"
 
+# Check if GPU is available
+pipe_device = 0 if torch.cuda.is_available() else -1
+
 
 def replace_numbers_with_words(text):
     def replace(match):
@@ -119,9 +118,6 @@ async def tts_main(khaya_translations, speaker, list_of_output_chunks):
         await f
 
 
-# %%
-
-# filename = "CoolVision-Uzbekistan.mov"
 output_path = "/Users/lawrenceadu-gyamfi/Documents/PERSONAL/GHANANLP/PROJECTS/SAINT/Examples/test_pipeline"
 input_video = "test_input_video.mov"
 input_audio = "input_audio.aac"
@@ -130,9 +126,6 @@ output_video = "test_output_video.mp4"
 filename_with_path = f"{output_path}/{input_video}"
 
 
-# %%
-# only need to run this once
-# !ffmpeg -i {output_path}/{input_video} -vn -acodec copy {output_path}/{input_audio} -y
 def extract_audio_from_video(input_video):
     if input_video:
         output_audio_path = f"separated_audio.aac"
@@ -149,11 +142,11 @@ def extract_audio_from_video(input_video):
         raise e
 
 
-# %%
-# ASR pipeline
 def transcribe_and_preprocess_audio(input_audio):
     asr = pipeline(
-        "automatic-speech-recognition",
+        "automatic-speech-recognition",
+        model="openai/whisper-large-v3",
+        device=pipe_device,
     )
     pipeline_whisper_output = asr(
         f"{input_audio}",
@@ -169,8 +162,6 @@ def transcribe_and_preprocess_audio(input_audio):
     return sentences
 
 
-# %%
-# combine the audio files
 def combine_audio_streams(list_of_output_chunks, output_audio):
     input_streams = [ffmpeg.input(chunk) for chunk in list_of_output_chunks]
     concatenated = ffmpeg.concat(*input_streams, v=0, a=1).output(f"{output_audio}")
@@ -182,12 +173,10 @@ def combine_audio_streams(list_of_output_chunks, output_audio):
         print(e.stderr.decode())
 
 
-# %%
-# combine the audio and video
 def create_combined_output(input_video, output_audio, output_video):
     try:
         video = ffmpeg.input(f"{input_video}")
-        audio = ffmpeg.input(f"{output_audio}")
+        audio = ffmpeg.input(f"{output_audio}")
         (
             ffmpeg.output(
                 video["v"],
@@ -200,9 +189,7 @@ def create_combined_output(input_video, output_audio, output_video):
         return output_video
     except ffmpeg.Error as e:
         print(e.stderr.decode())
-
-
-# %%
+        raise e
 
 
 async def process_video_translation(input_video, output_video):
@@ -241,13 +228,3 @@ async def process_video_translation(input_video, output_video):
     print("Video translation completed")
 
     return output_video
-
-
-# %%
-# test_input_video = "../Examples/test_pipeline/test_input_video.mov"
-# test_output_video = "test_output_video.mp4"
-
-
-# await process_video_translation(test_input_video, test_output_video)
-
-# %%
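On the pipeline.py side, the commit mostly strips leftover notebook cells (the # %% markers and commented-out experiments) and makes two functional changes: the Whisper ASR pipeline is now placed on a GPU when one is available, and create_combined_output re-raises ffmpeg errors instead of only printing them. The sketch below illustrates both patterns under stated assumptions, not the Space's code: the helper names, return_timestamps=True, vcodec="copy", and capture_stderr=True are additions for the example and do not appear in the diff.

# Sketch of the device-selection and error-propagation patterns added here.
# Helper names and the extra kwargs (return_timestamps, vcodec, capture_stderr)
# are illustrative assumptions, not taken from the diff.
import ffmpeg
import torch
from transformers import pipeline

# device=0 puts the pipeline on the first CUDA GPU; device=-1 keeps it on CPU.
pipe_device = 0 if torch.cuda.is_available() else -1


def transcribe(input_audio: str):
    asr = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-large-v3",
        device=pipe_device,
    )
    # return_timestamps=True lets Whisper chunk long audio into timestamped segments.
    return asr(input_audio, return_timestamps=True)


def mux_audio_into_video(input_video: str, dubbed_audio: str, output_video: str) -> str:
    """Swap the video's audio track for the dubbed one; surface failures to the caller."""
    try:
        video = ffmpeg.input(input_video)
        audio = ffmpeg.input(dubbed_audio)
        ffmpeg.output(
            video["v"],
            audio["a"],
            output_video,
            vcodec="copy",  # keep the original video stream untouched
        ).run(overwrite_output=True, capture_stderr=True)
        return output_video
    except ffmpeg.Error as e:
        print(e.stderr.decode())
        raise  # re-raise so the Gradio handler sees the failure instead of a silent None

Re-raising matters in a Gradio app: a swallowed ffmpeg error would otherwise show up only as a missing output video with no message to the user.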