Lagyamfi committed on
Commit fb7e4f3 · 1 Parent(s): ccfcfff

clean up frontend

Files changed (3)
  1. .gitignore +3 -0
  2. app.py +40 -32
  3. pipeline.py +9 -32
.gitignore ADDED
@@ -0,0 +1,3 @@
+*.aac
+*.wav
+*.pyc
app.py CHANGED
@@ -15,6 +15,9 @@ from pipeline import translation_hdr, translation_url, LANG
 async def process_video_translation(
     input_video, speaker, progress=gr.Progress(track_tqdm=True)
 ):
+    if input_video is None:
+        gr.Info("Please upload a video file", duration=2)
+        return
     total_stages = 6
     output_video = f"{input_video.split('.')[0]}_translated.mp4"
     with tqdm(total=total_stages, desc="Processing video translation") as pbar:
@@ -24,8 +27,8 @@ async def process_video_translation(
 
         # transcribe audio
         pbar.set_description("Transcribing audio")
-        pbar.update(1)
         sentences = transcribe_and_preprocess_audio(output_audio_path)
+        pbar.update(1)
 
         # translate to twi
         pbar.set_description("Translating to Twi")
@@ -62,8 +65,12 @@ async def process_video_translation(
     return output_video
 
 
+app_theme = gr.themes.Ocean(
+    text_size="lg",
+    spacing_size="lg",
+)
 with gr.Blocks(
-    theme=gr.themes.Soft(),
+    theme=app_theme,
     title="Video Dubbing Interface",
 ) as demo:
     with gr.Row(variant="default"):
@@ -74,62 +81,63 @@ with gr.Blocks(
         gr.Image(
             "logo_2.jpeg",
             show_label=False,
-            width=150,
-            height=150,
+            height=200,
             show_download_button=False,
             show_fullscreen_button=False,
             container=False,
+            show_share_button=False,
         )
     with gr.Column(
-        scale=2,
+        scale=6,
+        variant="default",
     ):
-        gr.Markdown("# Video Dubbing Interface", height=100)
+        gr.HTML(
+            """
+            <h1 style="font-size: 4em; font-weight: bold; margin-top: 0.5em; margin-left:3em">
+            Video Dubbing Interface
+            </h1>
+
+            """,
+        )
     with gr.Column(
         scale=1,
         min_width=0,
     ):
         gr.Image(
-            "NLPGhana_logo.png",
+            "NLPGhana_logo_2.png",
             show_label=False,
-            width=50,
-            height=150,
+            height=200,
             show_download_button=False,
             show_fullscreen_button=False,
             container=False,
+            show_share_button=False,
         )
+    gr.HTML("<hr style='margin-top: 0.5em;'>")
 
-    # main interface components
-    with gr.Row():
-        input_video = gr.Video(label="Input Video", sources=["upload"])
-        input_speaker = gr.Radio(
-            label="Select Speaker",
-            choices=["male", "female"],
-            value="female",
-            min_width=50,
-            container=True,
-        )
-        output_video = gr.Video(label="Processed Video")
+    gr.HTML("<div style='height: 20px;'></div>")
 
+    # main interface components
     with gr.Row():
-
-        # process video translation
-        submit = gr.Button("Process Video", scale=1)
+        with gr.Column():
+            input_video = gr.Video(label="Input Video", sources=["upload"], height=400)
+            input_speaker = gr.Radio(
+                label="Select Speaker",
+                choices=["male", "female"],
+                value="female",
+                min_width=50,
+                container=True,
+                show_label=True,
+            )
+            submit = gr.Button("Process Video", scale=1)
+        output_video = gr.Video(label="Processed Video", height=400)
     submit.click(
         process_video_translation,
        inputs=[input_video, input_speaker],
        outputs=output_video,
    )
 
+    gr.HTML("<div style='height: 10px;'></div>")
 
-# # Define the Gradio interface
-# interface = gr.Interface(
-#     fn=process_video_translation,  # Function to process the video
-#     inputs=gr.Video(label="Input Video"),  # Video file input
-#     outputs=gr.Video(label="Processed Video"),  # Video file output
-#     title="Video Processing Interface",
-#     description="Upload a video, and the processed video will be returned.",
-#     theme="light",
-# )
 
 # Launch the interface
 demo.launch(debug=True)
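
The pieces this commit touches compose as in the minimal stand-alone sketch below: the gr.Info toast guards against a missing upload, the shared Ocean theme is passed to gr.Blocks, and the inputs plus button stack in one gr.Column while the output sits beside them in the same gr.Row, which is what produces the new two-panel layout. This is an illustrative sketch, not the repo's code; the passthrough handler is a hypothetical stand-in for process_video_translation.

import gradio as gr

def passthrough(input_video, speaker):
    # Mirrors the guard clause added in this commit: surface a toast
    # via gr.Info and return early instead of crashing on a None path.
    if input_video is None:
        gr.Info("Please upload a video file", duration=2)
        return None
    return input_video

app_theme = gr.themes.Ocean(text_size="lg", spacing_size="lg")

with gr.Blocks(theme=app_theme, title="Video Dubbing Interface") as demo:
    with gr.Row():
        with gr.Column():
            input_video = gr.Video(label="Input Video", sources=["upload"], height=400)
            input_speaker = gr.Radio(
                label="Select Speaker", choices=["male", "female"], value="female"
            )
            submit = gr.Button("Process Video")
        output_video = gr.Video(label="Processed Video", height=400)
    submit.click(passthrough, inputs=[input_video, input_speaker], outputs=output_video)

demo.launch(debug=True)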
pipeline.py CHANGED
@@ -1,8 +1,3 @@
-# %%
-
-# %load_ext autoreload
-# %autoreload 2
-
 from transformers import pipeline
 import re
 from num2words import num2words
@@ -15,6 +10,7 @@ import os
 from dotenv import load_dotenv
 import requests
 import ffmpeg
+import torch
 
 
 # load khaya token from environment
@@ -34,6 +30,9 @@ translation_hdr = {
 
 LANG = "tw"
 
+# Check if GPU is available
+pipe_device = 0 if torch.cuda.is_available() else -1
+
 
 def replace_numbers_with_words(text):
     def replace(match):
@@ -119,9 +118,6 @@ async def tts_main(khaya_translations, speaker, list_of_output_chunks):
         await f
 
 
-# %%
-
-# filename = "CoolVision-Uzbekistan.mov"
 output_path = "/Users/lawrenceadu-gyamfi/Documents/PERSONAL/GHANANLP/PROJECTS/SAINT/Examples/test_pipeline"
 input_video = "test_input_video.mov"
 input_audio = "input_audio.aac"
@@ -130,9 +126,6 @@ output_video = "test_output_video.mp4"
 filename_with_path = f"{output_path}/{input_video}"
 
 
-# %%
-# only need to run this once
-# !ffmpeg -i {output_path}/{input_video} -vn -acodec copy {output_path}/{input_audio} -y
 def extract_audio_from_video(input_video):
     if input_video:
         output_audio_path = f"separated_audio.aac"
@@ -149,11 +142,11 @@ def extract_audio_from_video(input_video):
         raise e
 
 
-# %%
-# ASR pipeline
 def transcribe_and_preprocess_audio(input_audio):
     asr = pipeline(
-        "automatic-speech-recognition", model="openai/whisper-large-v3", device=0
+        "automatic-speech-recognition",
+        model="openai/whisper-large-v3",
+        device=pipe_device,
     )
     pipeline_whisper_output = asr(
         f"{input_audio}",
@@ -169,8 +162,6 @@ def transcribe_and_preprocess_audio(input_audio):
     return sentences
 
 
-# %%
-# combine the audio files
 def combine_audio_streams(list_of_output_chunks, output_audio):
     input_streams = [ffmpeg.input(chunk) for chunk in list_of_output_chunks]
     concatenated = ffmpeg.concat(*input_streams, v=0, a=1).output(f"{output_audio}")
@@ -182,12 +173,10 @@ def combine_audio_streams(list_of_output_chunks, output_audio):
         print(e.stderr.decode())
 
 
-# %%
-# combine the audio and video
 def create_combined_output(input_video, output_audio, output_video):
     try:
         video = ffmpeg.input(f"{input_video}")
-        audio = ffmpeg.input(f"{output_audio}")  # .filter_('atempo', 1.09580838323)
+        audio = ffmpeg.input(f"{output_audio}")
         (
             ffmpeg.output(
                 video["v"],
@@ -200,9 +189,7 @@ def create_combined_output(input_video, output_audio, output_video):
         return output_video
     except ffmpeg.Error as e:
         print(e.stderr.decode())
-
-
-# %%
+        raise e
 
 
 async def process_video_translation(input_video, output_video):
@@ -241,13 +228,3 @@ async def process_video_translation(input_video, output_video):
     print("Video translation completed")
 
     return output_video
-
-
-# %%
-# test_input_video = "../Examples/test_pipeline/test_input_video.mov"
-# test_output_video = "test_output_video.mp4"
-
-
-# await process_video_translation(test_input_video, test_output_video)
-
-# %%
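
The pipe_device fallback follows the standard transformers convention for pipelines: an integer device index, where 0 selects the first CUDA GPU and -1 runs on CPU. A minimal sketch of the same pattern, using openai/whisper-tiny purely as a lightweight stand-in for openai/whisper-large-v3 so it runs on modest hardware:

import torch
from transformers import pipeline

# transformers pipelines accept an integer device index:
# 0 selects the first CUDA GPU, -1 falls back to CPU.
pipe_device = 0 if torch.cuda.is_available() else -1

asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-tiny",  # stand-in for openai/whisper-large-v3
    device=pipe_device,
)

# The pipeline decodes audio file paths (e.g. the extracted .aac) via ffmpeg.
result = asr("separated_audio.aac")
print(result["text"])

The added raise e in create_combined_output is also worth noting: the ffmpeg stderr is still printed, but the exception now propagates, so a failed mux can no longer silently return None to process_video_translation.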