leolxliu committed
Commit 66a8f87 · 1 Parent(s): 8e03bc7
Files changed (1)
  1. app.py +50 -53
app.py CHANGED
@@ -15,7 +15,7 @@ import gradio as gr
 from elevenlabs import clone, generate, get_api_key, set_api_key
 
 
-css="""
+css = """
 #col-container{
     margin: 0 auto;
     max-width: 840px;
@@ -34,8 +34,7 @@ openai.api_version = "2023-05-15"
 openai.log = "debug"
 
 
-
-#*************************#
+# *************************#
 # 1. Resize the video #
 # 2. Extract the audio #
 # 3. Translate the text from audio #
@@ -44,9 +43,8 @@ openai.log = "debug"
 # 6. Wave2lip #
 
 
-
 start = time.perf_counter()
-model = whisper.load_model("base",download_root='./checkpoints')
+model = whisper.load_model("base", download_root='./checkpoints')
 end = time.perf_counter()
 
 print('whisper load model time: ', end - start)
@@ -56,29 +54,27 @@ set_api_key('05a491535c6526e1fc9fc8e195f2fe25')
 print('elevenlab api key', get_api_key())
 
 language_mapping = {
-    'English':'英语',
-    'Spanish':'西班牙语',
-    'French': '法语',
-    'German': '德语',
-    'Italian': '意大利语',
-    'Portuguese': '葡萄牙语',
-    'Polish': '波兰语',
-    'Turkish': '土耳其语',
-    'Russian': '俄语',
-    'Dutch': '荷兰语',
-    'Czech': '捷克语',
-    'Arabic': '阿拉伯语',
-    'Chinese': '中文普通话'
+    'English': '英语',
+    'Spanish': '西班牙语',
+    'French': '法语',
+    'German': '德语',
+    'Italian': '意大利语',
+    'Portuguese': '葡萄牙语',
+    'Polish': '波兰语',
+    'Turkish': '土耳其语',
+    'Russian': '俄语',
+    'Dutch': '荷兰语',
+    'Czech': '捷克语',
+    'Arabic': '阿拉伯语',
+    'Chinese': '中文普通话'
 }
 
 
-
 def resize_video(video_source):
 
     return video_source
 
 
-
 def extract_audio(video_source, output_dir='./'):
 
     output_audio = os.path.join(output_dir, 'output_orignal_audio.wav')
@@ -91,60 +87,58 @@ def extract_audio(video_source, output_dir='./'):
     print('ffmpeg command: ', ff.cmd)
     ff.run()
 
-    return output_audio
-
+    return output_audio
 
 
 def clone_audio(audio_file, audio_text):
 
     voice = clone(
         name=uuid.uuid4().hex,
-        description="", # Optional
+        description="",  # Optional
         files=[audio_file])
-
+
     print('voice: ', voice)
-    audio = generate(text=audio_text, voice=voice, model='eleven_multilingual_v2')
-
+    audio = generate(text=audio_text, voice=voice,
+                     model='eleven_multilingual_v2')
+
     return audio
 
 
-# todo
+# todo
 def translate_text(text, target_language):
 
     target_language_name = language_mapping[target_language]
 
     chat_completion = openai.ChatCompletion.create(
-        engine="gpt-4",
-        temperature=0.1,
-        max_tokens=2048,
-        messages=[
-            {"role":"system", "content": default_prompt.replace('{{target_lang}}', target_language_name)},
-            {"role": "user", "content": text}])
-
+        engine="gpt-4",
+        temperature=0.1,
+        max_tokens=2048,
+        messages=[
+            {"role": "system", "content": default_prompt.replace(
+                '{{target_lang}}', target_language_name)},
+            {"role": "user", "content": text}])
+
     # print the completion
     print(chat_completion.choices[0].message.content)
 
-
     translated_text = chat_completion.choices[0].message.content
 
     return translated_text
 
 
-
 def infer(video_source, target_language):
 
     print('video_source: ', video_source)
 
-    # check the video format
-
+    # check the video format
+
     # Create a temporary directory to store the output file
     output_dir = tempfile.mkdtemp()
     output_video_file = os.path.join(output_dir, 'output_video.mp4')
     print("Output file: ", output_video_file)
 
     output_audio = extract_audio(video_source, output_dir=output_dir)
-
-
+
     result = model.transcribe(output_audio)
     whisper_text = result["text"]
     whisper_language = result['language']
@@ -154,12 +148,12 @@ def infer(video_source, target_language):
     target_language_code = language_mapping[target_language]
 
     print("Target language code: ", target_language_code)
-
-    translated_text = translate_text(whisper_text, target_language)
+
+    translated_text = translate_text(whisper_text, target_language)
 
     print("Translated text: ", translated_text)
 
-    # 声音 clone && 合成
+    # 声音 clone && 合成
     audio = clone_audio(output_audio, translated_text)
 
     audio_file = os.path.join(output_dir, 'output_clone_audio.wav')
@@ -174,7 +168,7 @@ def infer(video_source, target_language):
     subprocess.run(wav2lip, shell=True, stdout=subprocess.PIPE)
 
     print("Video conversion successful.")
-
+
     return output_video_file
 
 
@@ -191,8 +185,10 @@ with gr.Blocks(css=css) as demo:
 
     with gr.Row():
        with gr.Column():
-            video_source = gr.Video(label="Source Video", show_label=True,interactive=True)
-            target_language = gr.Dropdown(choices=["English", "Spanish", "French", "German", "Italian", "Portuguese", "Polish", "Turkish", "Russian", "Dutch", "Czech", "Arabic", "Chinese"], label="Target language", info="Target language!",value="English")
+            video_source = gr.Video(
+                label="Source Video", show_label=True, interactive=True)
+            target_language = gr.Dropdown(choices=["English", "Spanish", "French", "German", "Italian", "Portuguese", "Polish", "Turkish",
+                                          "Russian", "Dutch", "Czech", "Arabic", "Chinese"], label="Target language", info="Target language!", value="English")
 
            submit_btn = gr.Button(value="Submit")
 
@@ -201,10 +197,11 @@
 
    with gr.Row():
        gr.Examples(
-            label="Video Examples",
-            examples=['dictator.mp4'],
-            inputs=[video_source]
-        )
-    submit_btn.click(infer, inputs=[video_source,target_language], outputs=result)
-
-demo.launch()
+            label="Video Examples",
+            examples=['dictator.mp4'],
+            inputs=[video_source]
+        )
+    submit_btn.click(
+        infer, inputs=[video_source, target_language], outputs=result)
+
+demo.queue(5).launch()