Spaces:
leolxliu committed · Commit 66a8f87 · 1 Parent(s): 8e03bc7
add queue
app.py
CHANGED
@@ -15,7 +15,7 @@ import gradio as gr
 from elevenlabs import clone, generate, get_api_key, set_api_key
 
 
-css="""
+css = """
 #col-container{
     margin: 0 auto;
     max-width: 840px;
@@ -34,8 +34,7 @@ openai.api_version = "2023-05-15"
 openai.log = "debug"
 
 
-
-#*************************#
+# *************************#
 # 1. Resize the video #
 # 2. Extract the audio #
 # 3. Translate the text from audio #
@@ -44,9 +43,8 @@ openai.log = "debug"
 # 6. Wave2lip #
 
 
-
 start = time.perf_counter()
-model = whisper.load_model("base",download_root='./checkpoints')
+model = whisper.load_model("base", download_root='./checkpoints')
 end = time.perf_counter()
 
 print('whisper load model time: ', end - start)
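whisper.load_model("base", download_root='./checkpoints') loads Whisper's multilingual "base" checkpoint; on a cold start it downloads base.pt into ./checkpoints, so the load time printed above includes the download once and only the disk load on later runs.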
@@ -56,29 +54,27 @@ set_api_key('05a491535c6526e1fc9fc8e195f2fe25')
 print('elevenlab api key', get_api_key())
 
 language_mapping = {
-    'English':'英语',
-    'Spanish':'西班牙语',
-    'French': '法语',
-    'German': '德语',
-    'Italian': '意大利语',
-    'Portuguese': '葡萄牙语',
-    'Polish': '波兰语',
-    'Turkish': '土耳其语',
-    'Russian': '俄语',
-    'Dutch': '荷兰语',
-    'Czech': '捷克语',
-    'Arabic': '阿拉伯语',
-    'Chinese': '中文普通话'
+    'English': '英语',
+    'Spanish': '西班牙语',
+    'French': '法语',
+    'German': '德语',
+    'Italian': '意大利语',
+    'Portuguese': '葡萄牙语',
+    'Polish': '波兰语',
+    'Turkish': '土耳其语',
+    'Russian': '俄语',
+    'Dutch': '荷兰语',
+    'Czech': '捷克语',
+    'Arabic': '阿拉伯语',
+    'Chinese': '中文普通话'
 }
 
 
-
 def resize_video(video_source):
 
     return video_source
 
 
-
 def extract_audio(video_source, output_dir='./'):
 
     output_audio = os.path.join(output_dir, 'output_orignal_audio.wav')
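The values in language_mapping are the target languages' names written in Chinese ('英语' = "English", '法语' = "French", '中文普通话' = "Mandarin Chinese", and so on), presumably so they can be substituted into the Chinese-language translation prompt (default_prompt, defined elsewhere in app.py).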
@@ -91,60 +87,58 @@ def extract_audio(video_source, output_dir='./'):
     print('ffmpeg command: ', ff.cmd)
     ff.run()
 
-    return output_audio
-
+    return output_audio
 
 
 def clone_audio(audio_file, audio_text):
 
     voice = clone(
         name=uuid.uuid4().hex,
-        description="",
+        description="",  # Optional
         files=[audio_file])
-
+
     print('voice: ', voice)
-    audio = generate(text=audio_text, voice=voice,
-                     …
+    audio = generate(text=audio_text, voice=voice,
+                     model='eleven_multilingual_v2')
+
     return audio
 
 
-# todo
+# todo
 def translate_text(text, target_language):
 
     target_language_name = language_mapping[target_language]
 
     chat_completion = openai.ChatCompletion.create(
-        …
+        engine="gpt-4",
+        temperature=0.1,
+        max_tokens=2048,
+        messages=[
+            {"role": "system", "content": default_prompt.replace(
+                '{{target_lang}}', target_language_name)},
+            {"role": "user", "content": text}])
+
     # print the completion
     print(chat_completion.choices[0].message.content)
 
-
     translated_text = chat_completion.choices[0].message.content
 
     return translated_text
 
 
-
 def infer(video_source, target_language):
 
     print('video_source: ', video_source)
 
-    # check the video format
-
+    # check the video format
+
     # Create a temporary directory to store the output file
     output_dir = tempfile.mkdtemp()
     output_video_file = os.path.join(output_dir, 'output_video.mp4')
     print("Output file: ", output_video_file)
 
     output_audio = extract_audio(video_source, output_dir=output_dir)
-
-
+
     result = model.transcribe(output_audio)
     whisper_text = result["text"]
     whisper_language = result['language']
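The engine argument filled in above is the Azure OpenAI calling convention (the context line openai.api_version = "2023-05-15" points the same way): on Azure, engine names a model deployment, whereas the standard OpenAI endpoint takes model="gpt-4". A minimal standalone sketch of the same call, using a hypothetical stand-in for the default_prompt template that app.py defines elsewhere:

import openai

openai.api_type = "azure"           # endpoint and key setup omitted; see app.py
openai.api_version = "2023-05-15"

# Hypothetical stand-in for app.py's default_prompt template.
default_prompt = "You are a translator. Translate the user's text into {{target_lang}}."

def translate_text_sketch(text, target_language_name):
    # openai-python 0.x style; `engine` is the Azure deployment name.
    completion = openai.ChatCompletion.create(
        engine="gpt-4",
        temperature=0.1,
        max_tokens=2048,
        messages=[
            {"role": "system",
             "content": default_prompt.replace('{{target_lang}}', target_language_name)},
            {"role": "user", "content": text},
        ])
    return completion.choices[0].message.content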
@@ -154,12 +148,12 @@ def infer(video_source, target_language):
     target_language_code = language_mapping[target_language]
 
     print("Target language code: ", target_language_code)
-
-    translated_text = …
+
+    translated_text = translate_text(whisper_text, target_language)
 
     print("Translated text: ", translated_text)
 
-    # voice clone && synthesis
+    # voice clone && synthesis
     audio = clone_audio(output_audio, translated_text)
 
     audio_file = os.path.join(output_dir, 'output_clone_audio.wav')
@@ -174,7 +168,7 @@ def infer(video_source, target_language):
     subprocess.run(wav2lip, shell=True, stdout=subprocess.PIPE)
 
     print("Video conversion successful.")
-
+
     return output_video_file
 
 
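The wav2lip command string run above is assembled outside this hunk. Assuming the stock Wav2Lip repository, the invocation would look roughly like the sketch below (checkpoint path and filenames are illustrative, not app.py's actual values):

# Illustrative only: the flags are Wav2Lip's inference.py arguments; the real
# command string in app.py is built outside the hunk shown above.
wav2lip = (
    "python inference.py"
    " --checkpoint_path ./checkpoints/wav2lip_gan.pth"
    f" --face {video_source}"
    f" --audio {audio_file}"
    f" --outfile {output_video_file}"
)
subprocess.run(wav2lip, shell=True, stdout=subprocess.PIPE)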
@@ -191,8 +185,10 @@ with gr.Blocks(css=css) as demo:
 
         with gr.Row():
             with gr.Column():
-                video_source = gr.Video(
-                    …
+                video_source = gr.Video(
+                    label="Source Video", show_label=True, interactive=True)
+                target_language = gr.Dropdown(choices=["English", "Spanish", "French", "German", "Italian", "Portuguese", "Polish", "Turkish",
+                                              "Russian", "Dutch", "Czech", "Arabic", "Chinese"], label="Target language", info="Target language!", value="English")
 
         submit_btn = gr.Button(value="Submit")
 
@@ -201,10 +197,11 @@ with gr.Blocks(css=css) as demo:
 
         with gr.Row():
             gr.Examples(
-                …
-        submit_btn.click(
-            …
+                label="Video Examples",
+                examples=['dictator.mp4'],
+                inputs=[video_source]
+            )
+        submit_btn.click(
+            infer, inputs=[video_source, target_language], outputs=result)
+
+    demo.queue(5).launch()
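The change this commit is named for is the last added line: demo.queue(5).launch(). In Gradio 3.x (which this 2023-era Space appears to use), the first positional argument of Blocks.queue is concurrency_count, so submissions are queued and served by up to 5 workers instead of hitting the default request timeout during the long transcribe/clone/Wav2Lip pipeline. A minimal sketch of the pattern, Gradio 3.x API assumed:

import time

import gradio as gr

def slow_job(text):
    time.sleep(30)  # stands in for transcription, voice cloning, and lip-sync
    return text

with gr.Blocks() as demo:
    inp = gr.Textbox(label="Input")
    out = gr.Textbox(label="Output")
    gr.Button("Run").click(slow_job, inputs=[inp], outputs=[out])

demo.queue(concurrency_count=5)  # equivalent to demo.queue(5) in app.py
demo.launch()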