|
import os |
|
import shutil |
|
from huggingface_hub import snapshot_download |
|
import gradio as gr |
|
from gradio_client import Client, handle_file |
|
from mutagen.mp3 import MP3 |
|
from pydub import AudioSegment |
|
from PIL import Image |
|
import ffmpeg |
|
# Make this file's directory the working directory, so the relative paths used
# further down ("pretrained_models", "configs/inference/default.yaml", the
# generated output files) are resolved from there.
os.chdir(os.path.dirname(os.path.abspath(__file__)))
|
from scripts.inference import inference_process |
|
import argparse |
|
import uuid |
|
|
|
# True when this app runs as the public shared Space; in that mode the driving
# audio is trimmed to AUDIO_MAX_DURATION before inference.
# SPACE_ID is read with a default so the app also starts when the variable is
# not set (e.g. a local run) instead of failing with KeyError at import time.
is_shared_ui = "fffiloni/tts-hallo-talking-portrait" in os.environ.get("SPACE_ID", "")
|
|
|
# Fetch the Hallo repository snapshot from the Hugging Face Hub into the local
# "pretrained_models" directory (runs at import time).
hallo_dir = snapshot_download(repo_id="fudan-generative-ai/hallo", local_dir="pretrained_models")

# Upper bound applied to the driving audio on the shared UI. The value is
# compared with len() of a pydub segment and used as a slice bound in
# trim_audio, i.e. it is expressed in milliseconds (4000 ms = 4 seconds).
AUDIO_MAX_DURATION = 4000
|
|
|
|
|
|
|
|
|
|
|
def is_mp3(file_path):
    """Report whether ``file_path`` can be opened as an MP3.

    The check consists of constructing ``MP3(file_path)``; any exception raised
    while doing so is interpreted as "not an MP3".
    """
    try:
        MP3(file_path)
    except Exception:
        return False
    return True
|
|
|
def convert_mp3_to_wav(mp3_file_path, wav_file_path):
    """Decode the MP3 at ``mp3_file_path`` and write it as WAV.

    Returns ``wav_file_path`` so the call can be used inline.
    """
    AudioSegment.from_mp3(mp3_file_path).export(wav_file_path, format="wav")
    return wav_file_path
|
|
|
|
|
def trim_audio(file_path, output_path, max_duration):
    """Write a copy of a WAV file that is at most ``max_duration`` long.

    ``max_duration`` is compared with ``len()`` of the loaded segment and used
    as a slice bound, so it uses the segment's own length unit. Audio that is
    already short enough is exported unchanged.

    Returns ``output_path``.
    """
    segment = AudioSegment.from_wav(file_path)

    # Slice only when the clip really exceeds the limit.
    clipped = segment[:max_duration] if len(segment) > max_duration else segment

    clipped.export(output_path, format="wav")
    return output_path
|
|
|
|
|
def add_silence_to_wav(wav_file_path, duration_s=1):
    """Append ``duration_s`` seconds of silence to a WAV file.

    The file at ``wav_file_path`` is overwritten with the padded audio and the
    same path is returned.
    """
    padded = AudioSegment.from_wav(wav_file_path) + AudioSegment.silent(duration=duration_s * 1000)
    padded.export(wav_file_path, format="wav")
    return wav_file_path
|
|
|
def check_mp3(file_path):
    """Convert an MP3 to a uniquely named WAV; pass other files through.

    Returns a pair: the audio path to use from now on, and a ``gr.update`` that
    puts that same path into a component and makes it visible.
    """
    # Guard clause: anything that is not an MP3 is handed back untouched.
    if not is_mp3(file_path):
        print("The file is not an MP3 file.")
        return file_path, gr.update(value=file_path, visible=True)

    stem = os.path.splitext(file_path)[0]
    wav_file_path = f"{stem}-{uuid.uuid4()}.wav"
    converted_audio = convert_mp3_to_wav(file_path, wav_file_path)
    print(f"File converted to {wav_file_path}")
    return converted_audio, gr.update(value=converted_audio, visible=True)
|
|
|
def check_and_convert_webp_to_png(input_path, output_path):
    """Return the path of a non-WebP version of the image at ``input_path``.

    A WebP image is re-saved as PNG at ``output_path`` and that path is
    returned. Any other format is left alone and ``input_path`` is returned.
    When the file cannot be opened, a message is printed and ``None`` is
    returned.
    """
    try:
        with Image.open(input_path) as img:
            if img.format != 'WEBP':
                print(f"The file {input_path} is not in WebP format.")
                return input_path

            img.save(output_path, 'PNG')
            print(f"Converted {input_path} to {output_path}")
            return output_path
    except IOError:
        print(f"Cannot open {input_path}. The file might not exist or is not an image.")
        return None
|
|
|
def convert_user_uploded_webp(input_path):
    """Ensure a portrait image is not WebP, converting it to PNG if needed.

    The PNG target gets a unique name, so repeated calls do not overwrite each
    other. Returns whatever ``check_and_convert_webp_to_png`` returns.
    """
    png_target = f"converted_to_png_portrait-{uuid.uuid4()}.png"
    ready_png = check_and_convert_webp_to_png(input_path, png_target)
    print(f"PORTRAIT PNG FILE: {ready_png}")
    return ready_png
|
|
|
def clear_audio_elms():
    """Build the update that empties and hides the component it is wired to."""
    emptied_and_hidden = gr.update(value=None, visible=False)
    return emptied_and_hidden
|
|
|
def change_video_codec(input_file, output_file, codec='libx264', audio_codec='aac'):
    """Re-encode ``input_file`` into ``output_file`` with the given codecs.

    Args:
        input_file: Path of the video to re-encode.
        output_file: Destination path; an existing file is overwritten.
        codec: Video codec, passed to ffmpeg as ``vcodec``.
        audio_codec: Audio codec, passed to ffmpeg as ``acodec``.

    Returns:
        None. A failed run is reported on stdout and not re-raised (best
        effort), so ``output_file`` may not exist afterwards.
    """
    stream = ffmpeg.input(input_file).output(output_file, vcodec=codec, acodec=audio_codec)
    try:
        stream.run(overwrite_output=True)
    except ffmpeg.Error as e:
        # run() is called without capture_stderr, so e.stderr is None here;
        # calling .decode() on it unconditionally raised AttributeError inside
        # this handler and hid the real ffmpeg failure.
        details = e.stderr.decode(errors="replace") if e.stderr else str(e)
        print(f'Error occurred: {details}')
    else:
        print(f'Successfully changed codec of {input_file} and saved as {output_file}')
|
|
|
|
|
|
|
|
|
|
|
|
|
def generate_portrait(prompt_image):
    """Generate a portrait from a text prompt with the ByteDance/SDXL-Lightning Space.

    Args:
        prompt_image: Text prompt describing the portrait.

    Returns:
        The image path returned by ``check_and_convert_webp_to_png`` for the
        generated file (a new PNG when the Space returned WebP).

    Raises:
        gr.Error: If the prompt is empty/blank, or the client for the remote
            Space cannot be created.
    """
    # A prompt made only of whitespace is as unusable as an empty one.
    if prompt_image is None or not prompt_image.strip():
        raise gr.Error("Can't generate a portrait without a prompt !")

    try:
        client = Client("ByteDance/SDXL-Lightning")
    except Exception as err:
        # `except Exception` instead of a bare `except:` so KeyboardInterrupt
        # and SystemExit are not turned into a UI error.
        raise gr.Error(
            "ByteDance/SDXL-Lightning space's api might not be ready, please wait, or upload an image instead."
        ) from err

    result = client.predict(
        prompt=prompt_image,
        ckpt="4-Step",
        api_name="/generate_image",
    )
    print(result)

    # Unique target name so concurrent requests never share a PNG.
    output_file = f"converted_to_png_portrait-{uuid.uuid4()}.png"
    ready_png = check_and_convert_webp_to_png(result, output_file)
    print(f"PORTRAIT PNG FILE: {ready_png}")

    return ready_png
|
|
|
def generate_voice_with_parler(prompt_audio, voice_description):
    """Synthesize speech with the parler-tts/parler_tts_mini Space.

    Args:
        prompt_audio: Text to speak.
        voice_description: Optional free-text description of the voice; when
            missing, an info toast suggests providing one and generation
            continues.

    Returns:
        A pair: the value returned by the Space, and a ``gr.update`` that puts
        that value into a component and makes it visible.

    Raises:
        gr.Error: If the text is empty/blank, or the client for the remote
            Space cannot be created.
    """
    # A text made only of whitespace is as unusable as an empty one.
    if prompt_audio is None or not prompt_audio.strip():
        raise gr.Error("Can't generate a voice without text to synthetize !")

    if voice_description is None or voice_description == "":
        gr.Info(
            "For better control, You may want to provide a voice character description next time.",
            duration=10,
            visible=True,
        )

    try:
        client = Client("parler-tts/parler_tts_mini")
    except Exception as err:
        # `except Exception` instead of a bare `except:` so KeyboardInterrupt
        # and SystemExit are not turned into a UI error.
        raise gr.Error(
            "parler-tts/parler_tts_mini space's api might not be ready, please wait, or upload an audio instead."
        ) from err

    result = client.predict(
        text=prompt_audio,
        description=voice_description,
        api_name="/gen_tts",
    )
    print(result)
    return result, gr.update(value=result, visible=True)
|
|
|
def get_whisperspeech(prompt_audio_whisperspeech, audio_to_clone):
    """Clone a voice with the collabora/WhisperSpeech Space.

    Args:
        prompt_audio_whisperspeech: Text to speak.
        audio_to_clone: Path of the audio sample whose voice is cloned.

    Returns:
        A pair: the value returned by the Space, and a ``gr.update`` that puts
        that value into a component and makes it visible.

    Raises:
        gr.Error: If the text or the voice sample is missing, or the client for
            the remote Space cannot be created.
    """
    # Validate up front, as generate_voice_with_parler does; a missing sample
    # would otherwise only fail later inside handle_file() with an unrelated
    # "file does not exist" message.
    if prompt_audio_whisperspeech is None or not prompt_audio_whisperspeech.strip():
        raise gr.Error("Can't generate a voice without text to synthetize !")
    if not audio_to_clone:
        raise gr.Error("Please provide a voice sample to clone.")

    try:
        client = Client("collabora/WhisperSpeech")
    except Exception as err:
        # `except Exception` instead of a bare `except:` so KeyboardInterrupt
        # and SystemExit are not turned into a UI error.
        raise gr.Error(
            "collabora/WhisperSpeech space's api might not be ready, please wait, or upload an audio instead."
        ) from err

    result = client.predict(
        multilingual_text=prompt_audio_whisperspeech,
        speaker_audio=handle_file(audio_to_clone),
        speaker_url="",
        cps=14,
        api_name="/whisper_speech_demo",
    )
    print(result)
    return result, gr.update(value=result, visible=True)
|
|
|
def get_maskGCT_TTS(prompt_audio_maskGCT, audio_to_clone):
    """Clone a voice with the amphion/maskgct Space.

    Args:
        prompt_audio_maskGCT: Text to speak.
        audio_to_clone: Path of the audio sample whose voice is cloned.

    Returns:
        A pair: the value returned by the Space, and a ``gr.update`` that puts
        that value into a component and makes it visible.

    Raises:
        gr.Error: If the text or the voice sample is missing, or the client for
            the remote Space cannot be created.
    """
    # Validate up front, as generate_voice_with_parler does; a missing sample
    # would otherwise only fail later inside handle_file() with an unrelated
    # "file does not exist" message.
    if prompt_audio_maskGCT is None or not prompt_audio_maskGCT.strip():
        raise gr.Error("Can't generate a voice without text to synthetize !")
    if not audio_to_clone:
        raise gr.Error("Please provide a voice sample to clone.")

    try:
        client = Client("amphion/maskgct")
    except Exception as err:
        # `except Exception` instead of a bare `except:` so KeyboardInterrupt
        # and SystemExit are not turned into a UI error.
        raise gr.Error(
            "amphion/maskgct space's api might not be ready, please wait, or upload an audio instead."
        ) from err

    result = client.predict(
        prompt_wav=handle_file(audio_to_clone),
        target_text=prompt_audio_maskGCT,
        target_len=-1,
        n_timesteps=25,
        api_name="/predict",
    )
    print(result)
    return result, gr.update(value=result, visible=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
def run_hallo(source_image, driving_audio, progress=gr.Progress(track_tqdm=True)):
    """Run Hallo inference and return the path of the video it writes.

    The inference entry point takes an ``argparse.Namespace``; one is built
    here with fixed weights and the default inference config. The output file
    gets a unique name per call.

    ``progress`` is not read in the body; it is only declared in the signature
    (presumably so Gradio reports tqdm progress for this call — TODO confirm).
    """
    output_path = f'output-{uuid.uuid4()}.mp4'

    settings = {
        "config": 'configs/inference/default.yaml',
        "source_image": source_image,
        "driving_audio": driving_audio,
        "output": output_path,
        "pose_weight": 1.0,
        "face_weight": 1.0,
        "lip_weight": 1.0,
        "face_expand_ratio": 1.2,
        "checkpoint": None,
    }

    inference_process(argparse.Namespace(**settings))
    return output_path
|
|
|
def generate_talking_portrait(portrait, voice, progress=gr.Progress(track_tqdm=True)):
    """Animate ``portrait`` with ``voice`` and return the resulting video path.

    Steps: take a private working copy of the audio (trimmed to
    ``AUDIO_MAX_DURATION`` on the shared UI), append one second of silence,
    run Hallo, then re-encode the video with ``change_video_codec``.

    Args:
        portrait: Path of the source image.
        voice: Path of the driving audio (WAV).
        progress: Not read in the body; declared in the signature only.

    Returns:
        Path of the re-encoded video (``converted_output-<uuid>.mp4``).

    Raises:
        gr.Error: If the portrait or the audio is missing.
    """
    if portrait is None:
        raise gr.Error("Please provide a portrait to animate.")

    if voice is None:
        raise gr.Error("Please provide audio (4 seconds max).")

    # Always work on a uniquely named copy. add_silence_to_wav() overwrites its
    # input, so padding `voice` directly (as happened outside the shared UI)
    # modified the user's file and added one more second of silence on every
    # run. The name also no longer starts with "-", which command-line tools
    # can mistake for an option.
    working_audio = f"driving_audio-{uuid.uuid4()}.wav"
    if is_shared_ui:
        trim_audio(voice, working_audio, AUDIO_MAX_DURATION)
    else:
        shutil.copyfile(voice, working_audio)

    ready_audio = add_silence_to_wav(working_audio)
    print(f"1 second of silence added to {working_audio}")

    talking_portrait_vid = run_hallo(portrait, ready_audio)

    # Re-encode the Hallo output before handing it to the video component.
    final_output_file = f"converted_{talking_portrait_vid}"
    change_video_codec(talking_portrait_vid, final_output_file)

    return final_output_file
|
|
|
|
|
# Custom stylesheet passed to gr.Blocks(css=...). Selectors refer to the
# elem_id values given to the components in the layout below.
# Fix: the third selector of the "#audio-block, ..." rule was written without
# "#", making it a type selector that matched nothing; it now targets the
# MaskGCT "voice to clone" element like its WhisperSpeech sibling.
# NOTE(review): `background-color: none` on #main-group is not a valid value
# and is ignored by browsers; left unchanged to keep the current rendering.
# NOTE(review): div#component-8 looks like an auto-generated component id and
# depends on component creation order — confirm before reordering the layout.
css = '''
#col-container {
    margin: 0 auto;
}
#column-names {
    margin-top: 50px;
}
#main-group {
    background-color: none;
}
.tabs {
    background-color: unset;
}
#image-block {
    flex: 1;
}
#video-block {
    flex: 9;
}
#audio-block, #audio-clone-elm, #audio-clone-elm-maskGCT {
    flex: 1;
}
div#audio-clone-elm > .audio-container > button {
    height: 180px!important;
}
div#audio-clone-elm > .audio-container > button > .wrap {
    font-size: 0.9em;
}
div#audio-clone-elm-maskGCT > .audio-container > button {
    height: 180px!important;
}
div#audio-clone-elm-maskGCT > .audio-container > button > .wrap {
    font-size: 0.9em;
}
#text-synth, #voice-desc{
    height: 130px;
}
#text-synth-wsp {
    height: 120px;
}
#text-synth-maskGCT {
    height: 120px;
}
#audio-column, #result-column {
    display: flex;
}
#gen-voice-btn {
    flex: 1;
}
#parler-tab, #whisperspeech-tab, #maskGCT-tab {
    padding: 0;
}
#main-submit{
    flex: 1;
}
#pro-tips {
    margin-top: 50px;
}
div#warning-ready {
    background-color: #ecfdf5;
    padding: 0 16px 16px;
    margin: 20px 0;
    color: #030303!important;
}
div#warning-ready > .gr-prose > h2, div#warning-ready > .gr-prose > p {
    color: #057857!important;
}
div#warning-duplicate {
    background-color: #ebf5ff;
    padding: 0 16px 16px;
    margin: 20px 0;
    color: #030303!important;
}
div#warning-duplicate > .gr-prose > h2, div#warning-duplicate > .gr-prose > p {
    color: #0f4592!important;
}
div#warning-duplicate strong {
    color: #0f4592;
}
p.actions {
    display: flex;
    align-items: center;
    margin: 20px 0;
}
div#warning-duplicate .actions a {
    display: inline-block;
    margin-right: 10px;
}
.dark #warning-duplicate {
    background-color: #0c0c0c !important;
    border: 1px solid white !important;
}
div#component-8 {
    align-items: stretch;
}
'''
|
|
|
# Build the Gradio interface: three columns (portrait, voice, result), a row of
# usage tips, then the event wiring.
# NOTE(review): the stylesheet targets div#component-8, which looks like an
# auto-generated component id — confirm before adding or reordering components.
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        # Page header and usage notice.
        gr.Markdown("""
        # TTS x Hallo Talking Portrait Generator

        This demo allows you to generate a talking portrait with the help of several open-source projects: SDXL Lightning | Parler TTS | WhisperSpeech | Hallo

        To let the community try and enjoy this demo, video length is limited to 4 seconds audio maximum.

        Duplicate this space to skip the queue and get unlimited video duration. 4-5 seconds of audio will take ~5 minutes per inference, please be patient.
        """)
        # Headings shown above the three working columns.
        with gr.Row(elem_id="column-names"):
            gr.Markdown("## 1. Load Portrait")
            gr.Markdown("## 2. Load Voice")
            gr.Markdown("## 3. Result")
        with gr.Group(elem_id="main-group"):
            with gr.Row():
                # Column 1: portrait, uploaded or generated from a prompt.
                with gr.Column():

                    portrait = gr.Image(
                        sources = ["upload"],
                        type = "filepath",
                        format = "png",
                        elem_id = "image-block"
                    )

                    prompt_image = gr.Textbox(
                        label = "Generate image",
                        lines = 2,
                        max_lines = 2
                    )

                    gen_image_btn = gr.Button("Generate portrait (optional)")

                # Column 2: driving audio, uploaded/recorded or produced by one
                # of the TTS tabs.
                with gr.Column(elem_id="audio-column"):

                    voice = gr.Audio(
                        type = "filepath",
                        elem_id = "audio-block"
                    )

                    # Hidden by default; the audio handlers below update it
                    # with the processed audio file and make it visible.
                    preprocess_audio_file = gr.File(visible=False)

                    with gr.Tab("Parler TTS", elem_id="parler-tab"):

                        prompt_audio = gr.Textbox(
                            label = "Text to synthetize",
                            lines = 3,
                            max_lines = 3,
                            elem_id = "text-synth"
                        )

                        voice_description = gr.Textbox(
                            label = "Voice description",
                            lines = 3,
                            max_lines = 3,
                            elem_id = "voice-desc"
                        )

                        gen_voice_btn = gr.Button("Generate voice (optional)")

                    with gr.Tab("WhisperSpeech", elem_id="whisperspeech-tab"):
                        prompt_audio_whisperspeech = gr.Textbox(
                            label = "Text to synthetize",
                            lines = 2,
                            max_lines = 2,
                            elem_id = "text-synth-wsp"
                        )
                        audio_to_clone = gr.Audio(
                            label = "Voice to clone",
                            type = "filepath",
                            elem_id = "audio-clone-elm"
                        )
                        gen_wsp_voice_btn = gr.Button("Generate voice clone (optional)")

                    with gr.Tab("MaskGCT TTS", elem_id="maskGCT-tab"):
                        prompt_audio_maskGCT = gr.Textbox(
                            label = "Text to synthetize",
                            lines = 2,
                            max_lines = 2,
                            elem_id = "text-synth-maskGCT"
                        )
                        audio_to_clone_maskGCT = gr.Audio(
                            label = "Voice to clone",
                            type = "filepath",
                            elem_id = "audio-clone-elm-maskGCT"
                        )
                        gen_maskGCT_voice_btn = gr.Button("Generate voice clone (optional)")

                # Column 3: generated video and the main action button.
                with gr.Column(elem_id="result-column"):

                    result = gr.Video(
                        elem_id="video-block"
                    )

                    submit_btn = gr.Button("Go talking Portrait !", elem_id="main-submit")

        # Static usage tips for Hallo inputs and for the TTS tabs.
        with gr.Row(elem_id="pro-tips"):
            gr.Markdown("""
            # Hallo Pro Tips:

            Hallo has a few simple requirements for input data:

            For the source image:

            1. It should be cropped into squares.
            2. The face should be the main focus, making up 50%-70% of the image.
            3. The face should be facing forward, with a rotation angle of less than 30° (no side profiles).

            For the driving audio:

            1. It must be in WAV format.
            2. It must be in English since our training datasets are only in this language.
            3. Ensure the vocals are clear; background music is acceptable.


            """)

            gr.Markdown("""
            # TTS Pro Tips:

            For Parler TTS:

            - Include the term "very clear audio" to generate the highest quality audio, and "very noisy audio" for high levels of background noise
            - Punctuation can be used to control the prosody of the generations, e.g. use commas to add small breaks in speech
            - The remaining speech features (gender, speaking rate, pitch and reverberation) can be controlled directly through the prompt

            For WhisperSpeech:

            WhisperSpeech is able to quickly clone a voice from an audio sample.

            - Upload a voice sample in the WhisperSpeech tab
            - Add text to synthetize, hit Generate voice clone button

            """)

    # --- Event wiring ---------------------------------------------------
    # Every handler is registered with show_api=False; all but the final
    # submit are also registered with queue=False.

    # Uploaded portrait: converted to PNG when it is a WebP file.
    portrait.upload(
        fn = convert_user_uploded_webp,
        inputs = [portrait],
        outputs = [portrait],
        queue = False,
        show_api = False
    )

    # Uploaded audio: MP3 files are converted to WAV; the resulting file is
    # also shown in the file component.
    voice.upload(
        fn = check_mp3,
        inputs = [voice],
        outputs = [voice, preprocess_audio_file],
        queue = False,
        show_api = False
    )

    # Clearing the audio also empties and hides the file component.
    voice.clear(
        fn = clear_audio_elms,
        inputs = None,
        outputs = [preprocess_audio_file],
        queue = False,
        show_api = False
    )

    # Optional portrait generation from the text prompt.
    gen_image_btn.click(
        fn = generate_portrait,
        inputs = [prompt_image],
        outputs = [portrait],
        queue = False,
        show_api = False
    )

    # Optional voice generation; each TTS tab writes into the same audio
    # component and file component.
    gen_voice_btn.click(
        fn = generate_voice_with_parler,
        inputs = [prompt_audio, voice_description],
        outputs = [voice, preprocess_audio_file],
        queue = False,
        show_api = False
    )

    gen_wsp_voice_btn.click(
        fn = get_whisperspeech,
        inputs = [prompt_audio_whisperspeech, audio_to_clone],
        outputs = [voice, preprocess_audio_file],
        queue = False,
        show_api = False
    )

    gen_maskGCT_voice_btn.click(
        fn = get_maskGCT_TTS,
        inputs = [prompt_audio_maskGCT, audio_to_clone_maskGCT],
        outputs = [voice, preprocess_audio_file],
        queue = False,
        show_api = False
    )

    # Main action: animate the portrait with the audio (no queue=False here).
    submit_btn.click(
        fn = generate_talking_portrait,
        inputs = [portrait, voice],
        outputs = [result],
        show_api = False
    )


# Start the app with a request queue capped at max_size=2 and with errors
# surfaced in the UI (show_error=True).
demo.queue(max_size=2).launch(show_error=True, show_api=False)