# mainmainminavoiceclone
#
# Running
#
# File size: 12,248 Bytes

import os
import subprocess
import sys
import tempfile
import uuid

import gradio as gr
from pydub import AudioSegment
from scipy.io.wavfile import read, write
from TTS.api import TTS

# Set environment variables to accept license terms
os.environ["COQUI_TOS_AGREED"] = "1"





# Global variables and settings

# Dropdown label -> language code. Labels found here are routed to
# synthesize_speech, which passes the code as `language=` to the shared
# XTTS v2 model below.
language_options = {
    "English (en)": "en",
    "Spanish (es)": "es",
    "French (fr)": "fr",
    "German (de)": "de",
    "Italian (it)": "it",
    "Portuguese (pt)": "pt",
    "Polish (pl)": "pl",
    "Turkish (tr)": "tr",
    "Russian (ru)": "ru",
    "Dutch (nl)": "nl",
    "Czech (cs)": "cs",
    "Arabic (ar)": "ar",
    "Chinese (zh-cn)": "zh-cn",
    "Japanese (ja)": "ja",
    "Hungarian (hu)": "hu",
    "Korean (ko)": "ko",
    "Hindi (hi)": "hi"
}

# Dropdown label -> three-letter code. Labels found here are routed to
# synthesize_and_convert_voice, which interpolates the code into the model
# name "tts_models/{code}/fairseq/vits".
other_language = {
    "Vietnamese": "vie",
    "Serbian": "srp",
    "Romanian": "ron",
    "Indonesian": "ind",
    "Philippine": "tgl"
}

# Shared XTTS v2 model instance, created once when the module is imported
# and used by synthesize_speech.
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")

# Voice-cloning functions
def clean_audio(audio_path):
    """Filter a recording with ffmpeg and trim silence from both ends.

    The result is written to a new, uniquely named WAV file under
    ``output/``. Cleaning is best-effort: any failure is logged and the
    original path is handed back so the caller can continue with the
    unprocessed recording.

    Args:
        audio_path: Path of the audio file to clean.

    Returns:
        Path of the cleaned WAV on success, otherwise ``audio_path`` itself.
    """
    # ffmpeg does not create missing parent directories for its output file.
    os.makedirs("output", exist_ok=True)
    out_filename = f"output/cleaned_{uuid.uuid4()}.wav"

    trim_leading_silence = (
        "silenceremove=start_periods=1:start_silence=0:start_threshold=0.02"
    )
    # The silence filter is configured to trim only the start of the stream,
    # so the audio is reversed, trimmed, reversed back and trimmed again to
    # cover both ends. Joining avoids a dangling trailing separator.
    audio_filters = ",".join([
        "lowpass=8000",
        "highpass=75",
        "areverse",
        trim_leading_silence,
        "areverse",
        trim_leading_silence,
    ])

    # Pass an argument list rather than splitting a formatted string, so
    # paths containing whitespace reach ffmpeg as a single argument.
    command = ["ffmpeg", "-y", "-i", audio_path, "-af", audio_filters, out_filename]
    try:
        subprocess.run(command, capture_output=True, check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing or non-executable ffmpeg binary.
        print(f"Error during audio cleaning: {e}")
        return audio_path

    print(f"Audio cleaned and saved to {out_filename}")
    return out_filename

def check_audio_length(audio_path, max_duration=120):
    """Tell whether an audio file fits within a duration limit.

    Args:
        audio_path: Path of the audio file to inspect.
        max_duration: Longest accepted duration, in seconds.

    Returns:
        ``True`` when the file could be loaded and is not longer than
        ``max_duration``; ``False`` when it is too long or when anything
        goes wrong while loading or measuring it (the error is printed).
    """
    try:
        seconds = AudioSegment.from_file(audio_path).duration_seconds
        too_long = seconds > max_duration
        if too_long:
            print(f"Audio is too long: {seconds} seconds. Max allowed is {max_duration} seconds.")
        return not too_long
    except Exception as e:
        print(f"Error while checking audio length: {e}")
        return False

def synthesize_and_convert_voice(text, language_iso, voice_audio_path, speed):
    """Synthesize text with a per-language VITS model, then convert the voice.

    Speech is first generated by ``tts_models/{language_iso}/fairseq/vits``
    and then passed through the FreeVC voice-conversion model with
    ``voice_audio_path`` as the conversion target.

    Args:
        text: Text to synthesize.
        language_iso: Language code interpolated into the VITS model name.
        voice_audio_path: Path of the reference recording used as the
            voice-conversion target.
        speed: Speed value forwarded to the synthesis call.

    Returns:
        ``(sample_rate, audio_data)`` as read back from the converted WAV.
    """
    temp_paths = []
    try:
        # Reserve two file names and close the handles immediately, so the
        # models can open the paths themselves on every platform.
        for _ in range(2):
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as handle:
                temp_paths.append(handle.name)
        temp_tts_wav_path, temp_output_wav_path = temp_paths

        tts_synthesis = TTS(model_name=f"tts_models/{language_iso}/fairseq/vits")
        # Let the model write the file itself (as synthesize_speech does)
        # instead of writing raw samples with a hard-coded 22050 Hz rate.
        tts_synthesis.tts_to_file(text=text, file_path=temp_tts_wav_path, speed=speed)

        tts_conversion = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False)
        tts_conversion.voice_conversion_to_file(temp_tts_wav_path, target_wav=voice_audio_path,
                                                file_path=temp_output_wav_path)

        output_sample_rate, output_audio_data = read(temp_output_wav_path)
        return (output_sample_rate, output_audio_data)
    finally:
        # Always clean up, including when synthesis or conversion raises.
        for path in temp_paths:
            if os.path.exists(path):
                os.remove(path)

def synthesize_speech(text, speaker_wav_path, language_iso, speed):
    """Synthesize text in the speaker's voice with the shared XTTS v2 model.

    The module-level ``tts`` model generates speech conditioned on
    ``speaker_wav_path``; the result is then passed through the FreeVC
    voice-conversion model with the same recording as the target.

    Args:
        text: Text to synthesize.
        speaker_wav_path: Path of the reference recording of the speaker.
        language_iso: Language code passed as ``language=`` to the model.
        speed: Speed value forwarded to the synthesis call.

    Returns:
        ``(sample_rate, audio_data)`` as read back from the converted WAV.
    """
    temp_paths = []
    try:
        # Reserve two file names and close the handles immediately, so the
        # models can open the paths themselves on every platform.
        for _ in range(2):
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as handle:
                temp_paths.append(handle.name)
        temp_tts_output_path, temp_output_wav_path = temp_paths

        tts.tts_to_file(text=text, file_path=temp_tts_output_path, speed=speed,
                        speaker_wav=speaker_wav_path, language=language_iso)

        tts_conversion = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False)
        tts_conversion.voice_conversion_to_file(temp_tts_output_path, target_wav=speaker_wav_path,
                                                file_path=temp_output_wav_path)

        output_sample_rate, output_audio_data = read(temp_output_wav_path)
        return (output_sample_rate, output_audio_data)
    finally:
        # Always clean up, including when synthesis or conversion raises.
        for path in temp_paths:
            if os.path.exists(path):
                os.remove(path)

def get_language_code(selected_language):
    """Translate a dropdown label into its language code.

    ``language_options`` is consulted first, then ``other_language``.

    Args:
        selected_language: Label chosen in the language dropdown.

    Returns:
        The matching code, or ``None`` when the label is in neither table.
    """
    for table in (language_options, other_language):
        if selected_language in table:
            return table[selected_language]
    return None

def process_speech(text, speaker_wav_path, selected_language, speed):
    """Validate the request, clean the reference audio and synthesize speech.

    Args:
        text: Text to synthesize.
        speaker_wav_path: Path of the uploaded reference recording, or
            ``None`` when nothing was uploaded.
        selected_language: Label chosen in the language dropdown.
        speed: Speed value forwarded to the synthesis function.

    Returns:
        ``(sample_rate, audio_data)`` produced by the synthesis function.

    Raises:
        ValueError: If ``selected_language`` is not a known label.
        gr.Error: If no reference recording was uploaded or it is longer
            than two minutes.
    """
    language_code = get_language_code(selected_language)
    if language_code is None:
        raise ValueError("Выбранный язык не поддерживается.")

    if speaker_wav_path is None:
        raise gr.Error("Пожалуйста, загрузите аудио файл говорящего.", duration=5)

    # Check audio length
    duration = AudioSegment.from_file(speaker_wav_path).duration_seconds
    if duration > 120:
        raise gr.Error("Длина аудио превышает допустимый лимит в 2 минуты.", duration=5)

    # Clean audio
    cleaned_wav_path = clean_audio(speaker_wav_path)
    try:
        # Labels from other_language use the per-language VITS model plus
        # voice conversion; everything else goes through XTTS v2.
        if selected_language in other_language:
            return synthesize_and_convert_voice(text, language_code, cleaned_wav_path, speed)
        return synthesize_speech(text, cleaned_wav_path, language_code, speed)
    finally:
        # clean_audio hands back the input path when cleaning fails; only
        # delete the file if it really is a separate temporary copy, so the
        # user's upload is never removed.
        if cleaned_wav_path != speaker_wav_path and os.path.exists(cleaned_wav_path):
            os.remove(cleaned_wav_path)

def restart_program():
    """Replace the current process with a fresh run of the same command.

    The running interpreter is re-executed with the original ``sys.argv``;
    this call does not return on success.
    """
    os.execv(sys.executable, [sys.executable, *sys.argv])

# Lip-sync functions
def generate(video, audio, checkpoint, no_smooth, resize_factor, pad_top, pad_bottom, pad_left, pad_right, save_as_video):
    """Run ``inference.py`` in a subprocess to lip-sync a video to an audio track.

    Args:
        video: Path of the input video or image.
        audio: Path of the input audio.
        checkpoint: Checkpoint name; resolved to ``checkpoints/{checkpoint}.pth``.
        no_smooth: When true, ``--nosmooth`` is passed to the script.
        resize_factor: Value passed as ``--resize_factor``.
        pad_top: First value passed to ``--pads``.
        pad_bottom: Second value passed to ``--pads``.
        pad_left: Third value passed to ``--pads``.
        pad_right: Fourth value passed to ``--pads``.
        save_as_video: When true, ``--save_as_video`` is passed to the script.

    Returns:
        Path of the generated video file.

    Raises:
        gr.Error: If an input is missing, the inference script fails, or no
            output file was produced. Raising (rather than returning the
            message) keeps error text from being handed to the video output
            component as if it were a file path.
    """
    if video is None or audio is None or checkpoint is None:
        raise gr.Error("Пожалуйста, загрузите видео/изображение и аудио файл, а также выберите чекпойнт.")

    print(f"Текущая рабочая директория: {os.getcwd()}")
    print(f"Содержимое текущей директории: {os.listdir('.')}")
    print(f"Проверка наличия 'inference.py': {os.path.exists('inference.py')}")

    video_path = video  # Path to the video or image
    audio_path = audio  # Path to the audio

    print(f"Путь к видео: {video_path}")
    print(f"Путь к аудио: {audio_path}")

    output_dir = "outputs"
    os.makedirs(output_dir, exist_ok=True)

    # A unique name per request: overlapping runs cannot overwrite each
    # other, and a leftover file from an earlier run cannot be mistaken for
    # this run's result.
    output_path = os.path.join(output_dir, f"output_{uuid.uuid4()}.mp4")
    print(f"Путь к выходному файлу: {output_path}")

    args = [
        "--checkpoint_path", f"checkpoints/{checkpoint}.pth",
        "--segmentation_path", "checkpoints/face_segmentation.pth",
        "--no_seg",
        "--no_sr",
        "--face", video_path,
        "--audio", audio_path,
        "--outfile", output_path,
        "--resize_factor", str(resize_factor),
        "--face_det_batch_size", "4",
        "--wav2lip_batch_size", "64",
        "--fps", "30",
        "--pads", str(pad_top), str(pad_bottom), str(pad_left), str(pad_right),
    ]

    if no_smooth:
        args.append("--nosmooth")

    if save_as_video:
        args.append("--save_as_video")

    # Use the interpreter running this app so the script sees the same
    # environment and installed packages, whatever "python" resolves to.
    cmd = [sys.executable, "inference.py"] + args
    print(f"Запуск инференса с командой: {' '.join(cmd)}")
    try:
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Ошибка при выполнении команды: {e}")
        raise gr.Error(f"Произошла ошибка при обработке: {e}") from e

    if not os.path.exists(output_path):
        print("Выходной файл не существует.")
        raise gr.Error("Не удалось создать выходное видео.")

    print(f"Выходной файл создан по пути: {output_path}")
    return output_path  # Path of the generated video

# Build the Gradio interface with tabs
with gr.Blocks() as app:
    gr.Markdown("# Voice Clone Union")

    with gr.Tabs():
        with gr.TabItem("Voice Clone"):
            # Voice-cloning interface
            text_input = gr.Textbox(label="Введите текст для генерации", placeholder="Введите ваш текст здесь...")
            speaker_wav_input = gr.Audio(label="Загрузите аудио файла говорящего (WAV формат)", type="filepath")

            # The dropdown offers the labels of both language tables.
            all_languages = list(language_options.keys()) + list(other_language.keys())
            language_input = gr.Dropdown(
                label="Язык",
                choices=all_languages,
                value="English (en)"
            )

            speed_input = gr.Slider(
                label="Скорость синтеза",
                minimum=0.1,
                maximum=10,
                step=0.1,
                value=1.0,
                info="Выберите скорость"
            )

            # Receives the (sample_rate, audio_data) tuple returned by process_speech.
            output_audio = gr.Audio(label="Сгенерированное аудио", type="filepath")

            with gr.Row():
                synthesize_button = gr.Button("Сгенерировать")
                # Fixed-width empty element placed between the two buttons.
                gr.HTML("<div style='width:300px;'></div>")
                reload_button = gr.Button("Перезапустить")

            synthesize_button.click(
                fn=process_speech,
                inputs=[text_input, speaker_wav_input, language_input, speed_input],
                outputs=output_audio
            )

            # Re-executes the whole program via restart_program.
            reload_button.click(fn=restart_program, inputs=None, outputs=None)

        with gr.TabItem("Lipsync"):
            # Lip-sync interface
            gr.Markdown("## Lipsync")
            with gr.Row():
                video = gr.File(label="Видео или Изображение", type="filepath")
                audio = gr.File(label="Аудио", type="filepath")
                with gr.Column():
                    # Created with visible=False, so it is not shown in the UI;
                    # its value defaults to "wav2lip_gan".
                    checkpoint = gr.Radio(["wav2lip", "wav2lip_gan"], label="Чекпойнт", value="wav2lip_gan", visible=False)
                    no_smooth = gr.Checkbox(label="Без сглаживания", value=False)
                    resize_factor = gr.Slider(minimum=1, maximum=4, step=1, label="Фактор изменения размера", value=2)
            with gr.Row():
                with gr.Column():
                    # Padding sliders; generate passes them to --pads in this order.
                    pad_top = gr.Slider(minimum=0, maximum=50, step=1, value=0, label="Отступ сверху")
                    pad_bottom = gr.Slider(minimum=0, maximum=50, step=1, value=10, label="Отступ снизу")
                    pad_left = gr.Slider(minimum=0, maximum=50, step=1, value=0, label="Отступ слева")
                    pad_right = gr.Slider(minimum=0, maximum=50, step=1, value=0, label="Отступ справа")
                    save_as_video = gr.Checkbox(label="Сохранять как видео", value=True)
                    generate_btn = gr.Button("Сгенерировать")
                with gr.Column():
                    result = gr.Video(label="Результат")

            # The order of `inputs` matches the positional parameters of generate.
            generate_btn.click(
                generate,
                inputs=[video, audio, checkpoint, no_smooth, resize_factor, pad_top, pad_bottom, pad_left, pad_right, save_as_video],
                outputs=result,
                # concurrency_limit=30
            )

    # Defined inside the `with` body, but `with` does not open a new scope,
    # so this is still a module-level function.
    def launch_gradio():
        """Start the Gradio app with default launch settings."""
        app.launch(
            
        )

# Only launch the UI when the file is run as a script.
if __name__ == "__main__":
    launch_gradio()