"""Gradio app with two tabs: XTTS voice cloning and Wav2Lip lip-sync.

The voice-cloning tab synthesizes speech with a Coqui TTS model and then runs
FreeVC voice conversion against an uploaded speaker sample.  The lip-sync tab
shells out to ``inference.py`` in the current working directory.
"""

import os
import subprocess
import sys
import uuid

import gradio as gr
from pydub import AudioSegment
from TTS.api import TTS

# Modules above are shared by both the voice-cloning and the lip-sync features.

# Global settings.
# Languages handled directly by the XTTS v2 model (label -> language code).
language_options = {
    "English (en)": "en",
    "Spanish (es)": "es",
    "French (fr)": "fr",
    "German (de)": "de",
    "Italian (it)": "it",
    "Portuguese (pt)": "pt",
    "Polish (pl)": "pl",
    "Turkish (tr)": "tr",
    "Russian (ru)": "ru",
    "Dutch (nl)": "nl",
    "Czech (cs)": "cs",
    "Arabic (ar)": "ar",
    "Chinese (zh-cn)": "zh-cn",
    "Japanese (ja)": "ja",
    "Hungarian (hu)": "hu",
    "Korean (ko)": "ko",
    "Hindi (hi)": "hi",
}

# Languages routed through a per-language fairseq VITS model (label -> code
# used in the model name).
other_language = {
    "Vietnamese": "vie",
    "Serbian": "srp",
    "Romanian": "ron",
    "Indonesian": "ind",
    "Philippine": "tgl",
}

# Loaded once at import time and reused by synthesize_speech().
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")


# --- Voice-cloning functions -------------------------------------------------

def clean_audio(audio_path):
    """Band-limit the speaker sample and trim silence from both ends via ffmpeg.

    Args:
        audio_path: Path to the uploaded speaker audio file.

    Returns:
        Path of the cleaned WAV file inside ``output/``, or ``audio_path``
        unchanged when ffmpeg fails or cannot be started (best-effort step).
    """
    # ffmpeg does not create missing directories; without this the very first
    # run fails and the cleaning step is silently skipped.
    os.makedirs("output", exist_ok=True)
    out_filename = f"output/cleaned_{uuid.uuid4()}.wav"
    lowpass_highpass = "lowpass=8000,highpass=75,"
    trim_silence = (
        "areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,"
        "areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,"
    )
    # Build the argv list directly instead of splitting a formatted string so
    # that paths containing whitespace stay a single argument.
    shell_command = [
        "ffmpeg",
        "-y",
        "-i",
        audio_path,
        "-af",
        f"{lowpass_highpass}{trim_silence}",
        out_filename,
    ]
    try:
        subprocess.run(shell_command, capture_output=True, check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing/non-executable ffmpeg binary.
        print(f"Error during audio cleaning: {e}")
        return audio_path
    print(f"Audio cleaned and saved to {out_filename}")
    return out_filename


def check_audio_length(audio_path, max_duration=120):
    """Return True when the audio can be read and is at most ``max_duration`` seconds.

    Any error while loading the file is logged and reported as False.
    """
    try:
        audio = AudioSegment.from_file(audio_path)
        duration = audio.duration_seconds
        if duration > max_duration:
            print(
                f"Audio is too long: {duration} seconds. \n"
                f"Max allowed is {max_duration} seconds."
            )
            return False
        return True
    except Exception as e:
        print(f"Error while checking audio length: {e}")
        return False


def _convert_voice(source_wav, target_wav_path):
    """Run FreeVC voice conversion and write the result to ``output/docout.wav``.

    Args:
        source_wav: Source utterance, passed through unchanged to
            ``voice_conversion_to_file`` (callers pass a file path or the
            waveform returned by ``TTS.tts``).
        target_wav_path: Path to the speaker sample whose voice is the target.

    Returns:
        Whatever ``voice_conversion_to_file`` returns for the output file.
    """
    tts_conversion = TTS(
        model_name="voice_conversion_models/multilingual/vctk/freevc24",
        progress_bar=False,
    )
    output_file = "output/docout.wav"
    os.makedirs("output", exist_ok=True)
    return tts_conversion.voice_conversion_to_file(
        source_wav, target_wav=target_wav_path, file_path=output_file
    )


def synthesize_and_convert_voice(text, language_iso, voice_audio_path, speed):
    """Synthesize ``text`` with a fairseq VITS model, then convert to the target voice.

    Used for the languages in ``other_language``.
    """
    tts_synthesis = TTS(model_name=f"tts_models/{language_iso}/fairseq/vits")
    wav_data = tts_synthesis.tts(text, speed=speed)
    return _convert_voice(wav_data, voice_audio_path)


def synthesize_speech(text, speaker_wav_path, language_iso, speed):
    """Synthesize ``text`` with XTTS using the speaker sample, then apply voice conversion.

    Used for the languages in ``language_options``.
    """
    output_file_xtts = "output/undocout.wav"
    # The directory must exist before XTTS writes its intermediate file.
    os.makedirs("output", exist_ok=True)
    tts.tts_to_file(
        text=text,
        file_path=output_file_xtts,
        speed=speed,
        speaker_wav=speaker_wav_path,
        language=language_iso,
    )
    return _convert_voice(output_file_xtts, speaker_wav_path)


def get_language_code(selected_language):
    """Map a dropdown label to its language code, or None if the label is unknown."""
    if selected_language in language_options:
        return language_options[selected_language]
    return other_language.get(selected_language)


def process_speech(text, speaker_wav, selected_language, speed):
    """Gradio callback: validate inputs, clean the sample and synthesize speech.

    Args:
        text: Text to synthesize.
        speaker_wav: Path to the uploaded speaker sample.
        selected_language: Label chosen in the language dropdown.
        speed: Synthesis speed factor.

    Returns:
        The result of the selected synthesis function (path of the output audio).

    Raises:
        ValueError: If the selected language is not in either language table.
        gr.Error: If no speaker sample was uploaded, or the length check fails.
    """
    language_code = get_language_code(selected_language)
    if language_code is None:
        raise ValueError("Выбранный язык не поддерживается.")

    # Without this guard a missing upload falls into check_audio_length()'s
    # catch-all and is reported as "audio too long".
    if not speaker_wav:
        raise gr.Error("Пожалуйста, загрузите аудио файл говорящего.", duration=5)

    # Check the audio length.
    if not check_audio_length(speaker_wav):
        error_message = "Длина аудио превышает допустимый лимит в 2 минуты."
        raise gr.Error(error_message, duration=5)

    cleaned_wav_path = clean_audio(speaker_wav)

    if selected_language in other_language:
        return synthesize_and_convert_voice(text, language_code, cleaned_wav_path, speed)
    return synthesize_speech(text, cleaned_wav_path, language_code, speed)


def restart_program():
    """Replace the current process with a fresh run of the same command line."""
    python = sys.executable
    os.execl(python, python, *sys.argv)


# --- Lip-sync functions ------------------------------------------------------

def generate(video, audio, checkpoint, no_smooth, resize_factor,
             pad_top, pad_bottom, pad_left, pad_right, save_as_video):
    """Gradio callback: run ``inference.py`` and return the output video path.

    Args:
        video: Path to the input video or image.
        audio: Path to the input audio.
        checkpoint: Checkpoint name; resolved to ``checkpoints/<name>.pth``.
        no_smooth: When true, ``--nosmooth`` is passed.
        resize_factor: Value for ``--resize_factor``.
        pad_top, pad_bottom, pad_left, pad_right: Values for ``--pads``.
        save_as_video: When true, ``--save_as_video`` is passed.

    Returns:
        Path to ``outputs/output.mp4``.

    Raises:
        gr.Error: If an input is missing, the subprocess fails, or no output
            file was produced.  The result is bound to a ``gr.Video``
            component, so an error must be raised rather than returned as
            text (which would be treated as a video path).
    """
    if video is None or audio is None or checkpoint is None:
        raise gr.Error(
            "Пожалуйста, загрузите видео/изображение и аудио файл, а также выберите чекпойнт."
        )

    print(f"Текущая рабочая директория: {os.getcwd()}")
    print(f"Содержимое текущей директории: {os.listdir('.')}")
    print(f"Проверка наличия 'inference.py': {os.path.exists('inference.py')}")

    video_path = video  # path to the video or image
    audio_path = audio  # path to the audio
    print(f"Путь к видео: {video_path}")
    print(f"Путь к аудио: {audio_path}")

    output_dir = "outputs"
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "output.mp4")
    print(f"Путь к выходному файлу: {output_path}")

    args = [
        "--checkpoint_path", f"checkpoints/{checkpoint}.pth",
        "--segmentation_path", "checkpoints/face_segmentation.pth",
        "--no_seg",
        "--no_sr",
        "--face", video_path,
        "--audio", audio_path,
        "--outfile", output_path,
        "--resize_factor", str(resize_factor),
        "--face_det_batch_size", "4",
        "--wav2lip_batch_size", "64",
        "--fps", "30",
        "--pads", str(pad_top), str(pad_bottom), str(pad_left), str(pad_right),
    ]
    if no_smooth:
        args.append("--nosmooth")
    if save_as_video:
        args.append("--save_as_video")

    # Use the interpreter running this app so the child sees the same
    # environment; a bare "python" may resolve to a different installation.
    cmd = [sys.executable, "inference.py"] + args
    try:
        print(f"Запуск инференса с командой: {' '.join(cmd)}")
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Ошибка при выполнении команды: {e}")
        raise gr.Error(f"Произошла ошибка при обработке: {e}") from e

    if not os.path.exists(output_path):
        print("Выходной файл \nне существует.")
        raise gr.Error("Не удалось создать выходное видео.")

    print(f"Выходной файл создан по пути: {output_path}")
    return output_path  # path to the generated video


# --- Gradio interface with tabs ----------------------------------------------
# NOTE(review): the container nesting below was reconstructed from statement
# order; confirm the Row/Column grouping against the intended layout.

with gr.Blocks() as app:
    gr.Markdown("# Voice Clone Union")

    with gr.Tabs():
        with gr.TabItem("Voice Clone"):
            # Voice-cloning UI.
            text_input = gr.Textbox(
                label="Введите текст для генерации",
                placeholder="Введите ваш текст здесь...",
            )
            speaker_wav_input = gr.Audio(
                label="Загрузите аудио файла говорящего (WAV формат)",
                type="filepath",
            )

            all_languages = list(language_options.keys()) + list(other_language.keys())
            language_input = gr.Dropdown(
                label="Язык",
                choices=all_languages,
                value="English (en)",
            )

            speed_input = gr.Slider(
                label="Скорость синтеза",
                minimum=0.1,
                maximum=10,
                step=0.1,
                value=1.0,
                info="Выберите скорость",
            )

            output_audio = gr.Audio(label="Сгенерированное аудио", type="filepath")

            with gr.Row():
                synthesize_button = gr.Button("Сгенерировать")
                gr.HTML("\n")
                reload_button = gr.Button("Перезапустить")

            synthesize_button.click(
                fn=process_speech,
                inputs=[text_input, speaker_wav_input, language_input, speed_input],
                outputs=output_audio,
            )
            reload_button.click(fn=restart_program, inputs=None, outputs=None)

        with gr.TabItem("Lipsync"):
            # Lip-sync UI.
            gr.Markdown("## Lipsync")
            with gr.Row():
                video = gr.File(label="Видео или Изображение", type="filepath")
                audio = gr.File(label="Аудио", type="filepath")
                with gr.Column():
                    checkpoint = gr.Radio(
                        ["wav2lip", "wav2lip_gan"],
                        label="Чекпойнт",
                        value="wav2lip_gan",
                        visible=False,
                    )
                    no_smooth = gr.Checkbox(label="Без сглаживания", value=False)
                    resize_factor = gr.Slider(
                        minimum=1, maximum=4, step=1,
                        label="Фактор изменения размера", value=2,
                    )
            with gr.Row():
                with gr.Column():
                    pad_top = gr.Slider(minimum=0, maximum=50, step=1, value=0, label="Отступ сверху")
                    pad_bottom = gr.Slider(minimum=0, maximum=50, step=1, value=10, label="Отступ снизу")
                    pad_left = gr.Slider(minimum=0, maximum=50, step=1, value=0, label="Отступ слева")
                    pad_right = gr.Slider(minimum=0, maximum=50, step=1, value=0, label="Отступ справа")
                    save_as_video = gr.Checkbox(label="Сохранять как видео", value=True)
                    generate_btn = gr.Button("Сгенерировать")
                with gr.Column():
                    result = gr.Video(label="Результат")

            generate_btn.click(
                generate,
                inputs=[
                    video, audio, checkpoint, no_smooth, resize_factor,
                    pad_top, pad_bottom, pad_left, pad_right, save_as_video,
                ],
                outputs=result,
                # concurrency_limit=30
            )


def launch_gradio():
    """Start the Gradio server on port 8600, listening on all interfaces.

    Sharing is enabled when the literal argument ``True`` is present on the
    command line; ``--open`` opens the app in a browser.
    """
    app.launch(
        share="True" in sys.argv,
        inbrowser="--open" in sys.argv,
        server_port=8600,
        server_name="0.0.0.0",
    )


if __name__ == "__main__":
    launch_gradio()