"""Gradio demo for Vevo voice conversion, built on Amphion's VevoInferencePipeline."""

import os
import sys
import mimetypes

import torch
import gradio as gr
from pydub import AudioSegment
from huggingface_hub import snapshot_download

sys.path.append("./Amphion")

import Amphion.models.vc.vevo.vevo_utils as vevo_utils


def load_model():
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    # Content Tokenizer
    local_dir = snapshot_download(
        repo_id="amphion/Vevo",
        repo_type="model",
        cache_dir="./ckpts/Vevo",
        allow_patterns=["tokenizer/vq32/*"],
    )
    content_tokenizer_ckpt_path = os.path.join(
        local_dir, "tokenizer/vq32/hubert_large_l18_c32.pkl"
    )

    # Content-Style Tokenizer
    local_dir = snapshot_download(
        repo_id="amphion/Vevo",
        repo_type="model",
        cache_dir="./ckpts/Vevo",
        allow_patterns=["tokenizer/vq8192/*"],
    )
    content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")

    # Autoregressive Transformer
    local_dir = snapshot_download(
        repo_id="amphion/Vevo",
        repo_type="model",
        cache_dir="./ckpts/Vevo",
        allow_patterns=["contentstyle_modeling/Vq32ToVq8192/*"],
    )
    ar_cfg_path = "./config/Vq32ToVq8192.json"
    ar_ckpt_path = os.path.join(local_dir, "contentstyle_modeling/Vq32ToVq8192")

    # Flow Matching Transformer
    local_dir = snapshot_download(
        repo_id="amphion/Vevo",
        repo_type="model",
        cache_dir="./ckpts/Vevo",
        allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
    )
    fmt_cfg_path = "./config/Vq8192ToMels.json"
    fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")

    # Vocoder
    local_dir = snapshot_download(
        repo_id="amphion/Vevo",
        repo_type="model",
        cache_dir="./ckpts/Vevo",
        allow_patterns=["acoustic_modeling/Vocoder/*"],
    )
    vocoder_cfg_path = "./Amphion/models/vc/vevo/config/Vocoder.json"
    vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")

    pipeline = vevo_utils.VevoInferencePipeline(
        content_tokenizer_ckpt_path=content_tokenizer_ckpt_path,
        content_style_tokenizer_ckpt_path=content_style_tokenizer_ckpt_path,
        ar_cfg_path=ar_cfg_path,
        ar_ckpt_path=ar_ckpt_path,
        fmt_cfg_path=fmt_cfg_path,
        fmt_ckpt_path=fmt_ckpt_path,
        vocoder_cfg_path=vocoder_cfg_path,
        vocoder_ckpt_path=vocoder_ckpt_path,
        device=device,
    )
    return pipeline


def convert_to_wav(audio_path):
    if audio_path is None:
        return None
    mime, _ = mimetypes.guess_type(audio_path)
    if mime in ("audio/wav", "audio/x-wav"):
        return audio_path
    elif mime == "audio/mpeg":
        seg = AudioSegment.from_mp3(audio_path)
        wav_path = audio_path.rsplit(".", 1)[0] + ".wav"
        seg.export(wav_path, format="wav")
        return wav_path
    else:
        raise ValueError(f"Unsupported audio format: {mime}")


def process_audio(mode, content_audio, ref_style_audio, ref_timbre_audio,
                  src_text, ref_text, src_language, ref_language, steps,
                  progress=gr.Progress()):
    try:
        # Convert uploaded audio files to WAV if needed
        content_path = convert_to_wav(content_audio) if content_audio else None
        ref_style_path = convert_to_wav(ref_style_audio) if ref_style_audio else None
        ref_timbre_path = convert_to_wav(ref_timbre_audio) if ref_timbre_audio else None

        # Run inference based on mode
        if mode == "voice":
            if not all([content_path, ref_style_path, ref_timbre_path]):
                raise ValueError("Voice mode requires all audio inputs")
            gen_audio = inference_pipeline.inference_ar_and_fm(
                src_wav_path=content_path,
                src_text=None,
                style_ref_wav_path=ref_style_path,
                timbre_ref_wav_path=ref_timbre_path,
                flow_matching_steps=steps,
            )
        elif mode == "timbre":
            if not all([content_path, ref_timbre_path]):
                raise ValueError("Timbre mode requires source and timbre reference audio")
            gen_audio = inference_pipeline.inference_fm(
                src_wav_path=content_path,
                timbre_ref_wav_path=ref_timbre_path,
                flow_matching_steps=steps,
            )
        elif mode == "tts":
            if not all([ref_style_path, ref_timbre_path, src_text]):
                raise ValueError("TTS mode requires style audio, timbre audio, and source text")
            gen_audio = inference_pipeline.inference_ar_and_fm(
                src_wav_path=None,
                src_text=src_text,
                style_ref_wav_path=ref_style_path,
                timbre_ref_wav_path=ref_timbre_path,
                style_ref_wav_text=ref_text if ref_text else None,
                src_text_language=src_language,
                style_ref_wav_text_language=ref_language,
                flow_matching_steps=steps,
            )
        else:
            # Defensive: mode comes from a fixed Radio, but fail loudly if it ever changes
            raise ValueError(f"Unknown inference mode: {mode}")

        # Save and return the generated audio
        output_path = "output.wav"
        vevo_utils.save_audio(gen_audio, target_sample_rate=48000, output_path=output_path)
        return output_path
    except Exception as e:
        raise gr.Error(str(e))


# Initialize the model
print("Loading model...")
inference_pipeline = load_model()
print("Model loaded successfully!")

# Create the Gradio interface
with gr.Blocks(title="Vevo Voice Conversion") as demo:
    gr.Markdown("# Vevo Voice Conversion")

    with gr.Row():
        mode = gr.Radio(
            choices=["voice", "timbre", "tts"],
            value="timbre",
            label="Inference Mode",
        )

    with gr.Row():
        with gr.Column():
            content_audio = gr.Audio(label="Source Audio", type="filepath")
            ref_style_audio = gr.Audio(label="Reference Style Audio", type="filepath")
            ref_timbre_audio = gr.Audio(label="Reference Timbre Audio", type="filepath")
        with gr.Column():
            src_text = gr.Textbox(
                label="Source Text",
                placeholder="Enter text for TTS mode",
                visible=False,
            )
            ref_text = gr.Textbox(
                label="Reference Style Text",
                placeholder="Optional: Enter reference text",
                visible=False,
            )
            src_language = gr.Dropdown(
                choices=["en", "zh"],
                value="en",
                label="Source Language",
                visible=False,
            )
            ref_language = gr.Dropdown(
                choices=["en", "zh"],
                value="en",
                label="Reference Language",
                visible=False,
            )

    with gr.Row():
        steps = gr.Slider(
            minimum=1,
            maximum=64,
            value=32,
            step=1,
            label="Flow Matching Steps",
        )

    with gr.Row():
        submit_btn = gr.Button("Generate")

    output_audio = gr.Audio(label="Generated Audio")

    # Toggle component visibility based on the selected mode
    def update_visibility(mode):
        is_tts = mode == "tts"
        is_timbre = mode == "timbre"
        # Return visibility updates (not raw booleans) so Gradio shows/hides the components
        return {
            content_audio: gr.update(visible=not is_tts),
            ref_style_audio: gr.update(visible=not is_timbre),
            src_text: gr.update(visible=is_tts),
            ref_text: gr.update(visible=is_tts),
            src_language: gr.update(visible=is_tts),
            ref_language: gr.update(visible=is_tts),
        }

    mode.change(
        fn=update_visibility,
        inputs=[mode],
        outputs=[content_audio, ref_style_audio, src_text, ref_text,
                 src_language, ref_language],
    )

    # Handle generation
    submit_btn.click(
        fn=process_audio,
        inputs=[
            mode,
            content_audio,
            ref_style_audio,
            ref_timbre_audio,
            src_text,
            ref_text,
            src_language,
            ref_language,
            steps,
        ],
        outputs=output_audio,
    )


if __name__ == "__main__":
    demo.launch()