import os

import gradio as gr
from openai import OpenAI
from playdiffusion import PlayDiffusion, InpaintInput, TTSInput, RVCInput

inpainter = PlayDiffusion()

# Lazily constructed OpenAI client, shared across requests.
_whisper_client = None


def get_whisper_client():
    global _whisper_client
    if _whisper_client is None:
        _whisper_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    return _whisper_client


def run_asr(audio):
    """Transcribe the uploaded audio with Whisper and collect word-level timestamps."""
    whisper_client = get_whisper_client()
    with open(audio, "rb") as audio_file:
        transcript = whisper_client.audio.transcriptions.create(
            file=audio_file,
            model="whisper-1",
            response_format="verbose_json",
            timestamp_granularities=["word"],
        )
    word_times = [
        {"word": word.word, "start": word.start, "end": word.end}
        for word in transcript.words
    ]
    # The transcript is returned twice: once for the read-only input box and
    # once to prefill the editable desired-output box.
    return transcript.text, transcript.text, word_times


def run_inpainter(input_text, output_text, word_times, audio, num_steps,
                  init_temp, init_diversity, guidance, rescale, topk,
                  use_manual_ratio, audio_token_syllable_ratio):
    if not use_manual_ratio:
        # None lets PlayDiffusion compute the ratio automatically.
        audio_token_syllable_ratio = None
    return inpainter.inpaint(InpaintInput(
        input_text=input_text,
        output_text=output_text,
        input_word_times=word_times,
        audio=audio,
        num_steps=num_steps,
        init_temp=init_temp,
        init_diversity=init_diversity,
        guidance=guidance,
        rescale=rescale,
        topk=topk,
        audio_token_syllable_ratio=audio_token_syllable_ratio,
    ))


def run_inpainter_tts(input_text, voice_audio, num_steps, init_temp,
                      init_diversity, guidance, rescale, topk,
                      use_manual_ratio, audio_token_syllable_ratio):
    if not use_manual_ratio:
        audio_token_syllable_ratio = None
    return inpainter.tts(TTSInput(
        output_text=input_text,
        voice=voice_audio,
        num_steps=num_steps,
        init_temp=init_temp,
        init_diversity=init_diversity,
        guidance=guidance,
        rescale=rescale,
        topk=topk,
        audio_token_syllable_ratio=audio_token_syllable_ratio,
    ))


def toggle_ratio_input(use_manual):
    return gr.update(visible=use_manual, interactive=use_manual)


def create_advanced_options_accordion():
    """Build the shared 'Advanced options' accordion and return its controls."""
    with gr.Accordion("Advanced options", open=False):
        num_steps_slider = gr.Slider(1, 100, 30, step=1, label="Number of sampling steps (codebook)")
        init_temp_slider = gr.Slider(0.5, 10, 1, step=0.1, label="Initial temperature")
        init_diversity_slider = gr.Slider(0, 10, 1, step=0.1, label="Initial diversity")
        guidance_slider = gr.Slider(0, 10, 0.5, step=0.1, label="Guidance")
        rescale_slider = gr.Slider(0, 1, 0.7, step=0.1, label="Guidance rescale factor")
        topk_slider = gr.Slider(1, 10000, 25, step=1, label="Sample from top-k logits")

        gr.Markdown("#### Audio Token Syllable Ratio")
        gr.Markdown("*Automatic calculation (recommended) provides the best results in most cases.*")
        use_manual_ratio = gr.Checkbox(label="Use manual audio token syllable ratio", value=False)
        audio_token_syllable_ratio = gr.Number(
            label="Audio token syllable ratio (manual)",
            value=12.5,
            precision=2,
            minimum=5.0,
            maximum=25.0,
            visible=False,
            interactive=False,
        )
        use_manual_ratio.change(
            toggle_ratio_input,
            inputs=[use_manual_ratio],
            outputs=[audio_token_syllable_ratio],
        )

    return (num_steps_slider, init_temp_slider, init_diversity_slider,
            guidance_slider, rescale_slider, topk_slider,
            use_manual_ratio, audio_token_syllable_ratio)


def speech_rvc(rvc_source_speech, rvc_target_voice):
    return inpainter.rvc(RVCInput(source_speech=rvc_source_speech, target_voice=rvc_target_voice))


if __name__ == '__main__':
    with gr.Blocks(analytics_enabled=False, title="PlayDiffusion") as demo:
        gr.Markdown("## PlayDiffusion")

        with gr.Tab("Inpaint"):
            gr.Markdown("### Upload an audio file and run ASR to get the text.")
            gr.Markdown("### Then, specify the desired output text.")
            gr.Markdown("### Run the inpainter to generate the modified audio.")
            gr.Markdown("### Note: the model and demo currently target English.")
            inpaint_advanced_options = create_advanced_options_accordion()
            with gr.Row():
                audio_input = gr.Audio(label="Upload audio to be modified", sources=["upload", "microphone"], type="filepath")
            with gr.Row():
                asr_submit = gr.Button("Run ASR")
            with gr.Row():
                with gr.Column():
                    text_input = gr.Textbox(label="Input text from ASR", interactive=False)
                    text_output = gr.Textbox(label="Desired output text")
                with gr.Column():
                    word_times = gr.JSON(label="Word times from ASR")
            with gr.Row():
                inpainter_submit = gr.Button("Run Inpainter")
            with gr.Row():
                audio_output = gr.Audio(label="Output audio")

            asr_submit.click(run_asr, inputs=[audio_input], outputs=[text_input, text_output, word_times])
            inpainter_submit.click(
                run_inpainter,
                inputs=[text_input, text_output, word_times, audio_input] + list(inpaint_advanced_options),
                outputs=[audio_output],
            )

        with gr.Tab("Text to Speech"):
            gr.Markdown("### Text to Speech")
            tts_advanced_options = create_advanced_options_accordion()
            tts_text = gr.Textbox(label="TTS Input", placeholder="Enter text to convert to speech", lines=2)
            tts_voice = gr.Audio(label="Voice to use for TTS", sources=["upload", "microphone"], type="filepath")
            tts_submit = gr.Button("Convert to Speech")
            tts_output = gr.Audio(label="Generated Speech")

            tts_submit.click(
                run_inpainter_tts,
                inputs=[tts_text, tts_voice] + list(tts_advanced_options),
                outputs=[tts_output],
            )

        with gr.Tab("Voice Conversion"):
            gr.Markdown("### Real-Time Voice Conversion (works best for English)")
            rvc_source_speech = gr.Audio(label="Source Speech", sources=["upload", "microphone"], type="filepath")
            rvc_target_voice = gr.Audio(label="Target Voice", sources=["upload", "microphone"], type="filepath")
            rvc_submit = gr.Button("Run Voice Conversion")
            rvc_output = gr.Audio(label="Converted Speech")

            rvc_submit.click(
                speech_rvc,
                inputs=[rvc_source_speech, rvc_target_voice],
                outputs=[rvc_output],
            )

    demo.launch(share=True)
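
# Usage note (a sketch, not part of the original script): assuming this file is
# saved as demo.py (the filename is illustrative), that the playdiffusion
# package and its model weights are installed, and that an OPENAI_API_KEY is
# available for the Whisper ASR step, the demo can be launched with:
#
#     OPENAI_API_KEY=... python demo.py
#
# share=True asks Gradio to create a temporary public link in addition to the
# local server; drop it to serve locally only.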