import io

import gradio as gr
import numpy as np
from openai import OpenAI
from pydub import AudioSegment

# Reads OPENAI_API_KEY from the environment; pass api_key=... explicitly if needed.
client = OpenAI()


def stream_and_yield_audio(text, model, voice):
    response = client.audio.speech.create(
        model=model,   # e.g. "tts-1"
        voice=voice,   # e.g. "alloy"
        input=text,
    )

    # Wrap the binary response content in a byte stream
    byte_stream = io.BytesIO(response.content)

    # Decode the MP3 audio from the byte stream
    audio = AudioSegment.from_file(byte_stream, format="mp3")

    # Convert to raw samples for Gradio's streaming Audio component
    sample_rate = audio.frame_rate
    audio_data = np.array(audio.get_array_of_samples(), dtype=np.int16)

    # Yield the audio as a (sample_rate, samples) tuple
    yield sample_rate, audio_data


# Demo using an older Gradio version (3.50.2)
with gr.Blocks() as demo:
    with gr.Row():
        model = gr.Dropdown(choices=['tts-1', 'tts-1-hd'], label='Model', value='tts-1')
        voice = gr.Dropdown(choices=['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'],
                            label='Voice Options', value='alloy')
    text = gr.Textbox(label="Input text")
    btn = gr.Button("Generate speech")
    output_audio = gr.Audio(label="Speech Output", streaming=True, autoplay=True)

    btn.click(fn=stream_and_yield_audio, inputs=[text, model, voice],
              outputs=output_audio, api_name="tts-stream")

demo.queue().launch()
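
Note that the handler above downloads the whole MP3 before yielding, so playback only starts once synthesis has finished. Below is a minimal sketch of chunk-by-chunk streaming as an alternative, assuming the OpenAI Python SDK v1.x streaming interface (with_streaming_response / iter_bytes) and requesting raw PCM output (24 kHz, 16-bit mono) so each chunk can be turned into samples without an MP3 decoder; the function name stream_and_yield_audio_chunks is just an illustrative placeholder. To try it, pass this function to btn.click instead of stream_and_yield_audio.

import numpy as np
from openai import OpenAI

client = OpenAI()


def stream_and_yield_audio_chunks(text, model, voice):
    # OpenAI's PCM output is 24 kHz, 16-bit signed little-endian, mono.
    sample_rate = 24000
    leftover = b""
    with client.audio.speech.with_streaming_response.create(
        model=model,
        voice=voice,
        input=text,
        response_format="pcm",
    ) as response:
        for chunk in response.iter_bytes(chunk_size=4096):
            data = leftover + chunk
            # Keep any trailing odd byte so int16 frames stay aligned across chunks.
            usable = len(data) - (len(data) % 2)
            leftover, data = data[usable:], data[:usable]
            if data:
                yield sample_rate, np.frombuffer(data, dtype=np.int16)

Whether this actually shortens time-to-first-sound depends on how the Gradio version in use buffers streamed audio, so it is worth profiling before adopting it.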