Spaces:

leeoxiang
/

tts-streaming-latency

Runtime error

File size: 5,646 Bytes

7d2db7e
 
 
4958e9a

import gradio as gr


import os
import time
import azure.cognitiveservices.speech as speechsdk
from pyht import Client
from pyht.client import TTSOptions
import requests

# Default sample sentence for the latency benchmark. It is not referenced by
# any function in the visible part of this file; the same sentence is repeated
# literally in the Gradio examples list.
text = 'Today is Sunday, the weather is sunny. I am here to test the delay of various TTS services thoroughly'


def azure_tts(text):
    """Measure Azure Speech time-to-first-audio for ``text``.

    Credentials are read from the ``SPEECH_KEY`` and ``SPEECH_REGION``
    environment variables. Synthesis is started asynchronously with MP3 output
    routed into a pull stream, and the time until the first read from that
    stream returns is measured.

    Args:
        text: The text to synthesize.

    Returns:
        Seconds from issuing the synthesis request until the first read from
        the pull stream returns, or 0 if that read yields no audio data.

    Exits the process with status 1 if either environment variable is unset.
    """
    speech_key = os.getenv('SPEECH_KEY')
    speech_region = os.getenv('SPEECH_REGION')
    if speech_key is None or speech_region is None:
        print('Please set the environment variables SPEECH_KEY and SPEECH_REGION')
        exit(1)

    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=speech_region)
    speech_config.speech_synthesis_voice_name = 'en-US-JennyNeural'
    speech_config.speech_synthesis_language = "en-US"
    speech_config.set_speech_synthesis_output_format(
        speechsdk.SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3)

    pull_stream = speechsdk.audio.PullAudioOutputStream()
    stream_config = speechsdk.audio.AudioOutputConfig(stream=pull_stream)
    speech_synthesizer = speechsdk.SpeechSynthesizer(
        speech_config=speech_config, audio_config=stream_config)

    # Start the clock BEFORE issuing the request so that dispatching the
    # request is part of the measurement, as it is for the HTTP-based services.
    start = time.perf_counter()
    speech_synthesizer.speak_text_async(text)

    # Block until the first 512 bytes of audio (or end of stream) are available.
    audio_buffer = bytes(512)
    filled_size = pull_stream.read(audio_buffer)
    elapsed = time.perf_counter() - start

    if filled_size == 0:
        # Nothing was written to the stream, so there is no first chunk to time.
        # Report 0, as the Coqui path does for a failed request.
        print('Azure synthesis returned no audio data')
        return 0

    return elapsed





def coqui_tts(text):
    """Measure Coqui XTTS streaming time-to-first-chunk for ``text``.

    The API token is read from the ``COQUI_TOKEN`` environment variable. A
    streaming POST is sent to the XTTS endpoint and the time until the first
    chunk of the response body arrives is measured.

    Args:
        text: The text to synthesize.

    Returns:
        Seconds from sending the request until the first body chunk is
        received, or 0 if the endpoint does not answer with status 201 or the
        body is empty.

    Exits the process with status 1 if ``COQUI_TOKEN`` is unset.
    """
    voice_id = 'c791b5b5-0558-42b8-bb0b-602ac5efc0b9'

    # os.getenv is a function: it must be called, not subscripted.
    coqui_api_token = os.getenv("COQUI_TOKEN")
    if coqui_api_token is None:
        print('Please set the environment variable COQUI_TOKEN')
        exit(1)

    start = time.perf_counter()
    res = requests.post(
        "https://app.coqui.ai/api/v2/samples/xtts/stream",
        json={
            "text": text,
            "language": 'en',
            "voice_id": voice_id,
        },
        headers={"Authorization": f"Bearer {coqui_api_token}"},
        stream=True,
    )

    # Only the first chunk is consumed, so close the streamed response
    # explicitly to release the underlying connection.
    with res:
        if res.status_code != 201:
            print(f"Endpoint failed with status code {res.status_code}:",
                  res.content.decode("utf-8"))
            return 0

        for _chunk in res.iter_content(chunk_size=512):
            # The first chunk is all that is needed for the measurement.
            return time.perf_counter() - start

    # The body was empty: no first chunk to time.
    return 0
        



def elevenlab_tts(text):
    """Measure ElevenLabs streaming time-to-first-chunk for ``text``.

    The API key is read from the ``ELEVENLAB_KEY`` environment variable. A
    streaming POST is sent to the text-to-speech endpoint and the time until
    the first chunk of the response body arrives is measured.

    Args:
        text: The text to synthesize.

    Returns:
        Seconds from sending the request until the first body chunk is
        received, or 0 if the endpoint answers with an error status or the
        body is empty.

    Exits the process with status 1 if ``ELEVENLAB_KEY`` is unset.
    """
    voice_id = '21m00Tcm4TlvDq8ikWAM'
    chunk_size = 512
    url = f'https://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream'

    # os.getenv is a function: it must be called, not subscripted.
    xi_api_key = os.getenv('ELEVENLAB_KEY')
    if xi_api_key is None:
        print('Please set the environment variable ELEVENLAB_KEY')
        exit(1)

    headers = {
        "Accept": "audio/mpeg",
        "Content-Type": "application/json",
        "xi-api-key": xi_api_key,
    }

    data = {
        "text": text,
        "model_id": "eleven_multilingual_v2",
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.5,
        },
    }

    start = time.perf_counter()
    response = requests.post(url, json=data, headers=headers, stream=True)

    # Only the first chunk is consumed, so close the streamed response
    # explicitly to release the underlying connection.
    with response:
        if not response.ok:
            # An error body is not audio; do not report its arrival as latency.
            print(f"Endpoint failed with status code {response.status_code}:",
                  response.content.decode("utf-8"))
            return 0

        for _chunk in response.iter_content(chunk_size=chunk_size):
            # The first chunk is all that is needed for the measurement.
            return time.perf_counter() - start

    # The body was empty: no first chunk to time.
    return 0



def playht_tts(text):
    """Measure PlayHT streaming time-to-first-chunk for ``text``.

    Credentials are read from the ``PLAY_HT_USER_ID`` and ``PLAY_HT_API_KEY``
    environment variables. The timer starts just before ``client.tts`` is
    called and stops when the first chunk is yielded.

    Args:
        text: The text to synthesize.

    Returns:
        Seconds until the first chunk is yielded, or ``None`` if the stream
        yields nothing.

    Exits the process with status 1 if either environment variable is unset.
    """
    user = os.getenv("PLAY_HT_USER_ID")
    key = os.getenv("PLAY_HT_API_KEY")

    if None in (user, key):
        print('Please set the environment variables PLAY_HT_USER_ID and PLAY_HT_API_KEY')
        exit(1)

    client = Client(user_id=user, api_key=key)
    options = TTSOptions(
        voice="s3://voice-cloning-zero-shot/d9ff78ba-d016-47f6-b0ef-dd630f59414e/female-cs/manifest.json",
        speed=5.0,
    )

    began = time.perf_counter()
    for _chunk in client.tts(text, options):
        # Only the arrival time of the first chunk matters.
        return time.perf_counter() - began

    return None
        


# HTML heading rendered at the top of the page via gr.HTML(title).
# NOTE(review): the heading text reads as "TRTC documentation bot", which looks
# unrelated to a TTS latency benchmark — possibly left over from another app;
# confirm the intended title.
title = """<h1 align="center">🔥TRTC 文档机器人🚀</h1>"""

def greet(input):
    """Run every TTS benchmark on ``input`` and report the latencies.

    The services are queried one after another (Azure, Coqui, ElevenLabs,
    PlayHT). The resulting report is printed and also returned.

    Args:
        input: The text to synthesize with each service.

    Returns:
        A multi-line string with the time to first chunk for each service.
    """
    azure_seconds = azure_tts(input)
    coqui_seconds = coqui_tts(input)
    elevenlab_seconds = elevenlab_tts(input)
    playht_seconds = playht_tts(input)

    # Build the report once and reuse it for both the log line and the result.
    report = (
        f'Elevenlab TTS Delay, Time to first chunk {elevenlab_seconds}s \n'
        f' Azure TTS Delay, Time to first chunk {azure_seconds}s \n'
        f' Coqui TTS Delay, Time to first chunk {coqui_seconds}s \n'
        f' Pyht TTS Delay, Time to first chunk {playht_seconds}s'
    )
    print(report)
    return report


# Gradio UI: a heading, one row holding the input box, the output box and a
# submit button, plus a clickable example sentence. Pressing Enter in the input
# box or clicking the button both run `greet` and show its report.
with gr.Blocks(theme=gr.themes.Default(spacing_size=gr.themes.sizes.spacing_sm, radius_size=gr.themes.sizes.radius_sm, text_size=gr.themes.sizes.text_sm)) as demo:

    gr.HTML(title)

    with gr.Row():
        txt = gr.Textbox(show_label=False, lines=1,
                         placeholder='input the text to run ')
        outtxt = gr.Textbox(show_label=False, lines=4,
                            placeholder='the output text')

        txt.submit(greet, [txt], [outtxt])
        # `scale=0` is the constructor replacement for the deprecated
        # `.style(full_width=False)` call, which newer Gradio releases no
        # longer provide; it keeps the button from stretching across the row.
        submit = gr.Button(value="Submit", variant="secondary", scale=0)
        submit.click(greet, [txt], [outtxt])

    gr.Examples(
        label="for example",
        examples=[
            "Today is Sunday, the weather is sunny. I am here to test the delay of various TTS services thoroughly",
        ],
        inputs=txt,
    )

demo.launch()