"""Command-line client for a CosyVoice text-to-speech HTTP API.

The script uploads the text to synthesize together with a reference voice
sample (prompt text + prompt audio), polls the API until the task leaves the
``PENDING`` state, and writes the returned base64-encoded audio to a file.

Configuration is read from the environment variables ``TTS_CLIENT_ID``,
``TTS_CLIENT_SECRET`` and ``TTS_API_URL``; importing this module raises
``ValueError`` when any of them is missing or empty.
"""
import argparse
import asyncio
import base64
import io
import os
from os import path
from typing import Dict, Literal, Optional, TypedDict

import requests

# Environment variables
TTS_CLIENT_ID = os.environ.get('TTS_CLIENT_ID')
TTS_CLIENT_SECRET = os.environ.get('TTS_CLIENT_SECRET')
TTS_API_URL = os.environ.get('TTS_API_URL')

if not TTS_CLIENT_ID or not TTS_CLIENT_SECRET or not TTS_API_URL:
    raise ValueError('Missing environment variables')

# Upper bound, in seconds, that a single HTTP request may wait on the server.
# Without a timeout `requests` waits indefinitely on an unresponsive server.
REQUEST_TIMEOUT = 60


class TaskResult(TypedDict):
    """JSON body returned by the task-status endpoint."""

    task_id: str
    message: str
    status: Literal['PENDING', 'SUCCESS', 'FAILED']
    # Base64-encoded wav audio; main() also accepts a "<header>,<base64>"
    # form and strips everything up to the comma.
    audio_url: str


class Voice(TypedDict):
    """Reference voice sent along with every synthesis request."""

    name: str  # Display name printed by main()
    promptText: str  # Sent as the `prompt_text` form field
    promptAudio: str  # Path of the audio file uploaded as the `audio` field


voices: Dict[str, Voice] = {
    "mk_girl": {
        "name": "👧 凱婷",
        "promptText": "我決定咗啦,我要做一件到目前為止又或者永遠都唔會再見到我做嘅事。",
        "promptAudio": path.join(path.dirname(__file__), "./voices/mk_girl.wav")
    },
    "doraemon": {
        "name": "🥸 全叔",
        "promptText": "各位觀眾大家好,我叮噹呢又同你哋見面啦。好多謝咁多年嚟各位嘅捧場同支持。",
        "promptAudio": path.join(path.dirname(__file__), "./voices/doraemon3.wav")
    },
    "周星馳": {
        "name": "😈 星爺",
        "promptText": "大家好啊,想唔想同我做好朋友啊。",
        "promptAudio": path.join(path.dirname(__file__), "./voices/sing.mp3")
    }
}


async def tts(input_text: str, voice: Voice) -> str:
    """
    Send TTS request with voice information

    NOTE: `requests` is synchronous, so this coroutine blocks the event loop
    for the duration of the HTTP call.

    Args:
        input_text: Text to be converted to speech
        voice: Voice configuration

    Returns:
        task_id: ID of the TTS task

    Raises:
        OSError: If the prompt audio file cannot be opened.
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status (via ``raise_for_status``).
        KeyError: If the response JSON has no ``task_id`` field.
    """
    headers = {
        'CF-Access-Client-Id': TTS_CLIENT_ID,
        'CF-Access-Client-Secret': TTS_CLIENT_SECRET
    }

    # The context manager guarantees the prompt audio handle is closed even
    # when the upload fails; the request must complete inside the block
    # because `requests` reads the file while sending.
    with open(voice['promptAudio'], 'rb') as prompt_audio:
        files = {
            'input_text': (None, input_text),
            'prompt_text': (None, voice['promptText']),
            # NOTE(review): the upload is always named 'prompt.wav', even for
            # the .mp3 voice sample -- confirm the server ignores the name.
            'audio': ('prompt.wav', prompt_audio),
            'speed': (None, '1.0')
        }
        response = requests.post(
            f"{TTS_API_URL}/api/tts",
            files=files,
            headers=headers,
            timeout=REQUEST_TIMEOUT,
        )

    response.raise_for_status()
    return response.json()['task_id']


async def get_task_result(task_id: str) -> TaskResult:
    """
    Get result of TTS task

    NOTE: `requests` is synchronous, so this coroutine blocks the event loop
    for the duration of the HTTP call.

    Args:
        task_id: ID of the TTS task

    Returns:
        Task result information

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status (via ``raise_for_status``).
    """
    headers = {
        'Content-Type': 'application/json',
        'CF-Access-Client-Id': TTS_CLIENT_ID,
        'CF-Access-Client-Secret': TTS_CLIENT_SECRET
    }

    response = requests.get(
        f"{TTS_API_URL}/api/tts/{task_id}",
        headers=headers,
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    return response.json()


def _decode_audio(audio_url: str) -> bytes:
    """Return the raw audio bytes encoded in a task's ``audio_url`` field.

    Accepts either a bare base64 string or a ``<header>,<base64>`` value, in
    which case everything up to and including the first comma is dropped.
    """
    _, separator, payload = audio_url.partition(',')
    return base64.b64decode(payload if separator else audio_url)


async def main() -> None:
    """Parse CLI arguments, run one synthesis task and save the audio.

    Any exception raised while talking to the API or writing the output file
    is caught and reported on stdout as ``Error: ...``.
    """
    parser = argparse.ArgumentParser(description='Text-to-Speech with CosyVoice')
    # Required: without it `None` would be submitted as the text to synthesize.
    parser.add_argument('--text', required=True, help='Text to convert to speech')
    parser.add_argument('--voice', '-v', choices=list(voices.keys()), default='mk_girl',
                        help='Voice to use for synthesis')
    parser.add_argument('--output', '-o', default='output.wav',
                        help='Output audio file path')

    args = parser.parse_args()

    voice = voices[args.voice]
    print(f"Converting text to speech using voice: {voice['name']}")
    print(f"Text: {args.text}")

    try:
        task_id = await tts(args.text, voice)
        print(f"TTS request submitted. Task ID: {task_id}")

        # Poll once per second until the task leaves the PENDING state.
        # NOTE(review): there is no overall deadline; a task that stays
        # PENDING forever keeps this loop running.
        result = await get_task_result(task_id)
        while result['status'] == 'PENDING':
            print("Waiting for TTS processing...")
            await asyncio.sleep(1)
            result = await get_task_result(task_id)

        if result['status'] == 'SUCCESS':
            # Decode before opening the output so that a malformed payload
            # does not create or truncate the destination file.
            audio_bytes = _decode_audio(result['audio_url'])
            with open(args.output, 'wb') as f:
                f.write(audio_bytes)
            print(f"Audio saved to {args.output}")
        else:
            print(f"TTS generation failed: {result['message']}")

    except Exception as e:
        # Top-level boundary of the CLI: report the failure instead of
        # dumping a traceback.
        print(f"Error: {str(e)}")


if __name__ == "__main__":
    asyncio.run(main())