File size: 4,351 Bytes
c005bf8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import requests
import os
import io
from os import path
from typing import Dict, Literal, TypedDict, Optional
import argparse
import asyncio
import base64

# Service settings read from the process environment. The client id/secret
# are sent as CF-Access-Client-Id / CF-Access-Client-Secret request headers
# and the URL is the base that the /api/tts endpoints are appended to.
TTS_CLIENT_ID = os.getenv('TTS_CLIENT_ID')
TTS_CLIENT_SECRET = os.getenv('TTS_CLIENT_SECRET')
TTS_API_URL = os.getenv('TTS_API_URL')


# Fail fast at import time when any setting is unset or empty.
if not all((TTS_CLIENT_ID, TTS_CLIENT_SECRET, TTS_API_URL)):
    raise ValueError('Missing environment variables')

class TaskResult(TypedDict):
    """
    Shape of the JSON body returned by the TTS task-status endpoint

    This is the declared return type of ``get_task_result``; it is a typing
    aid only and the fields are not validated at runtime.
    """
    # Identifier of the TTS task.
    task_id: str
    # Message from the service; printed by ``main`` when the task did not succeed.
    message: str
    # Task state; ``main`` keeps polling for as long as this is 'PENDING'.
    status: Literal['PENDING', 'SUCCESS', 'FAILED']
    # Despite the name this carries the audio itself. ``main`` drops a leading
    # comma-terminated prefix when present (presumably a data-URL header --
    # confirm against the service) and base64-decodes the rest.
    audio_url: str  # base64 encoded wav audio

class Voice(TypedDict):
    """
    A selectable voice preset used when submitting a TTS request

    Typing aid only; the fields are not validated at runtime.
    """
    # Display label, printed by ``main`` when announcing the chosen voice.
    name: str
    # Text sent as the ``prompt_text`` form field by ``tts``; presumably the
    # transcript of the prompt audio -- confirm against the service.
    promptText: str
    # Filesystem path of the audio file that ``tts`` opens and uploads as the
    # ``audio`` form field.
    promptAudio: str

# Directory that contains this module; prompt audio paths are resolved against it
# so the presets work regardless of the current working directory.
_MODULE_DIR = path.dirname(__file__)

# Built-in voice presets, keyed by the identifier accepted by the --voice option.
voices: Dict[str, Voice] = {
    "mk_girl": {
        "name": "👧 凱婷",
        "promptText": "我決定咗啦,我要做一件到目前為止又或者永遠都唔會再見到我做嘅事。",
        "promptAudio": path.join(_MODULE_DIR, "./voices/mk_girl.wav"),
    },
    "doraemon": {
        "name": "🥸 全叔",
        "promptText": "各位觀眾大家好,我叮噹呢又同你哋見面啦。好多謝咁多年嚟各位嘅捧場同支持。",
        "promptAudio": path.join(_MODULE_DIR, "./voices/doraemon3.wav"),
    },
    "周星馳": {
        "name": "😈 星爺",
        "promptText": "大家好啊,想唔想同我做好朋友啊。",
        "promptAudio": path.join(_MODULE_DIR, "./voices/sing.mp3"),
    },
}

async def tts(input_text: str, voice: Voice, timeout: float = 60.0) -> str:
    """
    Send TTS request with voice information

    The prompt audio is read into memory and posted as multipart form data
    to ``{TTS_API_URL}/api/tts``. The blocking HTTP call runs in the event
    loop's default executor so other coroutines keep running meanwhile.

    Args:
        input_text: Text to be converted to speech
        voice: Voice configuration
        timeout: Seconds to wait for the server before giving up

    Returns:
        task_id: ID of the TTS task

    Raises:
        OSError: The prompt audio file could not be read
        requests.RequestException: The request failed, timed out, or the
            server answered with an error status
    """
    # Read the prompt up front inside a context manager so the file handle is
    # always closed, and so no open file is shared with the worker thread.
    with open(voice['promptAudio'], 'rb') as prompt_file:
        prompt_audio = prompt_file.read()

    # A (None, value) tuple makes requests send a plain form field rather
    # than a file part.
    # NOTE(review): the upload is always named 'prompt.wav', even for presets
    # whose prompt is not a wav file -- confirm the service ignores the name.
    files = {
        'input_text': (None, input_text),
        'prompt_text': (None, voice['promptText']),
        'audio': ('prompt.wav', prompt_audio),
        'speed': (None, '1.0')
    }

    headers = {
        'CF-Access-Client-Id': TTS_CLIENT_ID,
        'CF-Access-Client-Secret': TTS_CLIENT_SECRET
    }

    # requests is synchronous; calling it directly here would stall the
    # event loop for the whole round trip.
    loop = asyncio.get_running_loop()
    response = await loop.run_in_executor(
        None,
        lambda: requests.post(f"{TTS_API_URL}/api/tts",
                              files=files,
                              headers=headers,
                              timeout=timeout),
    )

    response.raise_for_status()
    return response.json()['task_id']

async def get_task_result(task_id: str, timeout: float = 30.0) -> TaskResult:
    """
    Get result of TTS task

    Issues a GET to ``{TTS_API_URL}/api/tts/{task_id}`` and returns the
    decoded JSON body. The blocking HTTP call runs in the event loop's
    default executor so other coroutines keep running meanwhile.

    Args:
        task_id: ID of the TTS task
        timeout: Seconds to wait for the server before giving up

    Returns:
        Task result information

    Raises:
        requests.RequestException: The request failed, timed out, or the
            server answered with an error status
    """
    headers = {
        'Content-Type': 'application/json',
        'CF-Access-Client-Id': TTS_CLIENT_ID,
        'CF-Access-Client-Secret': TTS_CLIENT_SECRET
    }

    # requests is synchronous; calling it directly here would stall the
    # event loop, and without a timeout a dead connection would hang forever.
    loop = asyncio.get_running_loop()
    response = await loop.run_in_executor(
        None,
        lambda: requests.get(f"{TTS_API_URL}/api/tts/{task_id}",
                             headers=headers,
                             timeout=timeout),
    )

    response.raise_for_status()
    return response.json()


async def main():
    """
    Command-line entry point: synthesize speech and save it to a file

    Parses the command line, submits the TTS request, polls the task once
    per second until it leaves the ``PENDING`` state (or the ``--timeout``
    budget is used up), then decodes the base64 audio and writes it to the
    output path.

    Returns:
        Process exit status: 0 when the audio was saved, 1 otherwise
    """
    parser = argparse.ArgumentParser(description='Text-to-Speech with CosyVoice')
    # Required: without it args.text would be None and be passed on to tts().
    parser.add_argument('--text', required=True, help='Text to convert to speech')
    parser.add_argument('--voice', '-v', choices=list(voices.keys()), default='mk_girl',
                        help='Voice to use for synthesis')
    parser.add_argument('--output', '-o', default='output.wav',
                        help='Output audio file path')
    parser.add_argument('--timeout', type=float, default=300.0,
                        help='Maximum seconds to wait for the TTS task '
                             '(0 or negative waits indefinitely)')

    args = parser.parse_args()
    voice = voices[args.voice]

    print(f"Converting text to speech using voice: {voice['name']}")
    print(f"Text: {args.text}")

    try:
        task_id = await tts(args.text, voice)
        print(f"TTS request submitted. Task ID: {task_id}")

        # Bound the polling loop with the event loop's monotonic clock so a
        # task stuck in PENDING cannot keep the program running forever.
        loop = asyncio.get_running_loop()
        deadline = loop.time() + args.timeout if args.timeout > 0 else None

        while True:
            result = await get_task_result(task_id)
            if result['status'] != 'PENDING':
                break
            if deadline is not None and loop.time() >= deadline:
                print(f"Error: TTS task {task_id} still pending after {args.timeout:g} seconds")
                return 1
            print("Waiting for TTS processing...")
            await asyncio.sleep(1)

        if result['status'] != 'SUCCESS':
            print(f"TTS generation failed: {result['message']}")
            return 1

        audio_data = result['audio_url']
        if ',' in audio_data:
            # Keep only what follows the first comma (drops a leading prefix
            # such as a data-URL header).
            audio_data = audio_data.split(',', 1)[1]

        # Decode before opening the output so that malformed base64 does not
        # leave a truncated, empty file behind.
        audio_bytes = base64.b64decode(audio_data)
        with open(args.output, 'wb') as f:
            f.write(audio_bytes)
        print(f"Audio saved to {args.output}")
        return 0
    except Exception as e:
        # Top-level boundary: report the problem and signal failure through
        # the exit status instead of a traceback.
        print(f"Error: {str(e)}")
        return 1

if __name__ == "__main__":
    # Propagate main()'s status so shells and scripts can detect failures.
    raise SystemExit(asyncio.run(main()))