File size: 8,274 Bytes
c02bdcd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
import random

import numpy as np

from tools.audio import float_to_int16


# Streaming-inference data fetcher; supports streaming retrieval of encoded audio bytes
class ChatStreamer:
    """Re-chunk a batched streaming TTS generator into a sentence-ordered stream.

    The upstream generator yields one chunk per inference step. The shape
    arithmetic and the ``axis=1`` concatenations below require every chunk to
    be a 2-D array whose rows are the input texts and whose columns are new
    audio samples. Small chunks are buffered until enough samples are
    available, then the audio is emitted one sentence (row) at a time.
    """

    def __init__(self, base_block_size=8000):
        """Store the per-sentence buffering threshold.

        Args:
            base_block_size: Number of array elements to collect per
                still-active sentence before a block is emitted.
        """
        self.base_block_size = base_block_size

    @staticmethod
    def _update_stream(history_stream_wav, new_stream_wav, thre):
        """Merge a new chunk into the pending buffer and test its size.

        Args:
            history_stream_wav: Previously buffered chunk, or ``None``.
            new_stream_wav: Chunk that just arrived.
            thre: Minimum total element count required to emit the block.

        Returns:
            ``(result_stream, is_keep_next)``; ``is_keep_next`` is true while
            the merged buffer is still smaller than ``thre`` and should be
            kept for the next round instead of being emitted.
        """
        if history_stream_wav is None:
            result_stream = new_stream_wav
            is_keep_next = result_stream.shape[0] * result_stream.shape[1] < thre
        else:
            result_stream = np.concatenate([history_stream_wav, new_stream_wav], axis=1)
            is_keep_next = result_stream.shape[0] * result_stream.shape[1] < thre
            # Sampled diagnostic output that monitors how small chunks are merged.
            if random.random() > 0.1:
                print(
                    "update_stream",
                    is_keep_next,
                    [i.shape if i is not None else None for i in result_stream],
                )

        return result_stream, is_keep_next

    @staticmethod
    def _accum(accum_wavs, stream_wav):
        """Append an emitted block to the per-sentence history of all audio."""
        if accum_wavs is None:
            return stream_wav
        return np.concatenate([accum_wavs, stream_wav], axis=1)

    @staticmethod
    def batch_stream_formatted(stream_wav, output_format="PCM16_byte"):
        """Convert a whole batch block to the sample type of ``output_format``.

        Both PCM16 formats go through ``float_to_int16``; any other format
        returns the block unchanged.
        """
        if output_format in ("PCM16_byte", "PCM16"):
            return float_to_int16(stream_wav)
        return stream_wav

    @staticmethod
    def formatted(data, output_format="PCM16_byte"):
        """Serialize one sub-chunk for the consumer.

        ``"PCM16_byte"`` yields little-endian 16-bit bytes; any other format
        returns ``data`` unchanged.
        """
        if output_format == "PCM16_byte":
            return data.astype("<i2").tobytes()
        return data

    @staticmethod
    def checkvoice(data):
        """Return ``False`` when ``data`` is empty or its peak amplitude is below 1e-6."""
        # An empty array has no maximum; treat it as silence instead of raising.
        if np.size(data) == 0:
            return False
        return not (np.abs(data).max() < 1e-6)

    @staticmethod
    def _subgen(data, thre=12000):
        """Yield consecutive slices of ``data`` that are at most ``thre`` long."""
        for start_idx in range(0, data.shape[0], thre):
            yield data[start_idx : start_idx + thre]

    @staticmethod
    def _emit_sentence(sentence_wav, output_format):
        """Yield the non-silent sub-chunks of one sentence in ``output_format``."""
        for sub_wav in ChatStreamer._subgen(sentence_wav):
            if ChatStreamer.checkvoice(sub_wav):
                yield ChatStreamer.formatted(sub_wav, output_format)

    def generate(self, streamchat, output_format=None):
        """Stream audio sentence by sentence from a batched chunk generator.

        Args:
            streamchat: Iterable of chunks; each chunk is indexed by text
                along axis 0 and by sample along axis 1.
            output_format: ``"PCM16_byte"`` (little-endian bytes), ``"PCM16"``
                (int16 arrays) or ``None`` (chunks passed through unchanged).

        Yields:
            Audio pieces of at most 12000 samples, first for the sentence that
            is currently being played, then for the following sentences.

        Raises:
            AssertionError: On first iteration, if ``output_format`` is not
                one of the supported values.
        """
        assert output_format in ("PCM16_byte", "PCM16", None)
        curr_sentence_index = 0
        # Initialised up front so that an empty or all-silent stream ends
        # cleanly instead of hitting unbound locals after the loop.
        n_texts = 0
        history_stream_wav = None
        article_streamwavs = None
        for stream_wav in streamchat:
            # A chunk without samples carries no audio and would make the
            # per-row max() below raise on a zero-size reduction.
            if np.size(stream_wav) == 0:
                continue
            n_texts = len(stream_wav)
            peaks = np.abs(stream_wav).max(axis=1)
            print(peaks)
            n_valid_texts = (peaks > 1e-6).sum()
            if n_valid_texts == 0:
                continue

            block_thre = n_valid_texts * self.base_block_size
            stream_wav, is_keep_next = ChatStreamer._update_stream(
                history_stream_wav, stream_wav, block_thre
            )
            # Not enough data yet: keep it buffered for the next round.
            if is_keep_next:
                history_stream_wav = stream_wav
                continue

            # Enough data: record the block and emit the current sentence.
            history_stream_wav = None
            stream_wav = ChatStreamer.batch_stream_formatted(stream_wav, output_format)
            article_streamwavs = ChatStreamer._accum(article_streamwavs, stream_wav)
            current_wav = stream_wav[curr_sentence_index]
            if ChatStreamer.checkvoice(current_wav):
                yield from ChatStreamer._emit_sentence(current_wav, output_format)
            # The current sentence is finished: move on and emit everything
            # that has already been inferred for the next sentence.
            elif curr_sentence_index < n_texts - 1:
                curr_sentence_index += 1
                print("add next sentence")
                yield from ChatStreamer._emit_sentence(
                    article_streamwavs[curr_sentence_index], output_format
                )
            # Last sentence is finished; the remainder is handled below.
            else:
                break

        # Flush whatever is still buffered. The buffer itself is used (not the
        # loop variable) because the last chunks may have been skipped as
        # silent, and it is always accumulated so later sentences keep their tail.
        if history_stream_wav is not None:
            pending_wav = ChatStreamer.batch_stream_formatted(
                history_stream_wav, output_format
            )
            article_streamwavs = ChatStreamer._accum(article_streamwavs, pending_wav)
            yield from ChatStreamer._emit_sentence(
                pending_wav[curr_sentence_index], output_format
            )

        # Emit the already-inferred audio of all remaining sentences.
        if article_streamwavs is not None:
            for i_text in range(curr_sentence_index + 1, n_texts):
                yield from ChatStreamer._emit_sentence(
                    article_streamwavs[i_text], output_format
                )

    def play(self, streamchat, wait=5):
        """Play the generated audio on the default output device.

        Audio is buffered until more than ``wait`` seconds have been
        collected (or the stream ends) before playback starts, which helps
        when generation is slower than real time.

        Args:
            streamchat: Chunk generator accepted by :meth:`generate`.
            wait: Seconds of audio to prefill before playback starts.
        """
        import pyaudio  # please install it manually

        # Output stream parameters.
        FORMAT = pyaudio.paInt16  # 16-bit samples
        CHANNELS = 1  # mono
        RATE = 24000  # sample rate in Hz
        BYTES_PER_SAMPLE = 2  # width of one paInt16 sample

        p = pyaudio.PyAudio()
        try:
            print(p.get_device_count())
            # Open the output stream (speaker).
            stream_out = p.open(
                format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                output=True,
            )
            try:
                # The buffer is measured in bytes, so the threshold has to be
                # expressed in bytes as well rather than in samples.
                first_prefill_size = wait * RATE * CHANNELS * BYTES_PER_SAMPLE
                prefill = bytearray()
                meet = False
                for chunk in self.generate(streamchat, output_format="PCM16_byte"):
                    if meet:
                        stream_out.write(chunk)
                        continue
                    prefill += chunk
                    if len(prefill) > first_prefill_size:
                        meet = True
                        stream_out.write(bytes(prefill))
                # The stream ended before the prefill threshold was reached.
                if not meet and prefill:
                    stream_out.write(bytes(prefill))
                stream_out.stop_stream()
            finally:
                stream_out.close()
        finally:
            # Release the audio backend even if playback failed.
            p.terminate()


if __name__ == "__main__":
    import ChatTTS

    # Demo sentences that are synthesized as one batch.
    demo_texts = [
        "总结一下,AI Agent是大模型功能的扩展,让AI更接近于通用人工智能,也就是我们常说的AGI。",
        "你太聪明啦。",
        "举个例子,大模型可能可以写代码,但它不能独立完成一个完整的软件开发项目。这时候,AI Agent就根据大模型的智能,结合记忆和规划,一步步实现从需求分析到产品上线。",
    ]

    # Load ChatTTS.
    tts = ChatTTS.Chat()
    tts.load(compile=False)

    # Decoding settings: sampled speaker, custom temperature, top-P / top-K.
    speaker = tts.sample_random_speaker()
    decode_params = ChatTTS.Chat.InferCodeParams(
        spk_emb=speaker,
        temperature=0.3,
        top_P=0.7,
        top_K=20,
    )

    # Obtain the ChatTTS streaming-inference generator.
    wav_stream = tts.infer(
        demo_texts,
        skip_refine_text=True,
        stream=True,
        params_infer_code=decode_params,
    )

    # Buffer some audio first and start playing once enough is stored;
    # useful for CPU users whose generation is slow.
    ChatStreamer().play(wav_stream, wait=5)