File size: 8,274 Bytes
c02bdcd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 |
import random
import numpy as np
from tools.audio import float_to_int16
# Streaming-inference data fetcher; supports streaming retrieval of encoded audio bytes.
class ChatStreamer:
    """Re-chunk a streaming TTS inference generator into playable audio pieces.

    The input stream is expected to yield 2-D arrays with one row per input
    text and samples along axis 1 (the code concatenates on ``axis=1`` and
    takes ``len(chunk)`` as the number of texts).  Small chunks are buffered
    until enough samples are available, then the audio of the sentence that
    is currently being "spoken" is emitted in bounded-size pieces.
    """

    def __init__(self, base_block_size=8000):
        """Store the per-voiced-text buffering threshold.

        Args:
            base_block_size: Minimum number of array elements, per text that
                currently carries audio, that must be buffered before a block
                is released by :meth:`generate`.
        """
        self.base_block_size = base_block_size

    @staticmethod
    def _update_stream(history_stream_wav, new_stream_wav, thre):
        """Merge a new chunk into the pending buffer.

        Args:
            history_stream_wav: Previously buffered block, or ``None``.
            new_stream_wav: Newly received 2-D chunk.
            thre: Minimum total element count (rows * columns) required to
                release the block.

        Returns:
            ``(result_stream, is_keep_next)`` where ``is_keep_next`` is true
            when the merged block is still smaller than ``thre`` and should
            keep being buffered.
        """
        if history_stream_wav is None:
            result_stream = new_stream_wav
        else:
            result_stream = np.concatenate([history_stream_wav, new_stream_wav], axis=1)
        is_keep_next = result_stream.shape[0] * result_stream.shape[1] < thre
        # Sampled debug output to monitor small-block merging; only emitted
        # when an existing buffer was extended (kept from the original code).
        if history_stream_wav is not None and random.random() > 0.1:
            print(
                "update_stream",
                is_keep_next,
                [i.shape if i is not None else None for i in result_stream],
            )
        return result_stream, is_keep_next

    @staticmethod
    def _accum(accum_wavs, stream_wav):
        """Append an already-released block to the per-text history.

        Returns ``stream_wav`` itself when nothing was accumulated yet,
        otherwise the concatenation along axis 1.
        """
        if accum_wavs is None:
            return stream_wav
        return np.concatenate([accum_wavs, stream_wav], axis=1)

    @staticmethod
    def batch_stream_formatted(stream_wav, output_format="PCM16_byte"):
        """Convert a whole block to the sample type required by ``output_format``.

        For ``"PCM16_byte"`` and ``"PCM16"`` the block is passed through
        ``float_to_int16`` (imported from ``tools.audio``; presumably converts
        float samples to int16 -- confirm against that helper).  Any other
        format returns the block unchanged.
        """
        if output_format in ("PCM16_byte", "PCM16"):
            return float_to_int16(stream_wav)
        return stream_wav

    @staticmethod
    def formatted(data, output_format="PCM16_byte"):
        """Serialise one piece of audio for the consumer.

        Returns little-endian 16-bit bytes for ``"PCM16_byte"``; otherwise
        ``data`` is returned unchanged.
        """
        if output_format == "PCM16_byte":
            return data.astype("<i2").tobytes()
        return data

    @staticmethod
    def checkvoice(data):
        """Return ``True`` if ``data`` contains any non-silent sample.

        A piece counts as silent when its peak absolute value is below 1e-6.
        An empty piece is treated as silent instead of raising.
        """
        samples = np.asarray(data)
        if samples.size == 0:
            return False
        return not np.abs(samples).max() < 1e-6

    @staticmethod
    def _subgen(data, thre=12000):
        """Yield consecutive slices of ``data`` of at most ``thre`` items."""
        for start_idx in range(0, data.shape[0], thre):
            yield data[start_idx : start_idx + thre]

    @staticmethod
    def _emit(wav, output_format):
        """Yield the non-silent pieces of one sentence's audio, formatted."""
        for sub_wav in ChatStreamer._subgen(wav):
            if ChatStreamer.checkvoice(sub_wav):
                yield ChatStreamer.formatted(sub_wav, output_format)

    def generate(self, streamchat, output_format=None):
        """Stream audio pieces sentence by sentence.

        Args:
            streamchat: Iterable of 2-D chunks (one row per text).
            output_format: ``"PCM16_byte"`` (yield ``bytes``), ``"PCM16"`` or
                ``None`` (yield array slices).

        Yields:
            Non-silent audio pieces of at most 12000 samples each, first for
            the current sentence, then for the following sentences once the
            current one has no more audio.

        Raises:
            AssertionError: If ``output_format`` is not one of the supported
                values (raised when iteration starts).
        """
        assert output_format in ("PCM16_byte", "PCM16", None)
        curr_sentence_index = 0
        # Initialised up front so an empty or fully silent stream ends cleanly
        # instead of hitting unbound names after the loop.
        n_texts = 0
        history_stream_wav = None
        article_streamwavs = None
        for stream_wav in streamchat:
            peaks = np.abs(stream_wav).max(axis=1)
            print(peaks)
            n_texts = len(stream_wav)
            n_valid_texts = (peaks > 1e-6).sum()
            if n_valid_texts == 0:
                # Entirely silent chunk: nothing to buffer or emit.
                continue
            block_thre = n_valid_texts * self.base_block_size
            stream_wav, is_keep_next = ChatStreamer._update_stream(
                history_stream_wav, stream_wav, block_thre
            )
            if is_keep_next:
                # Not enough data yet: keep buffering.
                history_stream_wav = stream_wav
                continue
            # Enough data: release the block.
            history_stream_wav = None
            stream_wav = ChatStreamer.batch_stream_formatted(stream_wav, output_format)
            article_streamwavs = ChatStreamer._accum(article_streamwavs, stream_wav)
            if ChatStreamer.checkvoice(stream_wav[curr_sentence_index]):
                # The current sentence still has audio in this block.
                yield from ChatStreamer._emit(
                    stream_wav[curr_sentence_index], output_format
                )
            elif curr_sentence_index < n_texts - 1:
                # Current sentence is finished: switch to the next one and
                # emit everything already inferred for it.
                curr_sentence_index += 1
                print("add next sentence")
                yield from ChatStreamer._emit(
                    article_streamwavs[curr_sentence_index], output_format
                )
            else:
                # Last sentence is finished; the remainder is handled below.
                break
        # Flush whatever is still buffered.  The buffer itself is used (not
        # the last loop variable) so that a trailing silent chunk cannot hide
        # pending audio, and it is always accumulated so later sentences keep
        # their share of it even if the current sentence is silent here.
        if history_stream_wav is not None:
            leftover = ChatStreamer.batch_stream_formatted(
                history_stream_wav, output_format
            )
            article_streamwavs = ChatStreamer._accum(article_streamwavs, leftover)
            if ChatStreamer.checkvoice(leftover[curr_sentence_index]):
                yield from ChatStreamer._emit(
                    leftover[curr_sentence_index], output_format
                )
        # Emit the already-inferred audio of all remaining sentences.
        if article_streamwavs is not None:
            for i_text in range(curr_sentence_index + 1, n_texts):
                yield from ChatStreamer._emit(article_streamwavs[i_text], output_format)

    def play(self, streamchat, wait=5):
        """Play the stream on the default output device via PyAudio.

        Args:
            streamchat: Iterable of 2-D chunks, forwarded to :meth:`generate`.
            wait: Seconds of audio to buffer before playback starts, which
                helps when generation is slower than real time.
        """
        import pyaudio  # please install it manually

        # Audio stream parameters.
        FORMAT = pyaudio.paInt16  # 16-bit depth
        CHANNELS = 1  # mono
        RATE = 24000  # sample rate
        SAMPLE_WIDTH = 2  # bytes per 16-bit sample, matches the "<i2" encoding

        p = pyaudio.PyAudio()
        try:
            print(p.get_device_count())
            # Open the output stream (speaker).
            stream_out = p.open(
                format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                output=True,
            )
            try:
                # The buffer length is measured in bytes, so the threshold is
                # converted from samples to bytes; otherwise only half of
                # `wait` seconds would be buffered.
                first_prefill_size = wait * RATE * CHANNELS * SAMPLE_WIDTH
                prefill = bytearray()
                meet = False
                for pcm in self.generate(streamchat, output_format="PCM16_byte"):
                    if meet:
                        stream_out.write(pcm)
                        continue
                    prefill.extend(pcm)
                    if len(prefill) > first_prefill_size:
                        meet = True
                        stream_out.write(bytes(prefill))
                # Stream ended before the prefill threshold was reached.
                if not meet and prefill:
                    stream_out.write(bytes(prefill))
                stream_out.stop_stream()
            finally:
                stream_out.close()
        finally:
            # Release PortAudio resources even if playback failed.
            p.terminate()
if __name__ == "__main__":
    import ChatTTS

    # Load ChatTTS.
    chat = ChatTTS.Chat()
    chat.load(compile=False)

    # Sample a random speaker and set the decoding parameters.
    speaker = chat.sample_random_speaker()
    infer_params = ChatTTS.Chat.InferCodeParams(
        spk_emb=speaker,  # add sampled speaker
        temperature=0.3,  # using custom temperature
        top_P=0.7,  # top P decode
        top_K=20,  # top K decode
    )

    demo_texts = [
        "总结一下,AI Agent是大模型功能的扩展,让AI更接近于通用人工智能,也就是我们常说的AGI。",
        "你太聪明啦。",
        "举个例子,大模型可能可以写代码,但它不能独立完成一个完整的软件开发项目。这时候,AI Agent就根据大模型的智能,结合记忆和规划,一步步实现从需求分析到产品上线。",
    ]

    # Obtain the ChatTTS streaming-inference generator.
    streamchat = chat.infer(
        demo_texts,
        skip_refine_text=True,
        stream=True,
        params_infer_code=infer_params,
    )

    # Buffer some audio first and start playing once enough is stored;
    # suited to users on slower CPUs where generation lags behind playback.
    streamer = ChatStreamer()
    streamer.play(streamchat, wait=5)
|