Commit 930f36f · martin committed · 1 parent: ed26be9

fix aqta predict

Files changed: app.py (+18, -9) · stepaudio.py (+6, -2)
app.py CHANGED

@@ -34,7 +34,7 @@ class CustomAsr:
         return text
 
 
-def add_message(chatbot, history, mic, text, asr_model):
+def add_message(chatbot, history, mic, text):
     if not mic and not text:
         return chatbot, history, "Input is empty"
 
@@ -43,10 +43,7 @@ def add_message(chatbot, history, mic, text, asr_model):
         history.append({"role": "user", "content": text})
     elif mic and Path(mic).exists():
         chatbot.append({"role": "user", "content": {"path": mic}})
-
-        text = asr_model.run(mic)
-        chatbot.append({"role": "user", "content": text})
-        history.append({"role": "user", "content": text})
+        history.append({"role": "user", "content": {"type": "audio", "audio": mic}})
 
     print(f"{history=}")
     return chatbot, history, None
@@ -69,12 +66,24 @@ def save_tmp_audio(audio, sr):
     return temp_audio.name
 
 
-def predict(chatbot, history, audio_model):
+def predict(chatbot, history, audio_model, asr_model):
     """Generate a response from the model."""
     try:
+        is_input_audio = False
+        user_audio_path = None
+        # Detect whether the user input is audio or text
+        if isinstance(history[-1]["content"], dict):
+            is_input_audio = True
+            user_audio_path = history[-1]["content"]["audio"]
         text, audio, sr = audio_model(history, "闫雨婷")
         print(f"predict {text=}")
         audio_path = save_tmp_audio(audio, sr)
+        # Cache the ASR text of the user's speech to speed up the next inference
+        if is_input_audio:
+            asr_text = asr_model.run(user_audio_path)
+            chatbot.append({"role": "user", "content": asr_text})
+            history[-1]["content"] = asr_text
+            print(f"{asr_text=}")
         chatbot.append({"role": "assistant", "content": {"path": audio_path}})
         chatbot.append({"role": "assistant", "content": text})
         history.append({"role": "assistant", "content": text})
@@ -105,13 +114,13 @@ def _launch_demo(args, audio_model, asr_model):
 
     def on_submit(chatbot, history, mic, text):
         chatbot, history, error = add_message(
-            chatbot, history, mic, text, asr_model
+            chatbot, history, mic, text
         )
         if error:
             gr.Warning(error)  # Show a warning message
             return chatbot, history, None, None
         else:
-            chatbot, history = predict(chatbot, history, audio_model)
+            chatbot, history = predict(chatbot, history, audio_model, asr_model)
             return chatbot, history, None, None
 
     submit_btn.click(
@@ -133,7 +142,7 @@ def _launch_demo(args, audio_model, asr_model):
         while history and history[-1]["role"] == "assistant":
            print(f"discard {history[-1]}")
            history.pop()
-        return predict(chatbot, history, audio_model)
+        return predict(chatbot, history, audio_model, asr_model)
 
     regen_btn.click(
         regenerate,
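Taken together, the app.py changes move ASR out of add_message and into predict: a mic turn is stored in history as a typed audio dict, and after the first inference its entry is overwritten with the ASR transcript so later turns reuse cached text instead of re-transcribing. A minimal sketch of that history mutation follows; the transcript string is a stand-in for asr_model.run, which is not invoked here:

history = []

# A mic turn now enters history as a typed audio message:
history.append({"role": "user", "content": {"type": "audio", "audio": "/tmp/mic.wav"}})

# predict() detects the dict-shaped content, generates a reply from the raw
# audio, then overwrites the entry with the ASR transcript so the next
# inference sees plain text:
asr_text = "transcribed user speech"  # stand-in for asr_model.run(user_audio_path)
if isinstance(history[-1]["content"], dict):
    history[-1]["content"] = asr_text

print(history)  # [{'role': 'user', 'content': 'transcribed user speech'}]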
stepaudio.py CHANGED

@@ -42,6 +42,11 @@ class StepAudio:
         output_audio = volumn_adjust(output_audio, volumn_ratio)
         return output_text, output_audio, sr
 
+    def encode_audio(self, audio_path):
+        audio_wav, sr = load_audio(audio_path)
+        audio_tokens = self.encoder(audio_wav, sr)
+        return audio_tokens
+
     def apply_chat_template(self, messages: list):
         text_with_audio = ""
         for msg in messages:
@@ -55,8 +60,7 @@ class StepAudio:
                 if content["type"] == "text":
                     text_with_audio += f"<|BOT|>{role}\n{content['text']}<|EOT|>"
                 elif content["type"] == "audio":
-                    audio_wav, sr = load_audio(content["audio"])
-                    audio_tokens = self.encoder(audio_wav, sr)
+                    audio_tokens = self.encode_audio(content["audio"])
                     text_with_audio += f"<|BOT|>{role}\n{audio_tokens}<|EOT|>"
                 elif content is None:
                     text_with_audio += f"<|BOT|>{role}\n"
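The stepaudio.py change is a small extraction refactor: the load-then-tokenize pair inside apply_chat_template becomes a reusable encode_audio helper. Below is a minimal, self-contained sketch of the resulting structure; load_audio and the encoder are illustrative stubs, not the real audio tokenizer:

def load_audio(path):
    # Stub: the real helper returns a waveform and its sample rate.
    return "<waveform>", 16000

class MiniStepAudio:
    def __init__(self):
        # Stub: the real encoder maps a waveform to audio-token text.
        self.encoder = lambda wav, sr: "<audio_tokens>"

    def encode_audio(self, audio_path):
        # Mirrors the new helper: load the file, then tokenize the waveform.
        audio_wav, sr = load_audio(audio_path)
        return self.encoder(audio_wav, sr)

    def apply_chat_template(self, messages: list):
        text_with_audio = ""
        for msg in messages:
            role, content = msg["role"], msg["content"]
            if isinstance(content, dict) and content.get("type") == "audio":
                audio_tokens = self.encode_audio(content["audio"])
                text_with_audio += f"<|BOT|>{role}\n{audio_tokens}<|EOT|>"
            else:
                text_with_audio += f"<|BOT|>{role}\n{content}<|EOT|>"
        return text_with_audio

prompt = MiniStepAudio().apply_chat_template(
    [{"role": "user", "content": {"type": "audio", "audio": "question.wav"}}]
)
print(prompt)  # <|BOT|>user\n<audio_tokens><|EOT|>

Factoring the pair out keeps the template loop free of tokenizer plumbing and gives the class a single entry point for audio encoding.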