Spaces:
Sleeping
Sleeping
Helw150
committed on
Commit
·
3268a02
1
Parent(s):
5ded772
Multi turn
Browse files
app.py
CHANGED
@@ -26,7 +26,7 @@ resampler = Audio(sampling_rate=16_000)
|
|
26 |
|
27 |
@spaces.GPU
|
28 |
@torch.no_grad
|
29 |
-
def diva_audio(audio_input, do_sample=False, temperature=0.001):
|
30 |
sr, y = audio_input
|
31 |
x = xxhash.xxh32(bytes(y)).hexdigest()
|
32 |
y = y.astype(np.float32)
|
@@ -35,7 +35,12 @@ def diva_audio(audio_input, do_sample=False, temperature=0.001):
|
|
35 |
resampler.encode_example({"array": y, "sampling_rate": sr})
|
36 |
)
|
37 |
yield from diva_model.generate_stream(
|
38 |
-
a["array"],
|
|
|
|
|
|
|
|
|
|
|
39 |
)
|
40 |
|
41 |
|
@@ -70,7 +75,7 @@ def run_vad(ori_audio, sr):
|
|
70 |
|
71 |
|
72 |
def warm_up():
|
73 |
-
frames =
|
74 |
dur, frames, tcost = run_vad(frames, 16000)
|
75 |
print(f"warm up done, time_cost: {tcost:.3f} s")
|
76 |
|
@@ -86,6 +91,7 @@ class AppState:
|
|
86 |
started_talking: bool = False
|
87 |
stopped: bool = False
|
88 |
conversation: list = field(default_factory=list)
|
|
|
89 |
|
90 |
|
91 |
def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> bool:
|
@@ -134,7 +140,9 @@ def response(state: AppState):
|
|
134 |
)
|
135 |
|
136 |
start = False
|
137 |
-
for resp in diva_audio(
|
|
|
|
|
138 |
if not start:
|
139 |
state.conversation.append({"role": "assistant", "content": resp})
|
140 |
start = True
|
@@ -142,7 +150,7 @@ def response(state: AppState):
|
|
142 |
state.conversation[-1]["content"] = resp
|
143 |
yield state, state.conversation
|
144 |
|
145 |
-
yield AppState(conversation=state.conversation), state.conversation
|
146 |
|
147 |
|
148 |
def start_recording_user(state: AppState):
|
|
|
26 |
|
27 |
@spaces.GPU
|
28 |
@torch.no_grad
|
29 |
+
def diva_audio(audio_input, do_sample=False, temperature=0.001, prev_outs=None):
|
30 |
sr, y = audio_input
|
31 |
x = xxhash.xxh32(bytes(y)).hexdigest()
|
32 |
y = y.astype(np.float32)
|
|
|
35 |
resampler.encode_example({"array": y, "sampling_rate": sr})
|
36 |
)
|
37 |
yield from diva_model.generate_stream(
|
38 |
+
a["array"],
|
39 |
+
None,
|
40 |
+
do_sample=do_sample,
|
41 |
+
max_new_tokens=256,
|
42 |
+
init_outputs=prev_outs,
|
43 |
+
return_outputs=True,
|
44 |
)
|
45 |
|
46 |
|
|
|
75 |
|
76 |
|
77 |
def warm_up():
|
78 |
+
frames = np.ones(2048) # 1024 frames of 2 bytes each
|
79 |
dur, frames, tcost = run_vad(frames, 16000)
|
80 |
print(f"warm up done, time_cost: {tcost:.3f} s")
|
81 |
|
|
|
91 |
started_talking: bool = False
|
92 |
stopped: bool = False
|
93 |
conversation: list = field(default_factory=list)
|
94 |
+
model_outs: any = None
|
95 |
|
96 |
|
97 |
def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> bool:
|
|
|
140 |
)
|
141 |
|
142 |
start = False
|
143 |
+
for resp, outs in diva_audio(
|
144 |
+
(state.sampling_rate, state.stream), prev_outs=state.model_outs
|
145 |
+
):
|
146 |
if not start:
|
147 |
state.conversation.append({"role": "assistant", "content": resp})
|
148 |
start = True
|
|
|
150 |
state.conversation[-1]["content"] = resp
|
151 |
yield state, state.conversation
|
152 |
|
153 |
+
yield AppState(conversation=state.conversation, model_outs=outs), state.conversation
|
154 |
|
155 |
|
156 |
def start_recording_user(state: AppState):
|