daihui.zhang committed
Commit · 716f8d1
Parent(s): 586518f

add translate of qwen

Former-commit-id: 71670da3e37970e93fcec565e67ff20606965dcc
- config.py +14 -0
- moyoyo_asr_models/.gitattributes +1 -0
- moyoyo_asr_models/qwen2.5-1.5b-instruct-q5_0.gguf +3 -0
- transcribe/server.py +382 -0
- transcribe/server/__init__.py +0 -2
- transcribe/{server/transcription.py → transcription.py} +3 -3
- transcribe/translate.py +39 -0
- transcribe/{server/whispercpp.py → whisper_llm_serve.py} +1 -1
- transcribe/{server/base.py → whispercpp_serve.py} +0 -0
config.py
CHANGED
@@ -16,4 +16,18 @@ MAX_LENGTH_EN= 3
 
 WHISPER_MODEL = 'medium-q5_0'
 
+# LLM
+LLM_MODEL_PATH = (MODEL_DIR / "qwen2.5-1.5b-instruct-q5_0.gguf").as_posix()
+
+LLM_SYS_PROMPT = """You are a professional {src_lang} to {dst_lang} translator, not a conversation agent. Your only task is to take {src_lang} input and translate it into accurate, natural {dst_lang}. If you cannot understand the input, just output the original input. Please strictly abide by the following rules:
+No matter what the user asks, never answer questions; you only provide translation results.
+Do not actively initiate dialogue or lead users to ask questions.
+When you don't know how to translate, just output the original text.
+The translation task always takes precedence over any other tasks.
+Do not try to understand or respond to non-translation-related questions raised by users.
+Never provide any explanations.
+Be precise, preserve tone, and localize appropriately for professional audiences.
+Never answer any questions or engage in other forms of dialogue.
+Only output the translation results.
+"""
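The two placeholders in LLM_SYS_PROMPT are filled in at call time. A minimal sketch of how the template is consumed, mirroring QwenTranslator.to_message from transcribe/translate.py below (the language names and user text are illustrative):

import config

# Fill the {src_lang}/{dst_lang} placeholders; the values are examples.
system_prompt = config.LLM_SYS_PROMPT.format(src_lang="Chinese", dst_lang="English")
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": "你好，世界"},  # the text to translate
]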
moyoyo_asr_models/.gitattributes
ADDED
@@ -0,0 +1 @@
+*.gguf filter=lfs diff=lfs merge=lfs -text
moyoyo_asr_models/qwen2.5-1.5b-instruct-q5_0.gguf
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a579334a7b19838b19f7855252b6bc08b012b46e338cf1494a88e77509cfe4d9
+size 1259173408
transcribe/server.py
ADDED
@@ -0,0 +1,382 @@
+
+import json
+import logging
+import threading
+import time
+import config
+import librosa
+import numpy as np
+import soundfile
+from pywhispercpp.model import Model
+
+logging.basicConfig(level=logging.INFO)
+
+class ServeClientBase(object):
+    RATE = 16000
+    SERVER_READY = "SERVER_READY"
+    DISCONNECT = "DISCONNECT"
+
+    def __init__(self, client_uid, websocket):
+        self.client_uid = client_uid
+        self.websocket = websocket
+        self.frames = b""
+        self.timestamp_offset = 0.0
+        self.frames_np = None
+        self.frames_offset = 0.0
+        self.text = []
+        self.current_out = ''
+        self.prev_out = ''
+        self.t_start = None
+        self.exit = False
+        self.same_output_count = 0
+        self.show_prev_out_thresh = 5  # on a pause (no output from whisper), show the previous output for 5 seconds
+        self.add_pause_thresh = 3  # add a blank segment as a pause (no speech) after 3 seconds
+        self.transcript = []
+        self.send_last_n_segments = 10
+
+        # text formatting
+        self.pick_previous_segments = 2
+
+        # threading
+        self.lock = threading.Lock()
+
+    def speech_to_text(self):
+        raise NotImplementedError
+
+    def transcribe_audio(self):
+        raise NotImplementedError
+
+    def handle_transcription_output(self):
+        raise NotImplementedError
+
+    def add_frames(self, frame_np):
+        """
+        Add audio frames to the ongoing audio stream buffer.
+
+        This method is responsible for maintaining the audio stream buffer, allowing the continuous addition
+        of audio frames as they are received. It also ensures that the buffer does not exceed a specified size
+        to prevent excessive memory usage.
+
+        If the buffer exceeds a threshold (45 seconds of audio data), it discards the oldest 30 seconds
+        of audio data to keep the buffer at a reasonable size. If the buffer is empty, it is initialized with
+        the provided audio frame. The audio stream buffer is used for real-time processing of audio data for
+        transcription.
+
+        Args:
+            frame_np (numpy.ndarray): The audio frame data as a NumPy array.
+        """
+        with self.lock:
+            if self.frames_np is not None and self.frames_np.shape[0] > 45 * self.RATE:
+                self.frames_offset += 30.0
+                self.frames_np = self.frames_np[int(30 * self.RATE):]
+                # Check the timestamp offset (it should be >= self.frames_offset).
+                # If it is smaller, there has been no speech: the timestamp offset
+                # hasn't been updated and lags behind frames_offset.
+                if self.timestamp_offset < self.frames_offset:
+                    self.timestamp_offset = self.frames_offset
+            if self.frames_np is None:
+                self.frames_np = frame_np.copy()
+            else:
+                self.frames_np = np.concatenate((self.frames_np, frame_np), axis=0)
+
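A worked example of the trimming arithmetic in add_frames above, using only values from this file (RATE = 16000, the 45 s cap, the 30 s discard); the 50 s figure is illustrative:

# Suppose 50 s of audio are buffered: 50 > 45, so add_frames trims.
RATE = 16000
buffered = 50 * RATE
frames_offset = 0.0
frames_offset += 30.0             # 30 s of history dropped from the front
remaining = buffered - 30 * RATE
assert remaining == 20 * RATE     # the newest 20 s stay in frames_np
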
+    def clip_audio_if_no_valid_segment(self):
+        """
+        Update the timestamp offset based on the audio buffer status.
+        Clip the audio if the unprocessed chunk exceeds 25 seconds; this basically implies that
+        whisper has produced no valid segment for that stretch of audio.
+        """
+        with self.lock:
+            if self.frames_np[int((self.timestamp_offset - self.frames_offset) * self.RATE):].shape[0] > 25 * self.RATE:
+                duration = self.frames_np.shape[0] / self.RATE
+                self.timestamp_offset = self.frames_offset + duration - 5
+
+    def get_audio_chunk_for_processing(self):
+        """
+        Retrieves the next chunk of audio data for processing based on the current offsets.
+
+        Calculates which part of the audio data should be processed next, based on
+        the difference between the current timestamp offset and the frames offset, scaled by
+        the audio sample rate (RATE). It then returns this chunk of audio data along with its
+        duration in seconds.
+
+        Returns:
+            tuple: A tuple containing:
+                - input_bytes (np.ndarray): The next chunk of audio data to be processed.
+                - duration (float): The duration of the audio chunk in seconds.
+        """
+        with self.lock:
+            samples_take = max(0, (self.timestamp_offset - self.frames_offset) * self.RATE)
+            input_bytes = self.frames_np[int(samples_take):].copy()
+            duration = input_bytes.shape[0] / self.RATE
+        return input_bytes, duration
+
+    def prepare_segments(self, last_segment=None):
+        """
+        Prepares the segments of transcribed text to be sent to the client.
+
+        This method compiles the recent segments of transcribed text, ensuring that only the
+        specified number of the most recent segments are included. It also appends the most
+        recent segment of text if provided (which is considered incomplete because the last
+        word may be truncated in the audio chunk).
+
+        Args:
+            last_segment (str, optional): The most recent segment of transcribed text to be added
+                                          to the list of segments. Defaults to None.
+
+        Returns:
+            list: A list of transcribed text segments to be sent to the client.
+        """
+        segments = []
+        if len(self.transcript) >= self.send_last_n_segments:
+            segments = self.transcript[-self.send_last_n_segments:].copy()
+        else:
+            segments = self.transcript.copy()
+        if last_segment is not None:
+            segments = segments + [last_segment]
+        logging.info(f"{segments}")
+        return segments
+
+    def get_audio_chunk_duration(self, input_bytes):
+        """
+        Calculates the duration of the provided audio chunk.
+
+        Args:
+            input_bytes (numpy.ndarray): The audio chunk for which to calculate the duration.
+
+        Returns:
+            float: The duration of the audio chunk in seconds.
+        """
+        return input_bytes.shape[0] / self.RATE
+
+    def send_transcription_to_client(self, segments):
+        """
+        Sends the specified transcription segments to the client over the websocket connection.
+
+        This method formats the transcription segments into a JSON object and attempts to send
+        this object to the client. If an error occurs during the send operation, it logs the error.
+
+        Args:
+            segments (list): A list of transcription segments to be sent to the client.
+        """
+        try:
+            self.websocket.send(
+                json.dumps({
+                    "uid": self.client_uid,
+                    "segments": segments,
+                })
+            )
+        except Exception as e:
+            logging.error(f"[ERROR]: Sending data to client: {e}")
+
+    def disconnect(self):
+        """
+        Notify the client of disconnection and send a disconnect message.
+
+        This method sends a disconnect message to the client via the WebSocket connection to notify them
+        that the transcription service is disconnecting gracefully.
+        """
+        self.websocket.send(json.dumps({
+            "uid": self.client_uid,
+            "message": self.DISCONNECT
+        }))
+
+    def cleanup(self):
+        """
+        Perform cleanup tasks before exiting the transcription service.
+
+        This method sets the exit flag so the transcription thread stops gracefully, releasing the
+        resources associated with the transcription process.
+        """
+        logging.info("Cleaning up.")
+        self.exit = True
+
+
+class ServeClientWhisperCPP(ServeClientBase):
+    SINGLE_MODEL = None
+    SINGLE_MODEL_LOCK = threading.Lock()
+
+    def __init__(self, websocket, language=None, client_uid=None,
+                 single_model=False):
+        """
+        Initialize a ServeClient instance.
+        The Whisper model is initialized based on the client's language and device availability.
+        The transcription thread is started upon initialization. A "SERVER_READY" message is sent
+        to the client to indicate that the server is ready.
+
+        Args:
+            websocket (WebSocket): The WebSocket connection for the client.
+            language (str, optional): The language for transcription. Defaults to None.
+            client_uid (str, optional): A unique identifier for the client. Defaults to None.
+            single_model (bool, optional): Whether to share one model across all client connections
+                                           instead of instantiating a new model per client.
+                                           Defaults to False.
+        """
+        super().__init__(client_uid, websocket)
+        self.language = language
+        self.eos = False
+
+        if single_model:
+            if ServeClientWhisperCPP.SINGLE_MODEL is None:
+                self.create_model()
+                ServeClientWhisperCPP.SINGLE_MODEL = self.transcriber
+            else:
+                self.transcriber = ServeClientWhisperCPP.SINGLE_MODEL
+        else:
+            self.create_model()
+
+        # threading
+        logging.info('Create a thread to process audio.')
+        self.trans_thread = threading.Thread(target=self.speech_to_text)
+        self.trans_thread.start()
+
+        self.websocket.send(json.dumps({
+            "uid": self.client_uid,
+            "message": self.SERVER_READY,
+            "backend": "pywhispercpp"
+        }))
+
+    def create_model(self, warmup=True):
+        """
+        Instantiates a new model, sets it as the transcriber and warms it up if desired.
+        """
+        self.transcriber = Model(model=config.WHISPER_MODEL, models_dir=config.MODEL_DIR)
+        if warmup:
+            self.warmup()
+
+    def warmup(self, warmup_steps=1):
+        """
+        Warm up the whisper.cpp engine, since the first few inferences are slow.
+
+        Args:
+            warmup_steps (int): Number of steps to warm up the model for.
+        """
+        logging.info("[INFO:] Warming up whisper.cpp engine..")
+        audio, _ = soundfile.read("assets/jfk.flac")
+        for i in range(warmup_steps):
+            self.transcriber.transcribe(audio, print_progress=False)
+
+    def set_eos(self, eos):
+        """
+        Sets the End of Speech (EOS) flag.
+
+        Args:
+            eos (bool): The value to set for the EOS flag.
+        """
+        with self.lock:
+            self.eos = eos
+
+    def handle_transcription_output(self, last_segment, duration):
+        """
+        Handle the transcription output, updating the transcript and sending data to the client.
+
+        Args:
+            last_segment (str): The last segment from the whisper output, considered incomplete
+                                because the final word may be truncated.
+            duration (float): Duration of the transcribed audio chunk.
+        """
+        segments = self.prepare_segments({"text": last_segment})
+        self.send_transcription_to_client(segments)
+        if self.eos:
+            self.update_timestamp_offset(last_segment, duration)
+
+    def transcribe_audio(self, input_bytes):
+        """
+        Transcribe the audio chunk and send the results to the client.
+
+        Args:
+            input_bytes (np.array): The audio chunk to transcribe.
+        """
+        if ServeClientWhisperCPP.SINGLE_MODEL:
+            ServeClientWhisperCPP.SINGLE_MODEL_LOCK.acquire()
+        logging.info(f"[pywhispercpp:] Processing audio with duration: {input_bytes.shape[0] / self.RATE}")
+        audio = input_bytes
+        duration = librosa.get_duration(y=input_bytes, sr=self.RATE)
+
+        if self.language == "zh":
+            # "The following is a sentence in Simplified Chinese Mandarin."
+            prompt = '以下是简体中文普通话的句子。'
+        else:
+            prompt = 'The following is an English sentence.'
+
+        segments = self.transcriber.transcribe(
+            audio,
+            language=self.language,
+            initial_prompt=prompt,
+            token_timestamps=True,
+            # max_len=max_len,
+            print_progress=False
+        )
+        text = []
+        for segment in segments:
+            content = segment.text
+            text.append(content)
+        last_segment = ' '.join(text)
+
+        logging.info(f"[pywhispercpp:] Last segment: {last_segment}")
+
+        if ServeClientWhisperCPP.SINGLE_MODEL:
+            ServeClientWhisperCPP.SINGLE_MODEL_LOCK.release()
+        if last_segment:
+            self.handle_transcription_output(last_segment, duration)
+
+    def update_timestamp_offset(self, last_segment, duration):
+        """
+        Update the timestamp offset and the transcript.
+
+        Args:
+            last_segment (str): Last transcribed text from the whisper model.
+            duration (float): Duration of the last audio chunk.
+        """
+        if not len(self.transcript):
+            self.transcript.append({"text": last_segment + " "})
+        elif self.transcript[-1]["text"].strip() != last_segment:
+            self.transcript.append({"text": last_segment + " "})
+
+        logging.info(f'Transcript list context: {self.transcript}')
+
+        with self.lock:
+            self.timestamp_offset += duration
+
+    def speech_to_text(self):
+        """
+        Process an audio stream in an infinite loop, continuously transcribing the speech.
+
+        This method continuously receives audio frames, performs real-time transcription, and sends
+        transcribed segments to the client via a WebSocket connection.
+
+        If the client's language is not detected, it waits for 30 seconds of audio input to make a
+        language prediction. It utilizes the Whisper ASR model to transcribe the audio, continuously
+        processing and streaming results. Segments are sent to the client in real time, and a history
+        of segments is maintained to provide context. Pauses in speech (no output from Whisper) are
+        handled by showing the previous output for a set duration. A blank segment is added if there
+        is no speech for a specified duration to indicate a pause.
+
+        Raises:
+            Exception: If there is an issue with audio processing or WebSocket communication.
+        """
+        while True:
+            if self.exit:
+                logging.info("Exiting speech to text thread")
+                break
+
+            if self.frames_np is None:
+                time.sleep(0.02)  # wait for any audio to arrive
+                continue
+
+            self.clip_audio_if_no_valid_segment()
+
+            input_bytes, duration = self.get_audio_chunk_for_processing()
+            if duration < 1:
+                continue
+
+            try:
+                input_sample = input_bytes.copy()
+                logging.info(f"[pywhispercpp:] Processing audio with duration: {duration}")
+                self.transcribe_audio(input_sample)
+
+            except Exception as e:
+                logging.error(f"[ERROR]: {e}")
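For orientation, a minimal sketch of the receiving side. Only the payload shapes come from the code above ({"uid": ..., "segments": [...]} from send_transcription_to_client, plus the SERVER_READY and DISCONNECT messages); the URL and the loop are assumptions:

import json
from websockets.sync.client import connect  # same websockets package the server uses

with connect("ws://localhost:9090") as ws:  # host/port are illustrative
    while True:
        data = json.loads(ws.recv())
        if data.get("message") == "DISCONNECT":   # ServeClientBase.DISCONNECT
            break
        for seg in data.get("segments", []):      # built in send_transcription_to_client
            print(seg.get("text"))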
transcribe/server/__init__.py
DELETED
@@ -1,2 +0,0 @@
-
-from .transcription import TranscriptionServer
transcribe/{server/transcription.py → transcription.py}
RENAMED
@@ -7,9 +7,9 @@ import time
 from enum import Enum
 from typing import List, Optional
 import numpy as np
-from .
-from .
-from
+from .server import ServeClientBase
+from .whisper_llm_serve import PyWhiperCppServe
+from .vad import VoiceActivityDetector
 from websockets.exceptions import ConnectionClosed
 from websockets.sync.server import serve
 
Note: the three removed import lines are truncated in the rendered diff; their original targets are not recoverable from this page.
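With transcribe/server/__init__.py deleted and the modules moved up a level, imports no longer go through a transcribe.server package. A sketch of the post-rename layout, assuming the package root is importable as transcribe (module names taken from the renames in this commit):

from transcribe.transcription import TranscriptionServer   # was transcribe/server/transcription.py
from transcribe.server import ServeClientBase              # transcribe/server.py is now a module, not a package
from transcribe.whisper_llm_serve import PyWhiperCppServe  # was transcribe/server/whispercpp.py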
transcribe/translate.py
ADDED
@@ -0,0 +1,39 @@
+from logging import getLogger
+from llama_cpp import Llama
+import time
+
+logger = getLogger(__name__)
+
+class QwenTranslator:
+    def __init__(self, model_path, system_prompt="") -> None:
+        self.llm = Llama(
+            model_path=model_path,
+            # n_gpu_layers=-1,  # Uncomment to use GPU acceleration
+            # seed=1337,        # Uncomment to set a specific seed
+            # n_ctx=2048,       # Uncomment to increase the context window
+            chat_format="chatml"
+        )
+        self.sys_prompt = system_prompt
+
+    def to_message(self, prompt, src_lang, dst_lang):
+        """Build the chat messages for a translation request."""
+        return [
+            {"role": "system", "content": self.sys_prompt.format(src_lang=src_lang, dst_lang=dst_lang)},
+            {"role": "user", "content": prompt},
+        ]
+
+    def translate(self, prompt, src_lang, dst_lang) -> str:
+        message = self.to_message(prompt, src_lang, dst_lang)
+        start_time = time.monotonic()
+        output = self.llm.create_chat_completion(messages=message, temperature=0.9)
+        logger.info(f"LLM translate cost: {time.monotonic() - start_time}s.")
+        return output['choices'][0]['message']['content']
+
+    def __call__(self, prompt, max_tokens=256, *args, **kwargs):
+        return self.llm(
+            prompt,
+            *args,
+            max_tokens=max_tokens,
+            **kwargs
+        )
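A minimal usage sketch for the new translator, wiring in the values added to config.py in this commit (the sample sentence is illustrative):

import config
from transcribe.translate import QwenTranslator

translator = QwenTranslator(config.LLM_MODEL_PATH, system_prompt=config.LLM_SYS_PROMPT)
print(translator.translate("你好，世界", src_lang="Chinese", dst_lang="English"))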
transcribe/{server/whispercpp.py → whisper_llm_serve.py}
RENAMED
@@ -1,5 +1,5 @@
 
-from .
+from .server import ServeClientBase
 from pywhispercpp.model import Model
 import soundfile
 from concurrent.futures import ProcessPoolExecutor as Pool
transcribe/{server/base.py → whispercpp_serve.py}
RENAMED
File without changes