daihui.zhang committed
Commit 716f8d1
1 Parent(s): 586518f

add translation with Qwen


Former-commit-id: 71670da3e37970e93fcec565e67ff20606965dcc

config.py CHANGED
@@ -16,4 +16,18 @@ MAX_LENGTH_EN= 3
 
  WHISPER_MODEL = 'medium-q5_0'
 
+ # LLM
+ LLM_MODEL_PATH = (MODEL_DIR / "qwen2.5-1.5b-instruct-q5_0.gguf").as_posix()
 
+ LLM_SYS_PROMPT = (
+     "You are a professional {src_lang} to {dst_lang} translator, not a conversation agent. "
+     "Your only task is to take {src_lang} input and translate it into accurate, natural {dst_lang}. "
+     "If you cannot understand the input, just output the original input. "
+     "Strictly abide by the following rules: no matter what the user asks, never answer questions; only provide translation results. "
+     "Do not actively initiate dialogue or lead users to ask questions. "
+     "When you do not know how to translate, just output the original text. "
+     "The translation task always takes precedence over any other task. "
+     "Do not try to understand or respond to non-translation questions raised by users. "
+     "Never provide any explanations. Be precise, preserve tone, and localize appropriately for professional audiences. "
+     "Never answer any questions or engage in other forms of dialogue. Only output the translation results."
+ )
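
A rough usage sketch of the two new settings: LLM_SYS_PROMPT is a template that gets filled with the source and target language before being handed to the chat model, and LLM_MODEL_PATH points at the GGUF file added below. This assumes config.py already defines MODEL_DIR as a pathlib.Path (it is referenced by LLM_MODEL_PATH above) and that config is importable from the project root:

import config

# Fill the translation prompt template for a Chinese -> English session.
system_prompt = config.LLM_SYS_PROMPT.format(src_lang="Chinese", dst_lang="English")
print(config.LLM_MODEL_PATH)   # path ending in qwen2.5-1.5b-instruct-q5_0.gguf
print(system_prompt[:80])      # "You are a professional Chinese to English translator, ..."
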
moyoyo_asr_models/.gitattributes ADDED
@@ -0,0 +1 @@
+ *.gguf filter=lfs diff=lfs merge=lfs -text
moyoyo_asr_models/qwen2.5-1.5b-instruct-q5_0.gguf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a579334a7b19838b19f7855252b6bc08b012b46e338cf1494a88e77509cfe4d9
+ size 1259173408
transcribe/server.py ADDED
@@ -0,0 +1,382 @@
+
+ import json
+ import logging
+ import threading
+ import time
+ import config
+ import librosa
+ import numpy as np
+ import soundfile
+ from pywhispercpp.model import Model
+
+ logging.basicConfig(level=logging.INFO)
+
+ class ServeClientBase(object):
+     RATE = 16000
+     SERVER_READY = "SERVER_READY"
+     DISCONNECT = "DISCONNECT"
+
+     def __init__(self, client_uid, websocket):
+         self.client_uid = client_uid
+         self.websocket = websocket
+         self.frames = b""
+         self.timestamp_offset = 0.0
+         self.frames_np = None
+         self.frames_offset = 0.0
+         self.text = []
+         self.current_out = ''
+         self.prev_out = ''
+         self.t_start = None
+         self.exit = False
+         self.same_output_count = 0
+         self.show_prev_out_thresh = 5  # if paused (no output from whisper), show the previous output for 5 seconds
+         self.add_pause_thresh = 3  # add a blank segment as a pause (no speech) after 3 seconds
+         self.transcript = []
+         self.send_last_n_segments = 10
+
+         # text formatting
+         self.pick_previous_segments = 2
+
+         # threading
+         self.lock = threading.Lock()
+
+     def speech_to_text(self):
+         raise NotImplementedError
+
+     def transcribe_audio(self):
+         raise NotImplementedError
+
+     def handle_transcription_output(self):
+         raise NotImplementedError
+
+     def add_frames(self, frame_np):
+         """
+         Add audio frames to the ongoing audio stream buffer.
+
+         This method is responsible for maintaining the audio stream buffer, allowing the continuous addition
+         of audio frames as they are received. It also ensures that the buffer does not exceed a specified size
+         to prevent excessive memory usage.
+
+         If the buffer size exceeds a threshold (45 seconds of audio data), it discards the oldest 30 seconds
+         of audio data to maintain a reasonable buffer size. If the buffer is empty, it is initialized with the
+         provided audio frame. The audio stream buffer is used for real-time processing of audio data for transcription.
+
+         Args:
+             frame_np (numpy.ndarray): The audio frame data as a NumPy array.
+
+         """
+         self.lock.acquire()
+         if self.frames_np is not None and self.frames_np.shape[0] > 45 * self.RATE:
+             self.frames_offset += 30.0
+             self.frames_np = self.frames_np[int(30 * self.RATE):]
+             # check the timestamp offset (it should be >= self.frames_offset);
+             # if it is smaller, no speech has been transcribed, because the timestamp offset
+             # has not been updated and lags behind frames_offset
+             if self.timestamp_offset < self.frames_offset:
+                 self.timestamp_offset = self.frames_offset
+         if self.frames_np is None:
+             self.frames_np = frame_np.copy()
+         else:
+             self.frames_np = np.concatenate((self.frames_np, frame_np), axis=0)
+         self.lock.release()
+
+     def clip_audio_if_no_valid_segment(self):
+         """
+         Update the timestamp offset based on audio buffer status.
+         Clip the audio if the current chunk exceeds 30 seconds; this implies that
+         whisper has produced no valid segment for the last 30 seconds.
+         """
+         with self.lock:
+             if self.frames_np[int((self.timestamp_offset - self.frames_offset) * self.RATE):].shape[0] > 25 * self.RATE:
+                 duration = self.frames_np.shape[0] / self.RATE
+                 self.timestamp_offset = self.frames_offset + duration - 5
+
+     def get_audio_chunk_for_processing(self):
+         """
+         Retrieves the next chunk of audio data for processing based on the current offsets.
+
+         Calculates which part of the audio data should be processed next, based on
+         the difference between the current timestamp offset and the frame's offset, scaled by
+         the audio sample rate (RATE). It then returns this chunk of audio data along with its
+         duration in seconds.
+
+         Returns:
+             tuple: A tuple containing:
+                 - input_bytes (np.ndarray): The next chunk of audio data to be processed.
+                 - duration (float): The duration of the audio chunk in seconds.
+         """
+         with self.lock:
+             samples_take = max(0, (self.timestamp_offset - self.frames_offset) * self.RATE)
+             input_bytes = self.frames_np[int(samples_take):].copy()
+         duration = input_bytes.shape[0] / self.RATE
+         return input_bytes, duration
+
+     def prepare_segments(self, last_segment=None):
+         """
+         Prepares the segments of transcribed text to be sent to the client.
+
+         This method compiles the recent segments of transcribed text, ensuring that only the
+         specified number of the most recent segments are included. It also appends the most
+         recent segment of text if provided (which is considered incomplete because of the possibility
+         of the last word being truncated in the audio chunk).
+
+         Args:
+             last_segment (str, optional): The most recent segment of transcribed text to be added
+                 to the list of segments. Defaults to None.
+
+         Returns:
+             list: A list of transcribed text segments to be sent to the client.
+         """
+         segments = []
+         if len(self.transcript) >= self.send_last_n_segments:
+             segments = self.transcript[-self.send_last_n_segments:].copy()
+         else:
+             segments = self.transcript.copy()
+         if last_segment is not None:
+             segments = segments + [last_segment]
+         logging.info(f"{segments}")
+         return segments
+
+     def get_audio_chunk_duration(self, input_bytes):
+         """
+         Calculates the duration of the provided audio chunk.
+
+         Args:
+             input_bytes (numpy.ndarray): The audio chunk for which to calculate the duration.
+
+         Returns:
+             float: The duration of the audio chunk in seconds.
+         """
+         return input_bytes.shape[0] / self.RATE
+
+     def send_transcription_to_client(self, segments):
+         """
+         Sends the specified transcription segments to the client over the websocket connection.
+
+         This method formats the transcription segments into a JSON object and attempts to send
+         this object to the client. If an error occurs during the send operation, it logs the error.
+
+         Args:
+             segments (list): A list of transcription segments to be sent to the client.
+         """
+         try:
+             self.websocket.send(
+                 json.dumps({
+                     "uid": self.client_uid,
+                     "segments": segments,
+                 })
+             )
+         except Exception as e:
+             logging.error(f"[ERROR]: Sending data to client: {e}")
+
+     def disconnect(self):
+         """
+         Notify the client of disconnection and send a disconnect message.
+
+         This method sends a disconnect message to the client via the WebSocket connection to notify them
+         that the transcription service is disconnecting gracefully.
+
+         """
+         self.websocket.send(json.dumps({
+             "uid": self.client_uid,
+             "message": self.DISCONNECT
+         }))
+
+     def cleanup(self):
+         """
+         Perform cleanup tasks before exiting the transcription service.
+
+         This method performs necessary cleanup tasks, including stopping the transcription thread, marking
+         the exit flag to indicate the transcription thread should exit gracefully, and destroying resources
+         associated with the transcription process.
+
+         """
+         logging.info("Cleaning up.")
+         self.exit = True
+
+
+ class ServeClientWhisperCPP(ServeClientBase):
+     SINGLE_MODEL = None
+     SINGLE_MODEL_LOCK = threading.Lock()
+
+     def __init__(self, websocket, language=None, client_uid=None,
+                  single_model=False):
+         """
+         Initialize a ServeClient instance.
+         The Whisper model is initialized based on the client's language and device availability.
+         The transcription thread is started upon initialization. A "SERVER_READY" message is sent
+         to the client to indicate that the server is ready.
+
+         Args:
+             websocket (WebSocket): The WebSocket connection for the client.
+             language (str, optional): The language for transcription. Defaults to None.
+             client_uid (str, optional): A unique identifier for the client. Defaults to None.
+             single_model (bool, optional): Whether to share a single model instance across all client connections. Defaults to False.
+
+         """
+         super().__init__(client_uid, websocket)
+         self.language = language
+         self.eos = False
+
+         if single_model:
+             if ServeClientWhisperCPP.SINGLE_MODEL is None:
+                 self.create_model()
+                 ServeClientWhisperCPP.SINGLE_MODEL = self.transcriber
+             else:
+                 self.transcriber = ServeClientWhisperCPP.SINGLE_MODEL
+         else:
+             self.create_model()
+
+         # threading
+         logging.info('Create a thread to process audio.')
+         self.trans_thread = threading.Thread(target=self.speech_to_text)
+         self.trans_thread.start()
+
+         self.websocket.send(json.dumps({
+             "uid": self.client_uid,
+             "message": self.SERVER_READY,
+             "backend": "pywhispercpp"
+         }))
+
+     def create_model(self, warmup=True):
+         """
+         Instantiates a new model, sets it as the transcriber and does warmup if desired.
+         """
+
+         self.transcriber = Model(model=config.WHISPER_MODEL, models_dir=config.MODEL_DIR)
+         if warmup:
+             self.warmup()
+
+     def warmup(self, warmup_steps=1):
+         """
+         Warm up the whisper.cpp engine, since the first few inferences are slow.
+
+         Args:
+             warmup_steps (int): Number of steps to warm up the model for.
+         """
+         logging.info("[INFO:] Warming up whisper.cpp engine..")
+         mel, _ = soundfile.read("assets/jfk.flac")
+         for i in range(warmup_steps):
+             self.transcriber.transcribe(mel, print_progress=False)
+
+     def set_eos(self, eos):
+         """
+         Sets the End of Speech (EOS) flag.
+
+         Args:
+             eos (bool): The value to set for the EOS flag.
+         """
+         self.lock.acquire()
+         self.eos = eos
+         self.lock.release()
+
+     def handle_transcription_output(self, last_segment, duration):
+         """
+         Handle the transcription output, updating the transcript and sending data to the client.
+
+         Args:
+             last_segment (str): The last segment from the whisper output, which is considered incomplete because
+                 of the possibility of the last word being truncated.
+             duration (float): Duration of the transcribed audio chunk.
+         """
+         segments = self.prepare_segments({"text": last_segment})
+         self.send_transcription_to_client(segments)
+         if self.eos:
+             self.update_timestamp_offset(last_segment, duration)
+
+     def transcribe_audio(self, input_bytes):
+         """
+         Transcribe the audio chunk and send the results to the client.
+
+         Args:
+             input_bytes (np.array): The audio chunk to transcribe.
+         """
+         if ServeClientWhisperCPP.SINGLE_MODEL:
+             ServeClientWhisperCPP.SINGLE_MODEL_LOCK.acquire()
+         logging.info(f"[pywhispercpp:] Processing audio with duration: {input_bytes.shape[0] / self.RATE}")
+         mel = input_bytes
+         duration = librosa.get_duration(y=input_bytes, sr=self.RATE)
+
+         if self.language == "zh":
+             prompt = '以下是简体中文普通话的句子。'  # "The following is a sentence in Simplified Chinese Mandarin."
+         else:
+             prompt = 'The following is an English sentence.'
+
+         segments = self.transcriber.transcribe(
+             mel,
+             language=self.language,
+             initial_prompt=prompt,
+             token_timestamps=True,
+             # max_len=max_len,
+             print_progress=False
+         )
+         text = []
+         for segment in segments:
+             content = segment.text
+             text.append(content)
+         last_segment = ' '.join(text)
+
+         logging.info(f"[pywhispercpp:] Last segment: {last_segment}")
+
+         if ServeClientWhisperCPP.SINGLE_MODEL:
+             ServeClientWhisperCPP.SINGLE_MODEL_LOCK.release()
+         if last_segment:
+             self.handle_transcription_output(last_segment, duration)
+
+     def update_timestamp_offset(self, last_segment, duration):
+         """
+         Update timestamp offset and transcript.
+
+         Args:
+             last_segment (str): Last transcribed text from the whisper model.
+             duration (float): Duration of the last audio chunk.
+         """
+         if not len(self.transcript):
+             self.transcript.append({"text": last_segment + " "})
+         elif self.transcript[-1]["text"].strip() != last_segment:
+             self.transcript.append({"text": last_segment + " "})
+
+         logging.info(f'Transcript list context: {self.transcript}')
+
+         with self.lock:
+             self.timestamp_offset += duration
+
+     def speech_to_text(self):
+         """
+         Process an audio stream in an infinite loop, continuously transcribing the speech.
+
+         This method continuously receives audio frames, performs real-time transcription, and sends
+         transcribed segments to the client via a WebSocket connection.
+
+         If the client's language is not detected, it waits for 30 seconds of audio input to make a language prediction.
+         It utilizes the Whisper ASR model to transcribe the audio, continuously processing and streaming results. Segments
+         are sent to the client in real time, and a history of segments is maintained to provide context. Pauses in speech
+         (no output from Whisper) are handled by showing the previous output for a set duration. A blank segment is added if
+         there is no speech for a specified duration to indicate a pause.
+
+         Raises:
+             Exception: If there is an issue with audio processing or WebSocket communication.
+
+         """
+         while True:
+             if self.exit:
+                 logging.info("Exiting speech to text thread")
+                 break
+
+             if self.frames_np is None:
+                 time.sleep(0.02)  # wait for any audio to arrive
+                 continue
+
+             self.clip_audio_if_no_valid_segment()
+
+             input_bytes, duration = self.get_audio_chunk_for_processing()
+             if duration < 1:
+                 continue
+
+             try:
+                 input_sample = input_bytes.copy()
+                 logging.info(f"[pywhispercpp:] Processing audio with duration: {duration}")
+                 self.transcribe_audio(input_sample)
+
+             except Exception as e:
+                 logging.error(f"[ERROR]: {e}")
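
The buffer bookkeeping in ServeClientBase (add_frames, clip_audio_if_no_valid_segment, get_audio_chunk_for_processing) can be hard to follow from the locking code alone. A minimal standalone sketch of the same arithmetic, using plain NumPy and the constants from the class:

import numpy as np

RATE = 16000
frames_np = np.zeros(50 * RATE, dtype=np.float32)  # pretend 50 s of audio has accumulated
frames_offset = 0.0                                # seconds already dropped from the buffer
timestamp_offset = 12.0                            # seconds already transcribed

# add_frames: once the buffer exceeds 45 s, drop the oldest 30 s and advance frames_offset.
if frames_np.shape[0] > 45 * RATE:
    frames_offset += 30.0
    frames_np = frames_np[int(30 * RATE):]
    if timestamp_offset < frames_offset:
        timestamp_offset = frames_offset           # nothing was transcribed in the dropped span

# get_audio_chunk_for_processing: everything after the already-transcribed prefix.
samples_take = max(0, (timestamp_offset - frames_offset) * RATE)
chunk = frames_np[int(samples_take):]
print(len(frames_np) / RATE, timestamp_offset, len(chunk) / RATE)  # 20.0 30.0 20.0
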
transcribe/server/__init__.py DELETED
@@ -1,2 +0,0 @@
-
- from .transcription import TranscriptionServer
transcribe/{server/transcription.py → transcription.py} RENAMED
@@ -7,9 +7,9 @@ import time
  from enum import Enum
  from typing import List, Optional
  import numpy as np
- from .base import ServeClientBase, ServeClientWhisperCPP
- from .whispercpp import PyWhiperCppServe
- from ..vad import VoiceActivityDetector
+ from .server import ServeClientBase
+ from .whisper_llm_serve import PyWhiperCppServe
+ from .vad import VoiceActivityDetector
  from websockets.exceptions import ConnectionClosed
  from websockets.sync.server import serve
 
transcribe/translate.py ADDED
@@ -0,0 +1,39 @@
+ from logging import getLogger
+ from llama_cpp import Llama
+ import time
+
+ logger = getLogger(__name__)
+
+ class QwenTranslator:
+     def __init__(self, model_path, system_prompt="") -> None:
+         self.llm = Llama(
+             model_path=model_path,
+             # n_gpu_layers=-1,  # Uncomment to use GPU acceleration
+             # seed=1337,  # Uncomment to set a specific seed
+             # n_ctx=2048,  # Uncomment to increase the context window
+             chat_format="chatml"
+         )
+         self.sys_prompt = system_prompt
+
+     def to_message(self, prompt, src_lang, dst_lang):
+         """Build the chat messages: the formatted system prompt plus the user input to translate."""
+         return [
+             {"role": "system", "content": self.sys_prompt.format(src_lang=src_lang, dst_lang=dst_lang)},
+             {"role": "user", "content": prompt},
+         ]
+
+
+     def translate(self, prompt, src_lang, dst_lang) -> str:
+         message = self.to_message(prompt, src_lang, dst_lang)
+         start_time = time.monotonic()
+         output = self.llm.create_chat_completion(messages=message, temperature=0.9)
+         logger.info(f"LLM translate cost: {time.monotonic() - start_time}s.")
+         return output['choices'][0]['message']['content']
+
+     def __call__(self, prompt, max_tokens=256, *args, **kwargs):
+         return self.llm(
+             prompt,
+             *args,
+             max_tokens=max_tokens,
+             **kwargs
+         )
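
A minimal sketch of how the new translator is expected to be wired to the config values added above (assuming llama-cpp-python is installed, the GGUF file has been fetched via Git LFS, and transcribe is importable as a package):

import config
from transcribe.translate import QwenTranslator

translator = QwenTranslator(config.LLM_MODEL_PATH, system_prompt=config.LLM_SYS_PROMPT)
result = translator.translate("今天天气不错。", src_lang="Chinese", dst_lang="English")
print(result)  # e.g. "The weather is nice today."
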
transcribe/{server/whispercpp.py → whisper_llm_serve.py} RENAMED
@@ -1,5 +1,5 @@
 
- from .base import ServeClientBase
+ from .server import ServeClientBase
  from pywhispercpp.model import Model
  import soundfile
  from concurrent.futures import ProcessPoolExecutor as Pool
transcribe/{server/base.py → whispercpp_serve.py} RENAMED
File without changes