daihui.zhang committed
Commit f13dceb · 1 Parent(s): 418e265

add translate from 7b model when partial==False
transcribe/translatepipes.py CHANGED
@@ -1,4 +1,4 @@
-from transcribe.pipelines import WhisperPipe, MetaItem, WhisperChinese, Translate7BPipe, FunASRPipe, VadPipe
+from transcribe.pipelines import WhisperPipe, MetaItem, WhisperChinese, Translate7BPipe, FunASRPipe, VadPipe, TranslatePipe
 
 
 class TranslatePipes:
@@ -11,7 +11,7 @@ class TranslatePipes:
         self._funasr_pipe = self._launch_process(FunASRPipe())
 
         # LLM translation
-        # self._translate_pipe = self._launch_process(TranslatePipe())
+        self._translate_pipe = self._launch_process(TranslatePipe())
         self._translate_7b_pipe = self._launch_process(Translate7BPipe())
         # vad
         self._vad_pipe = self._launch_process(VadPipe())
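
For context, a minimal usage sketch of the class after this change. translate_large() and the .translate_content field are visible in transcribe/whisper_llm_serve.py below; translate() as the counterpart for the re-enabled lightweight pipe is an assumption, not confirmed by this diff:

    # Hypothetical sketch: both translators are now launched, so a caller
    # can choose per request. translate() is an assumed method name;
    # translate_large() presumably reaches the large-model path, per the
    # "Translation large model" log lines in whisper_llm_serve.py.
    pipes = TranslatePipes()
    fast = pipes.translate("你好", "zh", "en")        # assumed: lightweight TranslatePipe
    slow = pipes.translate_large("你好", "zh", "en")  # as called by the service
    print(slow.translate_content)                     # result field used by the service
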
transcribe/whisper_llm_serve.py CHANGED
@@ -68,7 +68,7 @@ class WhisperTranscriptionService:
         self._frame_processing_thread_stop = threading.Event()
 
         self.translate_thread = _start_thread(self._transcription_processing_loop)
-        self.frame_processing_thread = _start_thread(self._frame_processing_loop)
+        self.frame_processing_thread = _start_thread(self._read_frame_processing_loop)
         self.row_number = 0
         # for test
         self._transcribe_time_cost = 0.
@@ -98,7 +98,7 @@ class WhisperTranscriptionService:
         return speech_audio, speech_status
 
 
-    def _frame_processing_loop(self) -> None:
+    def _read_frame_processing_loop(self) -> None:
         """Fetch audio frames from the queue and merge them into the buffer."""
         while not self._frame_processing_thread_stop.is_set():
             try:
@@ -109,17 +109,21 @@
                     continue
 
                 with self.lock:
+                    self.frames_np = np.append(self.frames_np, frame_np)
+
+                    # Record the start-of-audio timestamp, used to track elapsed time toward the minimum sentence-break duration
                     if speech_status == "START" and self.frames_np_start_timestamp is None:
                         self.frames_np_start_timestamp = time.time()
-                    # Append the audio to the audio buffer
-                    self.frames_np = np.append(self.frames_np, frame_np)
+
+                    # Maximum buffer duration; force a sentence break once it is exceeded
                     if len(self.frames_np) >= self.sample_rate * config.MAX_SPEECH_DURATION_S:
                         audio_array=self.frames_np.copy()
                         self.full_segments_queue.appendleft(audio_array)  # consolidate audio chunks once the three-second minimum duration is met
                         self.frames_np_start_timestamp = time.time()
                         self.frames_np = np.array([], dtype=np.float32)
-
-                    elif speech_status == "END" and len(self.frames_np) > 0 and self.frames_np_start_timestamp:
+
+                    # On the end-of-speech signal, consolidate the current buffer
+                    elif speech_status == "END" and len(self.frames_np) > 0:
                         time_diff = time.time() - self.frames_np_start_timestamp
                         if time_diff >= config.FRAME_SCOPE_TIME_THRESHOLD:
                             audio_array=self.frames_np.copy()
@@ -127,7 +131,7 @@ class WhisperTranscriptionService:
                             self.frames_np_start_timestamp = None
                             self.frames_np = np.array([], dtype=np.float32)
                         else:
-                            logger.debug(f"🥳 Time gap since the previous sentence: {time_diff:.2f}s; keep growing the buffer")
+                            logger.debug(f"🥳 Time gap since the previous sentence: {time_diff:.2f}s; keep it in the buffer")
 
             except queue.Empty:
                 pass
@@ -139,7 +143,7 @@ class WhisperTranscriptionService:
         while not self._translate_thread_stop.is_set():
 
             if len(self.frames_np) ==0:
-                time.sleep(0.01)
+                time.sleep(0.1)
                 continue
 
             with self.lock:
@@ -163,21 +167,24 @@ class WhisperTranscriptionService:
 
             if len(segments):
                 seg_text = self.text_separator.join(seg.text for seg in segments)
+                # Full line
+                if not partial:
+                    translated_content = self._translate_text(seg_text)
+                    self.row_number += 1
+                    frame_epoch = 1
+                else:
+                    translated_content = self._translate_text_large(seg_text)
+                    frame_epoch += 1
+
                 result = TransResult(
                     seg_id=self.row_number,
                     context=seg_text,
                     from_=self.source_language,
                     to=self.target_language,
-                    tran_content=self._translate_text_large(seg_text),
+                    tran_content=translated_content,
                     partial=partial
                 )
                 self._send_result_to_client(result)
-                if not partial:
-                    self.row_number += 1
-                    frame_epoch = 1
-                else:
-                    frame_epoch += 1
-
 
 
 
@@ -221,6 +228,7 @@ class WhisperTranscriptionService:
 
         result = self._translate_pipe.translate_large(text, self.source_language, self.target_language)
         translated_text = result.translate_content
+
         time_diff = (time.perf_counter() - start_time)
         log_block("Translation large model time ", f"{time_diff:.3f}", "s")
         log_block("Translation large model output", f"{translated_text}")