Merge branch 'vad'
* vad:
  add words to ignore
  update vad min_silence_duration_ms to 100
  update prompt for en
  update prompt for en
  fix queue lock bug
- config/keywords.txt +1 -5
- config/prompt.py +9 -15
- transcribe/pipelines/pipe_vad.py +1 -1
- transcribe/serve.py +54 -44
config/keywords.txt
CHANGED
@@ -1,8 +1,4 @@
 OpenAGI
-GOSIM
-Rust
 LLaMA Factory
 OPENGL
-
-Web3
-DeepSeek
+Web3
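Each keyword that survives this trim becomes one identity entry in the glossary injected into the translation system prompt, so dropping GOSIM, Rust and DeepSeek here removes them from the term table the model sees. A small sketch of that flow — CONFIG_DIR is an assumption about the repo layout; the two parsing lines mirror config/prompt.py in the next diff:

# Sketch: how the trimmed keyword list feeds the translation prompt.
from pathlib import Path

CONFIG_DIR = Path('config')  # assumption: keywords.txt lives under config/

# One keyword per line, blank lines dropped (same parsing as config/prompt.py).
keywords_list = [i.strip()
                 for i in (CONFIG_DIR / 'keywords.txt').read_text().split('\n')
                 if i.strip()]

# Each keyword maps to itself, i.e. "keep this term as-is when translating".
keywords_mapping_string = '\n'.join([f' * {value}: {value}' for value in keywords_list])

print(keywords_mapping_string)
# After this commit the output is:
#  * OpenAGI: OpenAGI
#  * LLaMA Factory: LLaMA Factory
#  * OPENGL: OPENGL
#  * Web3: Web3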
config/prompt.py
CHANGED
@@ -18,33 +18,27 @@ hotwords_json = json.loads((CONFIG_DIR / 'hotwords.json').read_text())
 keywords_list = [i.strip() for i in (CONFIG_DIR / 'keywords.txt').read_text().split('\n') if i.strip()]
 keywords_mapping_string = '\n'.join([f' * {value}: {value}' for value in keywords_list ])
 
-LLM_SYS_7B_PROMPT_EN = """
+LLM_SYS_7B_PROMPT_EN= """
 你是一名专业的同声传译员,正在为 GOSIM 会议提供中英/英中翻译服务。你的任务是准确、流畅地翻译发言内容。
 
 请遵循以下要求:
+1. 语言风格:翻译成中文时,请使用自然、流畅、符合现代汉语口语习惯的表达方式。避免生硬、逐字翻译的痕迹,要让听众容易理解。
+2. 专业术语:**请优先参考下方提供的术语对照表进行翻译。** 对于对照表中未包含的术语,如果该术语有公认的标准翻译,请使用标准翻译;如果没有或不确定,可以保留英文原文或提供最贴切的翻译。不要用通俗词汇替代专业术语。
+3. 专有名词:对于专有名词,如会议名称 "GOSIM"、人名、公司名、项目名、特定技术名称等,请保留其原始英文不做翻译。
+4. 流畅性与准确性:在追求口语化的同时,务必保证信息传达的准确性。
+5. 输出:请直接输出翻译结果,不要添加任何额外的解释或说明。
 
-
-2. **专业术语:** 必须准确保留或翻译计算机相关的专业术语和技术词汇**请优先参考下方提供的术语对照表进行翻译。** 对于对照表中未包含的术语,如果该术语有公认的标准翻译,请使用标准翻译;如果没有或不确定,可以保留英文原文或提供最贴切的翻译。不要用通俗词汇替代专业术语。
-3. **专有名词:** 对于专有名词,如会议名称 "GOSIM"、人名、公司名、项目名、特定技术名称等,请保留其原始英文不做翻译。优先保持一致性和清晰度。
-4. **流畅性与准确性:** 在追求口语化的同时,务必保证信息传达的准确性。
-5. **输出:** 请直接输出翻译结果,不要添加任何额外的解释或说明。
-
-**专业术语对照表(请优先使用此表中的翻译):**
-* Simulation: 仿真
-* Modeling: 建模
+**专业术语对照表:**
 * driver: 驱动
 * bus: 总线
 * mask: 掩码
 * preemption: 抢占
 * register: 寄存器
-*
+* Library: 库
+* biases: 偏移
 {keywords_mapping_string}
 
 现在,请将以下内容翻译成中文:
-
-
-
-
 """.format(keywords_mapping_string=keywords_mapping_string)
 
 LLM_SYS_7B_PROMPT_ZH = """
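To sanity-check the rewritten template, it can be rendered straight into a chat request. A hypothetical usage sketch — build_messages, the message shape, and the example sentence are illustrative, not part of this repo; only LLM_SYS_7B_PROMPT_EN and its module come from the diff:

# Hypothetical usage of the updated EN system prompt.
from config.prompt import LLM_SYS_7B_PROMPT_EN

def build_messages(source_text: str) -> list:
    # keywords_mapping_string was substituted at import time via str.format,
    # so the glossary is already baked into the template here.
    return [
        {"role": "system", "content": LLM_SYS_7B_PROMPT_EN},
        {"role": "user", "content": source_text},
    ]

messages = build_messages("The driver writes the mask to a register over the bus.")

Because the .format(...) call runs at module import, edits to keywords.txt only show up in the prompt after the process (or module) is reloaded.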
transcribe/pipelines/pipe_vad.py
CHANGED
@@ -31,7 +31,7 @@ class VadPipe(BasePipe):
             threshold=0.6,
             sampling_rate=cls.sample_rate,
             # speech_pad_ms=10
-            min_silence_duration_ms =
+            min_silence_duration_ms = 100,
             # speech_pad_ms = 30,
         )
         cls.vac.reset_states()
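For context, min_silence_duration_ms controls how long the voice-activity detector must observe silence before it closes a speech segment; dropping it to 100 ms makes the VAD emit END events (and therefore sentence breaks) much sooner. A minimal sketch of the same knob — this uses the pip-installed silero-vad (v5) package's VADIterator rather than the repo's own wrapper (cls.vac), and 16 kHz mono audio is an assumption matching cls.sample_rate:

# Sketch only: illustrates min_silence_duration_ms via the public silero-vad API.
import torch
from silero_vad import load_silero_vad, VADIterator

SAMPLE_RATE = 16000
CHUNK = 512  # silero expects 512-sample chunks at 16 kHz

model = load_silero_vad()
vac = VADIterator(
    model,
    threshold=0.6,                # same threshold as pipe_vad.py
    sampling_rate=SAMPLE_RATE,
    min_silence_duration_ms=100,  # the value this commit settles on
)

stream = torch.zeros(SAMPLE_RATE * 2)  # stand-in for live microphone audio
for i in range(0, len(stream) - CHUNK + 1, CHUNK):
    event = vac(stream[i:i + CHUNK])
    if event:                     # {'start': n} or {'end': n}, in samples
        print(event)              # an 'end' now fires after only 100 ms of silence
vac.reset_states()                # mirrors cls.vac.reset_states()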
transcribe/serve.py
CHANGED
@@ -37,7 +37,8 @@ class WhisperTranscriptionService:
         # 音频处理相关
         self.sample_rate = config.SAMPLE_RATE
 
-        self.lock = threading.Lock()
+        self.frame_lock = threading.Lock()
+        self.segment_lock = threading.Lock()
         # 文本分隔符,根据语言设置
         self.text_separator = get_text_separator(language)
         self.loop = asyncio.get_event_loop()
@@ -72,61 +73,70 @@ class WhisperTranscriptionService:
     def _read_frame_processing_loop(self) -> None:
         """从队列获取音频帧并合并到缓冲区"""
         while not self._stop.is_set():
-            try:
-                frame_np, speech_status = self._apply_voice_activity_detection(frame_np)
-                # logger.critical(f"frame np:{frame_np.shape}, {speech_status}")
-                with self.lock:
-                    self.frames_np = np.append(self.frames_np, frame_np)
-
-                # 音频开始时间节点 用来统计时间来 达到最小断句时间长度
-                if speech_status == "START" and self.frames_np_start_timestamp is None:
-                    self.frames_np_start_timestamp = time.time()
+            frame_np = self.frame_queue.get()
+            frame_np, speech_status = self._apply_voice_activity_detection(frame_np)
 
+            if frame_np is None:
+                continue
+
+            with self.frame_lock:
+                self.frames_np = np.append(self.frames_np, frame_np)
+
+            # 音频开始时间节点 用来统计时间来 达到最小断句时间长度
+            if speech_status == "START" and self.frames_np_start_timestamp is None:
+                self.frames_np_start_timestamp = time.time()
+
+            # 音频最长时间缓冲区限制,超过了就强制断句
+            if len(self.frames_np) >= self.sample_rate * config.MAX_SPEECH_DURATION_S:
+                audio_array=self.frames_np.copy()
+                with self.segment_lock:
+                    self.full_segments_queue.appendleft(audio_array) # 根据时间是否满足三秒长度 来整合音频块
+                self.frames_np_start_timestamp = time.time()
+
+                with self.frame_lock:
+                    self.frames_np = np.array([], dtype=np.float32)
+
+            # 音频结束信号的时候 整合当前缓冲区
+            # START -- END -- START -- END 通常
+            # START -- END -- END end块带有音频信息的通常是4096内断的一个短音
+            if speech_status == "END" and len(self.frames_np) > 0 and self.frames_np_start_timestamp:
+                time_diff = time.time() - self.frames_np_start_timestamp
+                if time_diff >= config.FRAME_SCOPE_TIME_THRESHOLD:
+                    with self.frame_lock:
                         audio_array=self.frames_np.copy()
-                        self.full_segments_queue.appendleft(audio_array) # 根据时间是否满足三秒长度 来整合音频块
-                        self.frames_np_start_timestamp = time.time()
                         self.frames_np = np.array([], dtype=np.float32)
 
-                        self.frames_np_start_timestamp = None
-                        self.frames_np = np.array([], dtype=np.float32)
-                    else:
-                        logger.debug(f"🥳 当前时间与上一句的时间差: {time_diff:.2f}s,继续保留在缓冲区")
-
-            except queue.Empty:
-                pass
+                    with self.segment_lock:
+                        self.full_segments_queue.appendleft(audio_array) # 根据时间是否满足三秒长度 来整合音频块
+                    logger.debug(f"🥳 增加整句到队列")
+                    self.frames_np_start_timestamp = None
+
+                else:
+                    logger.debug(f"🥳 当前时间与上一句的时间差: {time_diff:.2f}s,继续保留在缓冲区")
+
 
     def _transcription_processing_loop(self) -> None:
         """主转录处理循环"""
         frame_epoch = 1
 
         while not self._stop.is_set():
+            time.sleep(0.1)
+
+            with self.segment_lock:
+                segment_length = len(self.full_segments_queue)
+            if segment_length > 0:
+                audio_buffer = self.full_segments_queue.pop()
+                partial = False
+            else:
+                with self.frame_lock:
+                    if len(self.frames_np) ==0:
+                        continue
                     audio_buffer = self.frames_np[:int(frame_epoch * 1.5 * self.sample_rate)].copy()# 获取 1.5s * epoch 个音频长度
                     partial = True
 
+            logger.debug(f"full_segments_queue size: {segment_length}")
+            logger.debug(f"audio buffer size: {len(self.frames_np) / self.sample_rate:.2f}s")
+
             if len(audio_buffer) < int(self.sample_rate):
                 # Add a small buffer (e.g., 10ms worth of samples) to be safe
                 padding_samples = int(self.sample_rate * 0.01) # e.g., 160 samples for 10ms at 16kHz
@@ -137,7 +147,7 @@ class WhisperTranscriptionService:
             silence_audio[-copy_length:] = audio_buffer[-copy_length:] # Copy from the end of audio_buffer
             audio_buffer = silence_audio
 
-
+
             meta_item = self._transcribe_audio(audio_buffer)
             segments = meta_item.segments
             logger.debug(f"Segments: {segments}")
@@ -145,7 +155,7 @@ class WhisperTranscriptionService:
 
             if len(segments):
                 seg_text = self.text_separator.join(seg.text for seg in segments)
-                if
+                if seg_text.strip() in ['', '.', '-']: # 过滤空字符
                     continue
                 # 整行
                 if not partial:
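The "fix queue lock bug" commit boils down to replacing the single self.lock with two narrower locks: frame_lock guards the growing frames_np buffer, segment_lock guards full_segments_queue, and (as reconstructed above) the two are always taken one after the other rather than nested, so the reader thread can keep appending frames while the transcriber drains finished segments. A stripped-down sketch of the pattern outside the service class — the names mirror the diff; the thresholds and frame source are illustrative only:

# Sketch of the two-lock producer/consumer scheme from this diff.
import threading
import time
from collections import deque

import numpy as np

SAMPLE_RATE = 16000

frame_lock = threading.Lock()    # guards frames_np, the raw audio buffer
segment_lock = threading.Lock()  # guards full_segments_queue, finished sentences

frames_np = np.array([], dtype=np.float32)
full_segments_queue = deque()

def read_frame_loop(frame_source):
    """Producer: append frames under frame_lock; publish cut segments under
    segment_lock. The locks are never held at the same time."""
    global frames_np
    for frame in frame_source:
        with frame_lock:
            frames_np = np.append(frames_np, frame)
        if len(frames_np) >= SAMPLE_RATE * 3:  # stand-in for the END/max-duration cuts
            with frame_lock:
                audio_array = frames_np.copy()
                frames_np = np.array([], dtype=np.float32)
            with segment_lock:
                full_segments_queue.appendleft(audio_array)

def transcription_loop(transcribe):
    """Consumer: prefer a finished segment; otherwise take a partial slice."""
    while True:
        time.sleep(0.1)
        with segment_lock:  # pop under the lock; the diff reads the length here
            audio = full_segments_queue.pop() if full_segments_queue else None
        if audio is None:
            with frame_lock:
                if len(frames_np) == 0:
                    continue
                audio = frames_np[:int(1.5 * SAMPLE_RATE)].copy()
        transcribe(audio)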
|