daihui.zhang committed
Commit 5518c26 · 1 Parent(s): 80839d5

filter [] words

transcribe/helpers/vadprocessor.py CHANGED
@@ -113,6 +113,7 @@ class VADIteratorOnnx:
                  sampling_rate: int = 16000,
                  min_silence_duration_ms: int = 100,
                  max_speech_duration_s: float = float('inf'),
+                 speech_pad_ms: int = 30
                  ):
         self.model = OnnxWrapper(VAD_MODEL_PATH, True)
         self.threshold = threshold
@@ -123,7 +124,7 @@ class VADIteratorOnnx:

         self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
         self.max_speech_samples = int(sampling_rate * max_speech_duration_s)
-        # self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000
+        self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000
         self.reset_states()

     def reset_states(self):
@@ -158,7 +159,8 @@ class VADIteratorOnnx:

         if (speech_prob >= self.threshold) and not self.triggered:
             self.triggered = True
-            speech_start = max(0, self.current_sample - window_size_samples)
+            # speech_start = max(0, self.current_sample - window_size_samples)
+            speech_start = max(0, self.current_sample - self.speech_pad_samples - window_size_samples)
             self.start = speech_start
             return {'start': int(speech_start) if not return_seconds else round(speech_start / self.sampling_rate, 1)}

@@ -174,7 +176,8 @@ class VADIteratorOnnx:
             if self.current_sample - self.temp_end < self.min_silence_samples:
                 return None
             else:
-                speech_end = self.temp_end - window_size_samples
+                # speech_end = self.temp_end - window_size_samples
+                speech_end = self.temp_end + self.speech_pad_samples - window_size_samples
                 self.temp_end = 0
                 self.triggered = False
                 return {'end': int(speech_end) if not return_seconds else round(speech_end / self.sampling_rate, 1)}
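The net effect of the new speech_pad_ms parameter is to widen every detected speech region by the pad on both sides, so word onsets and trailing consonants are less likely to be clipped. A minimal sketch of the boundary arithmetic, assuming the default 30 ms pad and a 512-sample window (a typical Silero VAD window at 16 kHz; both sample positions below are illustrative):

```python
# Sketch of the padded boundary arithmetic; window_size_samples = 512 is an
# assumption (typical Silero VAD window at 16 kHz), not taken from the diff.
sampling_rate = 16000
speech_pad_ms = 30
window_size_samples = 512

speech_pad_samples = sampling_rate * speech_pad_ms / 1000  # 480.0 samples

# Speech onset detected at the 2 s mark: the start moves 30 ms earlier.
current_sample = 32000
speech_start = max(0, current_sample - speech_pad_samples - window_size_samples)
print(speech_start / sampling_rate)  # 1.938 (1.968 without the pad)

# Silence confirmed at the 4 s mark: the end moves 30 ms later.
temp_end = 64000
speech_end = temp_end + speech_pad_samples - window_size_samples
print(speech_end / sampling_rate)  # 3.998 (3.968 without the pad)
```

Note that speech_pad_samples stays a float, mirroring min_silence_samples; the boundaries are only cast with int() at the point of return.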
transcribe/helpers/whisper.py CHANGED
@@ -52,7 +52,7 @@ class WhisperCPP:
             initial_prompt=prompt,
             language=language,
             # token_timestamps=True,
-            # split_on_word=True,
+            split_on_word=True,
             # max_len=max_len
         )
         return output
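Enabling split_on_word asks whisper.cpp to break segments on word boundaries instead of mid-token. In upstream whisper.cpp the flag is normally paired with max_len (still commented out here), so it is worth verifying that the binding in use honours it on its own. The motivation appears to be the bracket filter added to transcribe/utils.py below: with word-sized segments, non-speech annotations such as [MUSIC] or [BLANK_AUDIO] (the exact strings vary by model) arrive as standalone tokens, or short runs of tokens, that can be dropped without clipping adjacent speech.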
transcribe/utils.py CHANGED
@@ -7,6 +7,51 @@ from scipy.io.wavfile import write
 import config
 import csv
 import av
+import re
+
+# Compile the regex patterns once at module level so they are reused across calls
+p_pattern = re.compile(r"(\s*\[.*?\])")
+p_start_pattern = re.compile(r"(\s*\[.*)")
+p_end_pattern = re.compile(r"(\s*.*\])")
+
+
+def filter_words(res_word):
+    """
+    Filter out words that form or fall inside bracketed annotations.
+
+    Args:
+        res_word: Iterable of word objects with a 'text' attribute
+
+    Returns:
+        List of word objects with bracketed annotations removed
+    """
+    asr_results = []
+    skip_word = False
+
+    for word in res_word:
+        # Drop words that are a complete bracketed annotation, e.g. "[MUSIC]"
+        if p_pattern.match(word.text):
+            continue
+
+        # An opening bracket without its closing half starts a skip section
+        if p_start_pattern.match(word.text):
+            skip_word = True
+            continue
+
+        # A closing bracket ends the current skip section
+        if p_end_pattern.match(word.text) and skip_word:
+            skip_word = False
+            continue
+
+        # Drop everything between the opening and closing brackets
+        if skip_word:
+            continue
+
+        # The word passed all filters; keep it
+        asr_results.append(word)
+
+    return asr_results
+
 def log_block(key: str, value, unit=''):
     if config.DEBUG:
         return
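A quick sanity check of the skip-section state machine, using a hypothetical Word stand-in for the whisper.cpp word objects (only a .text attribute is required):

```python
# Hypothetical demo of filter_words; Word stands in for the real word objects.
from collections import namedtuple
from transcribe.utils import filter_words

Word = namedtuple("Word", "text")

words = [
    Word(" [MUSIC]"),  # complete annotation: matched by p_pattern, dropped
    Word(" hello"),
    Word(" [BLANK"),   # opening bracket: p_start_pattern sets skip_word
    Word("_AUDIO"),    # inside the annotation: dropped while skip_word is set
    Word("]"),         # closing bracket: p_end_pattern clears skip_word
    Word(" world"),
]

print([w.text for w in filter_words(words)])  # [' hello', ' world']
```

Because the closing-bracket check is guarded by skip_word, a stray "]" outside an annotation is kept rather than swallowed.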
transcribe/whisper_llm_serve.py CHANGED
@@ -11,7 +11,7 @@ import config
 import collections
 from api_model import TransResult, Message, DebugResult

-from .utils import log_block, save_to_wave, TestDataWriter
+from .utils import log_block, save_to_wave, TestDataWriter, filter_words
 from .translatepipes import TranslatePipes
 from .strategy import (
     TranscriptStabilityAnalyzer, TranscriptToken)
@@ -191,6 +191,7 @@ class WhisperTranscriptionService:
         meta_item = self._transcribe_audio(audio_buffer)
         segments = meta_item.segments
         logger.debug(f"Segments: {segments}")
+        segments = filter_words(segments)
         if len(segments):
             seg_text = self.text_separator.join(seg.text for seg in segments)
             if self._temp_string:
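Placing filter_words before the len(segments) check means a buffer containing only bracketed annotations now behaves like silence: the filtered list comes back empty, no seg_text is assembled, and nothing reaches the downstream pipeline. Note the call operates on segments rather than word objects; filter_words only needs a .text attribute, which segments provide, but the per-token filtering presumably relies on split_on_word (enabled in whisper.py above) yielding word-sized segments.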