ssolito committed on
Commit
0053d2c
·
verified ·
1 Parent(s): f5fe5d5

Update whisper.py

Browse files
Files changed (1) hide show
  1. whisper.py +4 -16
whisper.py CHANGED
@@ -29,19 +29,8 @@ pipe = pipeline(
29
  token=os.getenv("HF_TOKEN")
30
  )
31
 
32
- def post_process_transcription(example_transcription, max_repeats=1):
33
- segments = re.findall(r'.+?[.,?]', example_transcription)
34
-
35
- seen = set()
36
- unique_segments = []
37
- for segment in segments:
38
- if segment not in seen:
39
- unique_segments.append(segment)
40
- seen.add(segment)
41
-
42
- final_string = ''.join(unique_segments)
43
-
44
- tokens = re.findall(r'\b\w+\b[.,!?]?', final_string)
45
 
46
  cleaned_tokens = []
47
  repetition_count = 0
@@ -65,6 +54,7 @@ def post_process_transcription(example_transcription, max_repeats=1):
65
 
66
  return cleaned_transcription
67
 
 
68
  def convert_forced_to_tokens(forced_decoder_ids):
69
  forced_decoder_tokens = []
70
  for i, (idx, token) in enumerate(forced_decoder_ids):
@@ -236,6 +226,4 @@ def generate(audio_path, use_v5):
236
 
237
  clean_output = post_process_transcription(output, max_repeats=1)
238
 
239
- return clean_output
240
-
241
-
 
29
  token=os.getenv("HF_TOKEN")
30
  )
31
 
32
+ def post_process_transcription(transcription, max_repeats=2):
33
+ tokens = re.findall(r'\b\w+\'?\w*\b[.,!?]?', transcription)
 
 
 
 
 
 
 
 
 
 
 
34
 
35
  cleaned_tokens = []
36
  repetition_count = 0
 
54
 
55
  return cleaned_transcription
56
 
57
+
58
  def convert_forced_to_tokens(forced_decoder_ids):
59
  forced_decoder_tokens = []
60
  for i, (idx, token) in enumerate(forced_decoder_ids):
 
226
 
227
  clean_output = post_process_transcription(output, max_repeats=1)
228
 
229
+ return clean_output