Spaces:
Running
on
Zero
Running
on
Zero
Update whisper.py
Browse files- whisper.py +4 -16
whisper.py
CHANGED
@@ -29,19 +29,8 @@ pipe = pipeline(
|
|
29 |
token=os.getenv("HF_TOKEN")
|
30 |
)
|
31 |
|
32 |
-
def post_process_transcription(
|
33 |
-
|
34 |
-
|
35 |
-
seen = set()
|
36 |
-
unique_segments = []
|
37 |
-
for segment in segments:
|
38 |
-
if segment not in seen:
|
39 |
-
unique_segments.append(segment)
|
40 |
-
seen.add(segment)
|
41 |
-
|
42 |
-
final_string = ''.join(unique_segments)
|
43 |
-
|
44 |
-
tokens = re.findall(r'\b\w+\b[.,!?]?', final_string)
|
45 |
|
46 |
cleaned_tokens = []
|
47 |
repetition_count = 0
|
@@ -65,6 +54,7 @@ def post_process_transcription(example_transcription, max_repeats=1):
|
|
65 |
|
66 |
return cleaned_transcription
|
67 |
|
|
|
68 |
def convert_forced_to_tokens(forced_decoder_ids):
|
69 |
forced_decoder_tokens = []
|
70 |
for i, (idx, token) in enumerate(forced_decoder_ids):
|
@@ -236,6 +226,4 @@ def generate(audio_path, use_v5):
|
|
236 |
|
237 |
clean_output = post_process_transcription(output, max_repeats=1)
|
238 |
|
239 |
-
return clean_output
|
240 |
-
|
241 |
-
|
|
|
29 |
token=os.getenv("HF_TOKEN")
|
30 |
)
|
31 |
|
32 |
+
def post_process_transcription(transcription, max_repeats=2):
|
33 |
+
tokens = re.findall(r'\b\w+\'?\w*\b[.,!?]?', transcription)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
|
35 |
cleaned_tokens = []
|
36 |
repetition_count = 0
|
|
|
54 |
|
55 |
return cleaned_transcription
|
56 |
|
57 |
+
|
58 |
def convert_forced_to_tokens(forced_decoder_ids):
|
59 |
forced_decoder_tokens = []
|
60 |
for i, (idx, token) in enumerate(forced_decoder_ids):
|
|
|
226 |
|
227 |
clean_output = post_process_transcription(output, max_repeats=1)
|
228 |
|
229 |
+
return clean_output
|
|
|
|