Spaces:
Running
on
Zero
Running
on
Zero
Update whisper.py
Browse files- whisper.py +17 -2
whisper.py
CHANGED
@@ -203,18 +203,30 @@ def processing_vad_threshold(audio, output_vad, threshold, max_duration, concate
|
|
203 |
|
204 |
def format_audio(audio_path):
    """Read an audio file and return its waveform at 16 kHz as NumPy data.

    The file is read with ``torchaudio.load``, resampled from its native
    sample rate to 16000 Hz, stripped of every size-1 dimension and then
    converted to a NumPy array.

    Args:
        audio_path: Location of the audio file handed to ``torchaudio.load``.

    Returns:
        The resampled waveform as a NumPy array.
    """
    waveform, native_rate = torchaudio.load(audio_path)
    to_16k = torchaudio.transforms.Resample(native_rate, 16000)
    return to_16k(waveform).squeeze().numpy()
|
210 |
|
|
|
211 |
def transcribe_pipeline(audio, task):
    """Run ``pipe`` on ``audio`` and return the text it produces.

    ``pipe`` and ``BATCH_SIZE`` are defined elsewhere in this file.

    Args:
        audio: Input forwarded unchanged to ``pipe``.
        task: Value passed to generation under the ``"task"`` key.

    Returns:
        The ``"text"`` entry of the pipeline result.
    """
    generation_options = {"task": task}
    result = pipe(
        audio,
        batch_size=BATCH_SIZE,
        generate_kwargs=generation_options,
        return_timestamps=True,
    )
    return result["text"]
|
214 |
|
215 |
def generate(audio_path, use_v5):
|
216 |
audio = AudioSegment.from_wav(audio_path)
|
217 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
218 |
output_vad = pipeline_vad(audio_path)
|
219 |
concatenated_segment = AudioSegment.empty()
|
220 |
max_duration = 0
|
@@ -226,5 +238,8 @@ def generate(audio_path, use_v5):
|
|
226 |
output = transcribe_pipeline(format_audio(audio_path), task)
|
227 |
|
228 |
clean_output = post_process_transcription(output)
|
229 |
-
|
|
|
|
|
|
|
230 |
return clean_output
|
|
|
203 |
|
204 |
def format_audio(audio_path):
    """Load an audio file and return it as a mono 16 kHz NumPy array.

    Args:
        audio_path: Path of the audio file, passed to ``torchaudio.load``.

    Returns:
        A 1-D NumPy array holding the waveform resampled to 16000 Hz.
    """
    input_audio, sample_rate = torchaudio.load(audio_path)

    # Dimension 0 is the channel axis (torchaudio's default layout). Average
    # every multi-channel layout, not only stereo, so inputs with more than
    # two channels also end up as a single channel.
    if input_audio.shape[0] > 1:
        input_audio = torch.mean(input_audio, dim=0, keepdim=True)

    resampler = torchaudio.transforms.Resample(sample_rate, 16000)
    input_audio = resampler(input_audio)

    # Drop only the channel axis; a bare squeeze() would also collapse a
    # one-frame clip into a 0-D array.
    return input_audio.squeeze(0).numpy()
|
214 |
|
215 |
+
|
216 |
def transcribe_pipeline(audio, task):
    """Transcribe ``audio`` with ``pipe`` and return only the text.

    ``pipe`` and ``BATCH_SIZE`` come from elsewhere in this file.

    Args:
        audio: Input handed to ``pipe`` as-is.
        task: Value placed under the ``"task"`` key of ``generate_kwargs``.

    Returns:
        The value stored under ``"text"`` in the pipeline output.
    """
    pipeline_output = pipe(
        audio,
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": task},
        return_timestamps=True,
    )
    transcript = pipeline_output["text"]
    return transcript
|
219 |
|
220 |
def generate(audio_path, use_v5):
|
221 |
audio = AudioSegment.from_wav(audio_path)
|
222 |
+
|
223 |
+
temp_mono_path = None
|
224 |
+
if audio.channels != 1: #stereo2mono
|
225 |
+
audio = audio.set_channels(1)
|
226 |
+
temp_mono_path = "temp_mono.wav"
|
227 |
+
audio.export(temp_mono_path, format="wav")
|
228 |
+
audio_path = temp_mono_path
|
229 |
+
|
230 |
output_vad = pipeline_vad(audio_path)
|
231 |
concatenated_segment = AudioSegment.empty()
|
232 |
max_duration = 0
|
|
|
238 |
output = transcribe_pipeline(format_audio(audio_path), task)
|
239 |
|
240 |
clean_output = post_process_transcription(output)
|
241 |
+
|
242 |
+
if temp_mono_path and os.path.exists(temp_mono_path):
|
243 |
+
os.remove(temp_mono_path)
|
244 |
+
|
245 |
return clean_output
|