ssolito committed on
Commit
40501c8
·
verified ·
1 Parent(s): 5669eef

Update whisper.py

Browse files
Files changed (1) hide show
  1. whisper.py +10 -10
whisper.py CHANGED
@@ -204,8 +204,8 @@ def processing_vad_threshold(audio, output_vad, threshold, max_duration, concate
204
  def format_audio(audio_path):
205
  input_audio, sample_rate = torchaudio.load(audio_path)
206
 
207
- if input_audio.shape[0] == 2: #stereo2mono
208
- input_audio = torch.mean(input_audio, dim=0, keepdim=True)
209
 
210
  resampler = torchaudio.transforms.Resample(sample_rate, 16000)
211
  input_audio = resampler(input_audio)
@@ -220,12 +220,12 @@ def transcribe_pipeline(audio, task):
220
  def generate(audio_path, use_v5):
221
  audio = AudioSegment.from_wav(audio_path)
222
 
223
- temp_mono_path = None
224
- if audio.channels != 1: #stereo2mono
225
- audio = audio.set_channels(1)
226
- temp_mono_path = "temp_mono.wav"
227
- audio.export(temp_mono_path, format="wav")
228
- audio_path = temp_mono_path
229
 
230
  output_vad = pipeline_vad(audio_path)
231
  concatenated_segment = AudioSegment.empty()
@@ -239,7 +239,7 @@ def generate(audio_path, use_v5):
239
 
240
  clean_output = post_process_transcription(output)
241
 
242
- if temp_mono_path and os.path.exists(temp_mono_path):
243
- os.remove(temp_mono_path)
244
 
245
  return clean_output
 
204
  def format_audio(audio_path):
205
  input_audio, sample_rate = torchaudio.load(audio_path)
206
 
207
+ #if input_audio.shape[0] == 2: #stereo2mono
208
+ # input_audio = torch.mean(input_audio, dim=0, keepdim=True)
209
 
210
  resampler = torchaudio.transforms.Resample(sample_rate, 16000)
211
  input_audio = resampler(input_audio)
 
220
  def generate(audio_path, use_v5):
221
  audio = AudioSegment.from_wav(audio_path)
222
 
223
+ #temp_mono_path = None
224
+ #if audio.channels != 1: #stereo2mono
225
+ # audio = audio.set_channels(1)
226
+ # temp_mono_path = "temp_mono.wav"
227
+ # audio.export(temp_mono_path, format="wav")
228
+ # audio_path = temp_mono_path
229
 
230
  output_vad = pipeline_vad(audio_path)
231
  concatenated_segment = AudioSegment.empty()
 
239
 
240
  clean_output = post_process_transcription(output)
241
 
242
+ #if temp_mono_path and os.path.exists(temp_mono_path):
243
+ # os.remove(temp_mono_path)
244
 
245
  return clean_output