megatrump committed on
Commit
af35adf
·
1 Parent(s): fcdb6f3
Files changed (1) hide show
  1. api.py +4 -10
api.py CHANGED
@@ -203,29 +203,23 @@ async def process_audio(audio_data: bytes, language: str = "auto") -> str:
203
  audio_buffer = BytesIO(audio_data)
204
  waveform, sample_rate = torchaudio.load(audio_buffer)
205
 
206
- print(waveform.shape)
207
 
208
  # Convert to mono channel
209
  if waveform.shape[0] > 1:
210
  waveform = waveform.mean(dim=0)
 
211
 
212
  # Convert to numpy array and normalize
213
  input_wav = waveform.numpy().astype(np.float32)
 
214
 
215
  # Resample to 16kHz if needed
216
  if sample_rate != 16000:
217
  resampler = torchaudio.transforms.Resample(sample_rate, 16000)
218
  input_wav = resampler(torch.from_numpy(input_wav)[None, :])[0, :].numpy()
219
 
220
-
221
- target_length = 90 * 16000
222
- current_length = input_wav.shape[1]
223
- if current_length < target_length:
224
- padding_length = target_length - current_length
225
- padding = np.zeros((1, padding_length), dtype=np.float32)
226
- print(input_wav.shape)
227
- print(padding.shape)
228
- input_wav = np.concatenate((input_wav, padding), axis=1)
229
 
230
  # Model inference
231
  text = model.generate(
 
203
  audio_buffer = BytesIO(audio_data)
204
  waveform, sample_rate = torchaudio.load(audio_buffer)
205
 
206
+ print(1, waveform.shape)
207
 
208
  # Convert to mono channel
209
  if waveform.shape[0] > 1:
210
  waveform = waveform.mean(dim=0)
211
+ print(2, waveform.shape)
212
 
213
  # Convert to numpy array and normalize
214
  input_wav = waveform.numpy().astype(np.float32)
215
+ print(3, input_wav.shape)
216
 
217
  # Resample to 16kHz if needed
218
  if sample_rate != 16000:
219
  resampler = torchaudio.transforms.Resample(sample_rate, 16000)
220
  input_wav = resampler(torch.from_numpy(input_wav)[None, :])[0, :].numpy()
221
 
222
+ print(4, input_wav.shape)
 
 
 
 
 
 
 
 
223
 
224
  # Model inference
225
  text = model.generate(