Spaces:
Sleeping
Sleeping
final version
Browse files
api.py
CHANGED
@@ -203,26 +203,20 @@ async def process_audio(audio_data: bytes, language: str = "auto") -> str:
|
|
203 |
audio_buffer = BytesIO(audio_data)
|
204 |
waveform, sample_rate = torchaudio.load(audio_buffer)
|
205 |
|
206 |
-
print(1, waveform.shape)
|
207 |
-
|
208 |
# Convert to mono channel
|
209 |
if waveform.shape[0] > 1:
|
210 |
waveform = waveform.mean(dim=0)
|
211 |
else:
|
212 |
waveform = np.squeeze(waveform)
|
213 |
-
print(2, waveform.shape)
|
214 |
|
215 |
# Convert to numpy array and normalize
|
216 |
input_wav = waveform.numpy().astype(np.float32)
|
217 |
-
print(3, input_wav.shape)
|
218 |
|
219 |
# Resample to 16kHz if needed
|
220 |
if sample_rate != 16000:
|
221 |
resampler = torchaudio.transforms.Resample(sample_rate, 16000)
|
222 |
input_wav = resampler(torch.from_numpy(input_wav)[None, :])[0, :].numpy()
|
223 |
|
224 |
-
print(4, input_wav.shape)
|
225 |
-
|
226 |
# Model inference
|
227 |
text = model.generate(
|
228 |
input=input_wav,
|
|
|
203 |
audio_buffer = BytesIO(audio_data)
|
204 |
waveform, sample_rate = torchaudio.load(audio_buffer)
|
205 |
|
|
|
|
|
206 |
# Convert to mono channel
|
207 |
if waveform.shape[0] > 1:
|
208 |
waveform = waveform.mean(dim=0)
|
209 |
else:
|
210 |
waveform = np.squeeze(waveform)
|
|
|
211 |
|
212 |
# Convert to numpy array and normalize
|
213 |
input_wav = waveform.numpy().astype(np.float32)
|
|
|
214 |
|
215 |
# Resample to 16kHz if needed
|
216 |
if sample_rate != 16000:
|
217 |
resampler = torchaudio.transforms.Resample(sample_rate, 16000)
|
218 |
input_wav = resampler(torch.from_numpy(input_wav)[None, :])[0, :].numpy()
|
219 |
|
|
|
|
|
220 |
# Model inference
|
221 |
text = model.generate(
|
222 |
input=input_wav,
|