Spaces:

demavior
/

whisper_gradio

Sleeping

demavior committed on Oct 1, 2024

Commit

a2978c1

verified ·

1 Parent(s): aaf280a

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -10,29 +10,6 @@ import numpy as np
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 def transcribe(audio):
-    # Extract the sample rate and audio data from the tuple
-    sample_rate, audio_data = audio
-    # Ensure the audio data is a numpy array
-    if not isinstance(audio_data, np.ndarray):
-        audio_data = np.array(audio_data)
-    # Convert to a tensor and ensure it's a floating-point type
-    audio_tensor = torch.tensor(audio_data, dtype=torch.float32)
-    # Convert to mono if the audio is stereo
-    if audio_tensor.ndim > 1:
-        audio_tensor = torch.mean(audio_tensor, dim=0, keepdim=True)
-    # Resample to 16kHz if necessary
-    if sample_rate != 16000:
-        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
-        audio_tensor = resampler(audio_tensor)
-    # Convert back to a NumPy array and ensure it's in the correct shape
-    audio_np = audio_tensor.cpu().numpy()
-    if audio_np.ndim == 2:
-        audio_np = audio_np
     pipe = pipeline(
         "automatic-speech-recognition",
@@ -40,13 +17,12 @@ def transcribe(audio):
         chunk_length_s=30,
         device=device,
     )
-    # prediction = pipe(audio_np)["text"]
-    ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
-    sample = ds[0]["audio"]
-    prediction = pipe(sample.copy(), batch_size=8)["text"]
     print(prediction)
     return prediction

 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 def transcribe(audio):
     pipe = pipeline(
         "automatic-speech-recognition",
         chunk_length_s=30,
         device=device,
     )
+    # ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    # sample = ds[0]["audio"]
+    # prediction = pipe(sample.copy(), batch_size=8)["text"]
+    prediction = pipe(audio)["text"]
     print(prediction)
     return prediction