Commit: Completely new app.py, use torchaudio

Files changed:
- README.md +27 -0
- app.py +35 -12
- requirements.txt +2 -1
README.md
CHANGED

@@ -12,3 +12,30 @@ short_description: Transcribing the audio file with Whisper
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+# Audio Transcription with Whisper
+
+This Hugging Face Space uses the `openai/whisper-large-v3` model to transcribe audio files (wav, m4a, mp3). It handles long audio files gracefully.
+
+## How to Use
+
+1. Upload your audio file or record audio directly in the browser.
+2. Click the "Transcribe" button.
+3. The transcribed text will be displayed in the textbox below.
+
+## Dependencies
+
+- `transformers`
+- `torch`
+- `torchaudio`
+- `gradio`
+
+## Model
+
+`openai/whisper-large-v3`
+
+## Notes
+
+- This space is designed to handle long audio files.
+- The audio is resampled to 16 kHz if necessary.
+- Error messages are displayed if transcription fails.
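The README names the `transformers` pipeline and the `openai/whisper-large-v3` checkpoint. A minimal standalone sketch of that flow, independent of the Space's UI (the file name is a hypothetical placeholder):

```python
from transformers import pipeline

# Load the checkpoint the README names; the weights download on first use.
asr = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3")

# The pipeline accepts a path to a local audio file directly (decoded via ffmpeg).
print(asr("sample.wav")["text"])  # "sample.wav" is an example path, not part of the Space
```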
app.py
CHANGED

@@ -1,23 +1,46 @@
 import gradio as gr
 import torch
 from transformers import pipeline
+import torchaudio
+
+# Check for CUDA availability and set device
+if torch.cuda.is_available():
+    device = "cuda"
+else:
+    device = "cpu"
 
 # Load the Whisper pipeline
+whisper_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3", device=device)
 
 def transcribe_audio(audio_file):
+    if audio_file is None:
+        return "Please upload or record an audio file."
+
+    try:
+        # Load audio using torchaudio to handle various formats and long files
+        audio, sample_rate = torchaudio.load(audio_file)
+
+        # Resample if necessary (Whisper expects 16 kHz input)
+        if sample_rate != 16000:
+            resampler = torchaudio.transforms.Resample(sample_rate, 16000)
+            audio = resampler(audio)
+
+        # Transcribe the audio
+        transcription = whisper_pipeline(audio.squeeze().numpy())["text"]  # .squeeze() removes extra dimensions
+
+        return transcription
+
+    except Exception as e:
+        return f"An error occurred: {e}"
 
 
 with gr.Blocks() as demo:
+    with gr.Row():
+        audio_input = gr.Audio(source="upload", type="filepath", label="Upload or Record Audio")
+
+    transcribe_button = gr.Button("Transcribe")
+    transcription_output = gr.Textbox(label="Transcription")
+
+    transcribe_button.click(transcribe_audio, inputs=audio_input, outputs=transcription_output)
 
+demo.launch()
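Two caveats in the committed `transcribe_audio`: a raw array passed to the pipeline only covers Whisper's native 30-second window unless chunking is requested explicitly, and `.squeeze()` leaves stereo input with two channels instead of downmixing it. A hedged sketch of both fixes; `chunk_length_s` is a real pipeline argument, but applying it here is a suggested change, not part of this commit:

```python
import torch
import torchaudio
from transformers import pipeline

device = "cuda" if torch.cuda.is_available() else "cpu"

# chunk_length_s splits long inputs into 30 s windows, so recordings longer than
# Whisper's native context are transcribed piecewise and stitched back together.
whisper_pipeline = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3",
    chunk_length_s=30,
    device=device,
)

def transcribe_long(audio_file: str) -> str:
    audio, sample_rate = torchaudio.load(audio_file)
    audio = audio.mean(dim=0)  # downmix to mono; .squeeze() alone keeps stereo as 2 channels
    if sample_rate != 16000:
        audio = torchaudio.transforms.Resample(sample_rate, 16000)(audio)
    # Pass the sampling rate explicitly rather than relying on the pipeline default.
    return whisper_pipeline({"raw": audio.numpy(), "sampling_rate": 16000})["text"]
```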
requirements.txt
CHANGED

@@ -1,3 +1,4 @@
 transformers
-gradio
 torch
+torchaudio
+gradio
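Nothing here is version-pinned, so the Space rebuilds against whatever releases are current. That matters for this app: `gr.Audio(source=...)` is Gradio 3.x API, and Gradio 4 renamed the parameter to `sources` (taking a list), so an unpinned `gradio` can break the UI on rebuild. A sketch of a pinned requirements.txt; the exact bounds are illustrative assumptions, not from the commit:

```text
transformers>=4.36   # a release with openai/whisper-large-v3 support (bound is illustrative)
torch
torchaudio
gradio>=3.50,<4.0    # keeps gr.Audio(source=...) valid; Gradio 4 renamed it to sources=
```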