Commit: Completely new app.py, use torchaudio

Files changed:
- README.md +27 -0
- app.py +35 -12
- requirements.txt +2 -1
README.md
CHANGED

@@ -12,3 +12,30 @@ short_description: Transcribing the audio file with Whisper
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+# Audio Transcription with Whisper
+
+This Hugging Face Space uses the `openai/whisper-large-v3` model to transcribe audio files (wav, m4a, mp3). It handles long audio files gracefully.
+
+## How to Use
+
+1. Upload your audio file or record audio directly in the browser.
+2. Click the "Transcribe" button.
+3. The transcribed text will be displayed in the textbox below.
+
+## Dependencies
+
+- `transformers`
+- `torch`
+- `torchaudio`
+- `gradio`
+
+## Model
+
+`openai/whisper-large-v3`
+
+## Notes
+
+- This space is designed to handle long audio files.
+- The audio is resampled to 16 kHz if necessary.
+- Error messages are displayed if transcription fails.
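The README names the `transformers` pipeline and the `openai/whisper-large-v3` checkpoint. A minimal standalone sketch of that flow, independent of the Space's UI (the file name is a hypothetical placeholder):

```python
from transformers import pipeline

# Load the checkpoint the README names; the weights download on first use.
asr = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3")

# The pipeline accepts a path to a local audio file directly (decoded via ffmpeg).
print(asr("sample.wav")["text"])  # "sample.wav" is an example path, not part of the Space
```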
app.py
CHANGED

@@ -1,23 +1,46 @@
 import gradio as gr
 import torch
 from transformers import pipeline
+import torchaudio
+
+# Check for CUDA availability and set device
+if torch.cuda.is_available():
+    device = "cuda"
+else:
+    device = "cpu"
 
 # Load the Whisper pipeline
+whisper_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3", device=device)
 
 def transcribe_audio(audio_file):
+    if audio_file is None:
+        return "Please upload or record an audio file."
+
+    try:
+        # Load audio using torchaudio to handle various formats and long files
+        audio, sample_rate = torchaudio.load(audio_file)
+
+        # Resample if necessary (Whisper expects 16 kHz input)
+        if sample_rate != 16000:
+            resampler = torchaudio.transforms.Resample(sample_rate, 16000)
+            audio = resampler(audio)
+
+        # Transcribe the audio
+        transcription = whisper_pipeline(audio.squeeze().numpy())["text"]  # .squeeze() removes extra dimensions
+
+        return transcription
+
+    except Exception as e:
+        return f"An error occurred: {e}"
 
 
 with gr.Blocks() as demo:
+    with gr.Row():
+        audio_input = gr.Audio(source="upload", type="filepath", label="Upload or Record Audio")
+
+    transcribe_button = gr.Button("Transcribe")
+    transcription_output = gr.Textbox(label="Transcription")
+
+    transcribe_button.click(transcribe_audio, inputs=audio_input, outputs=transcription_output)
 
+demo.launch()
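Two caveats in the committed `transcribe_audio`: a raw array passed to the pipeline only covers Whisper's native 30-second window unless chunking is requested explicitly, and `.squeeze()` leaves stereo input with two channels instead of downmixing it. A hedged sketch of both fixes; `chunk_length_s` is a real pipeline argument, but applying it here is a suggested change, not part of this commit:

```python
import torch
import torchaudio
from transformers import pipeline

device = "cuda" if torch.cuda.is_available() else "cpu"

# chunk_length_s splits long inputs into 30 s windows, so recordings longer than
# Whisper's native context are transcribed piecewise and stitched back together.
whisper_pipeline = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3",
    chunk_length_s=30,
    device=device,
)

def transcribe_long(audio_file: str) -> str:
    audio, sample_rate = torchaudio.load(audio_file)
    audio = audio.mean(dim=0)  # downmix to mono; .squeeze() alone keeps stereo as 2 channels
    if sample_rate != 16000:
        audio = torchaudio.transforms.Resample(sample_rate, 16000)(audio)
    # Pass the sampling rate explicitly rather than relying on the pipeline default.
    return whisper_pipeline({"raw": audio.numpy(), "sampling_rate": 16000})["text"]
```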
requirements.txt
CHANGED

@@ -1,3 +1,4 @@
 transformers
-gradio
 torch
+torchaudio
+gradio
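Nothing here is version-pinned, so the Space rebuilds against whatever releases are current. That matters for this app: `gr.Audio(source=...)` is Gradio 3.x API, and Gradio 4 renamed the parameter to `sources` (taking a list), so an unpinned `gradio` can break the UI on rebuild. A sketch of a pinned requirements.txt; the exact bounds are illustrative assumptions, not from the commit:

```text
transformers>=4.36   # a release with openai/whisper-large-v3 support (bound is illustrative)
torch
torchaudio
gradio>=3.50,<4.0    # keeps gr.Audio(source=...) valid; Gradio 4 renamed it to sources=
```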