NuMessiah committed
Commit ef03f09 · 1 Parent(s): edbc770

Completely new app.py, use torchaudio

Files changed (3):
1. README.md +27 -0
2. app.py +35 -12
3. requirements.txt +2 -1
README.md CHANGED
@@ -12,3 +12,30 @@ short_description: Transcribing the audio file with Whisper
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+# Audio Transcription with Whisper
+
+This Hugging Face Space uses the `openai/whisper-large-v3` model to transcribe audio files (wav, m4a, mp3). It handles long audio files gracefully.
+
+## How to Use
+
+1. Upload your audio file or record audio directly in the browser.
+2. Click the "Transcribe" button.
+3. The transcribed text will be displayed in the textbox below.
+
+## Dependencies
+
+- `transformers`
+- `torch`
+- `torchaudio`
+- `gradio`
+
+## Model
+
+`openai/whisper-large-v3`
+
+## Notes
+
+- This space is designed to handle long audio files.
+- The audio is resampled to 16kHz if necessary.
+- Error messages are displayed if transcription fails.
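The Notes above summarize the resampling step implemented in app.py below. As a minimal standalone sketch of that step (the file path is a placeholder, and decoding mp3/m4a depends on which backend torchaudio finds, e.g. ffmpeg):

```python
import torchaudio

# Placeholder path; mp3/m4a decoding depends on torchaudio's backend (e.g. ffmpeg).
waveform, sample_rate = torchaudio.load("clip.mp3")

# Whisper checkpoints expect 16 kHz input, so resample anything else.
if sample_rate != 16000:
    waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)

# torchaudio returns a (channels, frames) tensor. For stereo files,
# .squeeze() alone keeps both channels, so downmix to mono first.
if waveform.shape[0] > 1:
    waveform = waveform.mean(dim=0, keepdim=True)
```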
app.py CHANGED
@@ -1,23 +1,46 @@
 import gradio as gr
 import torch
 from transformers import pipeline
+import torchaudio
+
+# Check for CUDA availability and set device
+if torch.cuda.is_available():
+    device = "cuda"
+else:
+    device = "cpu"
 
 # Load the Whisper pipeline
-transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base")  # Choose your Whisper size
+whisper_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3", device=device)
 
 def transcribe_audio(audio_file):
-    if audio_file is not None:
-        text = transcriber(audio_file)["text"]
-        return text
-    else:
-        return "No audio file uploaded"
+    if audio_file is None:
+        return "Please upload or record an audio file."
+
+    try:
+        # Load audio using torchaudio to handle various formats and long files
+        audio, sample_rate = torchaudio.load(audio_file)
+
+        # Resample if necessary (Whisper often expects 16kHz)
+        if sample_rate != 16000:
+            resampler = torchaudio.transforms.Resample(sample_rate, 16000)
+            audio = resampler(audio)
+
+        # Transcribe the audio
+        transcription = whisper_pipeline(audio.squeeze().numpy())["text"]  # .squeeze() removes extra dimensions
+
+        return transcription
+
+    except Exception as e:
+        return f"An error occurred: {e}"
 
 
 with gr.Blocks() as demo:
-    gr.Markdown("## Audio Transcription with Whisper")
-    audio_input = gr.Audio(type="filepath", label="Upload Audio File")
-    text_output = gr.Textbox(label="Transcription")
-    btn = gr.Button("Transcribe")
-    btn.click(transcribe_audio, inputs=audio_input, outputs=text_output, return_timestamps=True)
+    with gr.Row():
+        audio_input = gr.Audio(source="upload", type="filepath", label="Upload or Record Audio")
+
+    transcribe_button = gr.Button("Transcribe")
+    transcription_output = gr.Textbox(label="Transcription")
+
+    transcribe_button.click(transcribe_audio, inputs=audio_input, outputs=transcription_output)
 
-    demo.launch()
+demo.launch()
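Both the commit message and the README stress long audio, but a single pipeline call on a raw array covers only Whisper's 30-second window. A minimal sketch of chunked long-form decoding via the pipeline's `chunk_length_s` option (the path and settings here are illustrative, not part of this commit):

```python
import torch
import torchaudio
from transformers import pipeline

# chunk_length_s enables transformers' chunked long-form decoding;
# 30 s matches Whisper's native input window.
asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3",
    device="cuda" if torch.cuda.is_available() else "cpu",
    chunk_length_s=30,
)

waveform, sr = torchaudio.load("long_recording.wav")  # placeholder path
if sr != 16000:
    waveform = torchaudio.transforms.Resample(sr, 16000)(waveform)
print(asr(waveform.squeeze().numpy())["text"])
```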
requirements.txt CHANGED
@@ -1,3 +1,4 @@
 transformers
-gradio
 torch
+torchaudio
+gradio
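One compatibility note: `gr.Audio(source="upload", ...)` in the new app.py is the Gradio 3.x signature; Gradio 4 renamed the parameter to `sources` (a list). With gradio left unpinned here, a major-version pin would keep requirements.txt and app.py in step, e.g.:

```
transformers
torch
torchaudio
gradio<4
```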