peterkros committed on
Commit
4ca61bc
·
verified ·
1 Parent(s): 5c867bf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -9
app.py CHANGED
@@ -1,5 +1,5 @@
1
  import gradio as gr
2
- from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
3
  import torch
4
  import soundfile as sf
5
 
@@ -12,13 +12,14 @@ model = WhisperForConditionalGeneration.from_pretrained(model_name)
12
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
13
  model.to(device)
14
 
15
- # Function to handle transcription with language set to English by default
16
- def transcribe(audio_path):
17
- # Load audio from file
18
- audio, sampling_rate = sf.read(audio_path)
19
-
 
20
  # Process the audio to get input features
21
- input_features = processor(audio, sampling_rate=sampling_rate, return_tensors="pt").input_features.to(device)
22
 
23
  # Generate transcription with attention_mask and correct input_features
24
  attention_mask = torch.ones(input_features.shape, dtype=torch.long, device=device)
@@ -35,11 +36,11 @@ def transcribe(audio_path):
35
  # Create a Gradio Interface
36
  interface = gr.Interface(
37
  fn=transcribe,
38
- inputs=gr.Audio(sources="upload", type="filepath"),
39
  outputs="text",
40
  title="Whisper Speech-to-Text API",
41
  description="Upload an audio file and get a transcription using OpenAI's Whisper model from Hugging Face."
42
  )
43
 
44
  # Launch the interface as an API
45
- interface.launch()
 
1
  import gradio as gr
2
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
3
  import torch
4
  import soundfile as sf
5
 
 
12
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
13
  model.to(device)
14
 
15
+ def transcribe(audio):
16
+ # Gradio passes audio as a numpy array, so no need to load from file.
17
+ # If the input is a file path, load the audio from the file:
18
+ if isinstance(audio, str): # Assuming it's a file path
19
+ audio, sampling_rate = sf.read(audio)
20
+
21
  # Process the audio to get input features
22
+ input_features = processor(audio, sampling_rate=16000, return_tensors="pt").input_features.to(device)
23
 
24
  # Generate transcription with attention_mask and correct input_features
25
  attention_mask = torch.ones(input_features.shape, dtype=torch.long, device=device)
 
36
  # Create a Gradio Interface
37
  interface = gr.Interface(
38
  fn=transcribe,
39
+ inputs=gr.Audio(sources="upload", type="numpy"), # Correct handling of audio as numpy array
40
  outputs="text",
41
  title="Whisper Speech-to-Text API",
42
  description="Upload an audio file and get a transcription using OpenAI's Whisper model from Hugging Face."
43
  )
44
 
45
  # Launch the interface as an API
46
+ interface.launch()