demavior committed on
Commit
8c7801f
·
verified ·
1 Parent(s): 61b6298

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -22
app.py CHANGED
@@ -1,10 +1,4 @@
1
- import gradio as gr
2
- from transformers import pipeline
3
- import torch
4
  import torchaudio
5
- import numpy as np
6
-
7
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
8
 
9
  def transcribe(audio):
10
  # Extract the sample rate and audio data from the tuple
@@ -14,13 +8,20 @@ def transcribe(audio):
14
  if not isinstance(audio_data, np.ndarray):
15
  audio_data = np.array(audio_data)
16
 
 
 
 
17
  # Convert to mono if the audio is stereo
18
- if audio_data.ndim > 1:
19
- audio_data = np.mean(audio_data, axis=0)
 
 
 
 
 
20
 
21
- # Reshape the audio data to match the expected input format (1, num_samples)
22
- if audio_data.ndim == 1:
23
- audio_data = np.expand_dims(audio_data, axis=0)
24
 
25
  pipe = pipeline(
26
  "automatic-speech-recognition",
@@ -29,15 +30,5 @@ def transcribe(audio):
29
  device=device,
30
  )
31
 
32
- prediction = pipe(audio_data)["text"]
33
  return prediction
34
-
35
- gradio_app = gr.Interface(
36
- fn=transcribe,
37
- inputs=gr.Audio(label="Input"),
38
- outputs=gr.Textbox(label="Result"),
39
- title="Transcribed",
40
- )
41
-
42
- if __name__ == "__main__":
43
- gradio_app.launch()
 
 
 
 
1
  import torchaudio
 
 
 
2
 
3
  def transcribe(audio):
4
  # Extract the sample rate and audio data from the tuple
 
8
  if not isinstance(audio_data, np.ndarray):
9
  audio_data = np.array(audio_data)
10
 
11
+ # Convert to a tensor
12
+ audio_tensor = torch.tensor(audio_data)
13
+
14
  # Convert to mono if the audio is stereo
15
+ if audio_tensor.ndim > 1:
16
+ audio_tensor = torch.mean(audio_tensor, dim=0, keepdim=True)
17
+
18
+ # Resample to 16kHz if necessary
19
+ if sample_rate != 16000:
20
+ resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
21
+ audio_tensor = resampler(audio_tensor)
22
 
23
+ # Ensure the audio tensor is on the correct device
24
+ audio_tensor = audio_tensor.to(device)
 
25
 
26
  pipe = pipeline(
27
  "automatic-speech-recognition",
 
30
  device=device,
31
  )
32
 
33
+ prediction = pipe(audio_tensor)["text"]
34
  return prediction