akki2825 committed on
Commit
4720733
·
1 Parent(s): 94c1047

Update run.py

Browse files
Files changed (1) hide show
  1. run.py +27 -16
run.py CHANGED
@@ -2,6 +2,10 @@ from deepspeech import Model
2
  import gradio as gr
3
  import numpy as np
4
  import urllib.request
 
 
 
 
5
 
6
  model_file_path = "deepspeech-0.9.3-models.pbmm"
7
  lm_file_path = "deepspeech-0.9.3-models.scorer"
@@ -20,26 +24,33 @@ model.setScorerAlphaBeta(lm_alpha, lm_beta)
20
  model.setBeamWidth(beam_width)
21
 
22
 
23
def reformat_freq(sr, y):
    """Return ``(sample_rate, samples)`` at 16 kHz.

    16 kHz input is returned untouched. 48 kHz input is peak-normalised to
    the int16 range and downsampled by averaging each group of three
    consecutive samples.

    NOTE(review): assumes ``y`` is a 1-D (mono) array -- confirm with caller.

    Args:
        sr: Sample rate of ``y`` in Hz; must be 16000 or 48000.
        y: Array of audio samples.

    Returns:
        Tuple ``(16000, samples)``; ``samples`` is int16 when converted.

    Raises:
        ValueError: If ``sr`` is neither 16000 nor 48000.
    """
    if sr not in (48000, 16000):
        # Deepspeech only supports 16k, (we convert 48k -> 16k)
        raise ValueError("Unsupported rate", sr)
    if sr == 48000:
        # Work in float64 so abs() of int16 -32768 cannot overflow.
        samples = np.asarray(y, dtype=np.float64)
        # Drop a trailing partial group so the (-1, 3) reshape always works.
        usable = samples.size - samples.size % 3
        samples = samples[:usable]
        # Use the absolute peak: a negative-dominant signal must not scale
        # past the int16 range. The floor of 1 avoids division by zero.
        peak = max(np.max(np.abs(samples)), 1) if usable else 1
        y = (
            (samples / peak * 32767)
            .reshape((-1, 3))
            .mean(axis=1)
            .astype("int16")
        )
        sr = 16000
    return sr, y
38
 
39
 
40
  def transcribe(audio_file):
 
 
 
 
 
 
 
 
 
 
 
41
 
42
- text = model.stt(audio_file)
43
  return text
44
 
45
 
 
2
  import gradio as gr
3
  import numpy as np
4
  import urllib.request
5
+ import wave
6
+ import subprocess
7
+ import sys
8
+ import shlex
9
 
10
  model_file_path = "deepspeech-0.9.3-models.pbmm"
11
  lm_file_path = "deepspeech-0.9.3-models.scorer"
 
24
  model.setBeamWidth(beam_width)
25
 
26
 
27
def convert_samplerate(audio_path, desired_sample_rate):
    """Resample an audio file with SoX and return it as raw int16 samples.

    SoX is asked for headerless, mono, 16-bit signed little-endian PCM at
    ``desired_sample_rate`` on stdout.

    Args:
        audio_path: Path of the audio file to convert.
        desired_sample_rate: Target sample rate in Hz.

    Returns:
        Tuple ``(desired_sample_rate, samples)`` where ``samples`` is a
        read-only ``np.int16`` array over SoX's output.

    Raises:
        RuntimeError: If SoX exits with a non-zero status.
        OSError: If the ``sox`` executable cannot be launched.
    """
    # Pass an argv list directly: no shell is involved, so the path needs no
    # quoting (the previous code called an unimported ``quote`` and raised
    # NameError before SoX was ever run).
    sox_cmd = [
        'sox', audio_path,
        '--type', 'raw',
        '--bits', '16',
        '--channels', '1',
        '--rate', str(desired_sample_rate),
        '--encoding', 'signed-integer',
        '--endian', 'little',
        '--compression', '0.0',
        '--no-dither',
        '-',  # write to stdout
    ]
    try:
        output = subprocess.check_output(sox_cmd, stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        raise RuntimeError('SoX returned non-zero status: {}'.format(e.stderr)) from e
    except OSError as e:
        raise OSError(e.errno, 'SoX not found, use {}hz files or install it: {}'.format(desired_sample_rate, e.strerror)) from e

    return desired_sample_rate, np.frombuffer(output, np.int16)
37
+
 
 
 
 
38
 
39
 
40
  def transcribe(audio_file):
41
+ desired_sample_rate = model.sampleRate()
42
+ fin = wave.open(audio_file, 'rb')
43
+ fs_orig = fin.getframerate()
44
+ if fs_orig != desired_sample_rate:
45
+ print('Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'.format(fs_orig, desired_sample_rate), file=sys.stderr)
46
+ fs_new, audio = convert_samplerate(audio_file, desired_sample_rate)
47
+ else:
48
+ audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
49
+
50
+ audio_length = fin.getnframes() * (1/fs_orig)
51
+ fin.close()
52
 
53
+ text = model.stt(audio)
54
  return text
55
 
56