flozi00 committed
Commit 5f6cbd7 · Parent(s): c906256

Update app.py

Files changed (1)
  1. app.py +13 -8
app.py CHANGED
@@ -8,9 +8,9 @@ import librosa
 import time
 
 
-lmID = "aware-ai/german-lowercase-wiki-4gram"
+lmID = "aware-ai/german-lowercase-4gram-kenlm"
 decoder = BeamSearchDecoderCTC.load_from_hf_hub(lmID)
-p = pipeline("automatic-speech-recognition", model="aware-ai/robust-wav2vec2-xls-r-300m-german-lowercase", decoder=decoder)
+p = pipeline("automatic-speech-recognition", model="aware-ai/robust-wav2vec2-base-german-lowercase", decoder=decoder)
 ttp = pipeline("text2text-generation", model="aware-ai/marian-german-grammar")
 
 vadmodel, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
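
This hunk swaps the wiki-derived 4-gram for a KenLM build and moves the acoustic model from the xls-r-300m checkpoint to a base-sized one. A minimal sketch of exercising the reconfigured pipeline, assuming the aware-ai repos resolve and that "sample.wav" (a placeholder name, not part of the commit) exists locally:

from pyctcdecode import BeamSearchDecoderCTC
from transformers import pipeline

# Mirrors the setup in the hunk above: a KenLM-backed beam-search decoder
# replaces plain greedy CTC decoding in the ASR pipeline.
lmID = "aware-ai/german-lowercase-4gram-kenlm"
decoder = BeamSearchDecoderCTC.load_from_hf_hub(lmID)
p = pipeline(
    "automatic-speech-recognition",
    model="aware-ai/robust-wav2vec2-base-german-lowercase",
    decoder=decoder,
)
print(p("sample.wav")["text"])  # placeholder input file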
@@ -36,24 +36,28 @@ def translate(src, tgt, text):
     return result
 
 def transcribe(audio):
+    log = ""
     sampling_rate = 16000
     start_time = time.time()
     audio, sr = librosa.load(audio, sr=sampling_rate)
-    print("--- %s seconds audio loading ---" % (time.time() - start_time))
+    log += "--- %s seconds audio loading ---" + str(time.time() - start_time)
     start_time = time.time()
     speech_timestamps = get_speech_timestamps(audio, vadmodel, sampling_rate=sampling_rate)
-    print("--- %s seconds audio timestamps---" % (time.time() - start_time))
+    log += "\n--- %s seconds audio timestamps---" + str(time.time() - start_time)
     start_time = time.time()
     chunks = [audio[i["start"]:i["end"]] for i in speech_timestamps]
-    print("--- %s seconds audio chunking---" % (time.time() - start_time))
+    log += "\n--- %s seconds audio chunking---" + str(time.time() - start_time)
     start_time = time.time()
     transcribed = " ".join([text["text"] for text in p(chunks, chunk_length_s=20, stride_length_s=(0, 0))])
-    print("--- %s seconds audio transcription ---" % (time.time() - start_time))
+    log += "\n--- %s seconds audio transcription ---" + str(time.time() - start_time)
     start_time = time.time()
     punctuated = ttp(transcribed, max_length = 512)[0]["generated_text"]
-    print("--- %s seconds audio formatting ---" % (time.time() - start_time))
+    log += "\n--- %s seconds audio formatting ---" + str(time.time() - start_time)
+    start_time = time.time()
+    p(audio, chunk_length_s=20, stride_length_s=(0, 0))
+    log += "\n--- %s seconds full asr ---" + str(time.time() - start_time)
 
-    return transcribed, punctuated
+    return transcribed, punctuated, log
 
 def get_asr_interface():
     return gr.Interface(
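
Two things stand out in the rewritten transcribe. The added full-asr block runs the pipeline once more over the unchunked audio and discards the result, which reads as a timing comparison against the VAD-chunked path. Also, each new log line concatenates the literal format string with str(...), so the unfilled %s placeholder lands in the log verbatim. A sketch of an interpolating helper (the stamp name is invented here):

import time

def stamp(log, label, start_time):
    # Append elapsed seconds via f-string interpolation rather than
    # concatenating them after a literal, unfilled "%s" placeholder.
    return log + f"\n--- {time.time() - start_time} seconds {label} ---"

# Usage inside transcribe, e.g.: log = stamp(log, "audio loading", start_time)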
@@ -62,6 +66,7 @@ def get_asr_interface():
         gr.inputs.Audio(source="microphone", type="filepath")
     ],
     outputs=[
+        "textbox",
         "textbox",
         "textbox"
     ])
 
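The third "textbox" output pairs with the new log value in the return tuple. A launch sketch in the Gradio 2.x-style gr.inputs API the file already uses; fn=transcribe and the launch() call are assumptions, since the diff does not show them:

import gradio as gr

interface = gr.Interface(
    fn=transcribe,  # assumed: the diff does not show the fn argument
    inputs=[gr.inputs.Audio(source="microphone", type="filepath")],
    outputs=["textbox", "textbox", "textbox"],  # transcript, punctuated text, timing log
)
interface.launch()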