flozi00 committed on
Commit
922cd73
·
1 Parent(s): 606f61c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -1
app.py CHANGED
@@ -3,12 +3,21 @@ import gradio as gr
3
  import re
4
  import torch
5
  from pyctcdecode import BeamSearchDecoderCTC
 
 
6
 
7
  lmID = "aware-ai/german-lowercase-wiki-4gram"
8
  decoder = BeamSearchDecoderCTC.load_from_hf_hub(lmID)
9
  p = pipeline("automatic-speech-recognition", model="aware-ai/robust-wav2vec2-xls-r-300m-german-lowercase", decoder=decoder)
10
  ttp = pipeline("text2text-generation", model="aware-ai/marian-german-grammar")
11
 
 
 
 
 
 
 
 
12
 
13
  #model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_1.2B")
14
  #tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_1.2B")
@@ -25,7 +34,11 @@ def translate(src, tgt, text):
25
  return result
26
 
27
  def transcribe(audio):
28
- transcribed = p(audio, chunk_length_s=20, stride_length_s=(6, 0))["text"]
 
 
 
 
29
 
30
  punctuated = ttp(transcribed, max_length = 512)[0]["generated_text"]
31
 
 
3
  import re
4
  import torch
5
  from pyctcdecode import BeamSearchDecoderCTC
6
+ import torch
7
+
8
 
9
  lmID = "aware-ai/german-lowercase-wiki-4gram"
10
  decoder = BeamSearchDecoderCTC.load_from_hf_hub(lmID)
11
  p = pipeline("automatic-speech-recognition", model="aware-ai/robust-wav2vec2-xls-r-300m-german-lowercase", decoder=decoder)
12
  ttp = pipeline("text2text-generation", model="aware-ai/marian-german-grammar")
13
 
14
+ vadmodel, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
15
+ model='silero_vad',
16
+ force_reload=False)
17
+
18
+ (get_speech_timestamps,
19
+ _, read_audio,
20
+ *_) = utils
21
 
22
  #model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_1.2B")
23
  #tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_1.2B")
 
34
  return result
35
 
36
  def transcribe(audio):
37
+ sampling_rate = 16000
38
+ audio, sr = librosa.load(audio, sr=sampling_rate)
39
+ speech_timestamps = get_speech_timestamps(audio, model, sampling_rate=sampling_rate)
40
+ chunks = [audio[i["start"]:i["end"]] for i in speech_timestamps]
41
+ transcribed = " ".join([text["text"] for text in p(chunks, chunk_length_s=20, stride_length_s=(0, 0))])
42
 
43
  punctuated = ttp(transcribed, max_length = 512)[0]["generated_text"]
44