pgilles committed
Commit d4cdf1c · 1 Parent(s): fc76259

Update app.py

Files changed (1)
  1. app.py +10 -25
app.py CHANGED
@@ -3,8 +3,6 @@
 from transformers import pipeline
 #, Wav2Vec2CTCTokenizer, Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM, WhisperFeatureExtractor, WhisperProcessor, WhisperModel, WhisperTokenizer
 import torch
-#import pyctcdecode
-#import kenlm
 import gradio as gr
 import librosa
 import os
@@ -12,15 +10,9 @@ import time
 
 #Loading the model and the tokenizer
 token_key = os.environ.get("HUGGING_FACE_HUB_TOKEN")
-#model_name = "unilux/wav2vec-xls-r-Luxembourgish20-with-LM"
-#model_name = "unilux/wav2vec-xlsr-300m-Luxembourgish-with-LM"
 
 model_name = "pgilles/whisper-large-v2-lb_cased_01"
 
-#tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(model_name, use_auth_token=token_key)
-#model = Wav2Vec2ForCTC.from_pretrained(model_name, use_auth_token=token_key)
-#processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_name, use_auth_token=token_key)
-
 #tokenizer = WhisperTokenizer.from_pretrained(model_name)
 #model = WhisperModel.from_pretrained(model_name)
 #processor = WhisperProcessor.from_pretrained(model_name)
@@ -28,12 +20,6 @@ model_name = "pgilles/whisper-large-v2-lb_cased_01"
 #p = pipeline("automatic-speech-recognition", model=model, tokenizer=tokenizer, feature_extractor=processor.feature_extractor, decoder=processor.decoder, use_auth_token=token_key)
 p = pipeline("automatic-speech-recognition", model=model_name, device=0, use_auth_token=token_key)
 
-#p = pipeline("automatic-speech-recognition", model=model_name, use_auth_token = token_key)
-#p = pipeline("automatic-speech-recognition", model=model_name, use_auth_token = True)
-
-#tokenizer = Wav2Vec2Tokenizer.from_pretrained(model_name)
-#model = Wav2Vec2ForCTC.from_pretrained(model_name)
-
 def load_data(input_file):
 
     """ Function for resampling to ensure that the speech input is sampled at 16KHz.
@@ -44,22 +30,21 @@ def load_data(input_file):
     #speech = librosa.effects.trim(speech, top_db= 10)
     return speech
 
-def asr_pipe(input_file):
-    load_data(input_file)
-    transcription = p(input_file, chunk_length_s=29)["text"]
+def asr_pipe(input_file, input_file_microphone, chunks):
+    input_file = input_file_microphone if input_file_microphone else input_file
+    transcription = p(input_file, chunk_length_s=chunks)["text"]
     return transcription
 
-#input1=gr.inputs.Audio(source="microphone", type='filepath', label="Click and Start Speaking...")
-#input2=gr.inputs.Audio(source="upload", type='filepath', label="Load your own file...")
-#inputs=[input1, input2]
 
-gr.Interface(asr_pipe,
-             #inputs = gr.inputs.Audio(source="microphone", type="filepath", optional=True, label="Hei kënnt Dir Är Sprooch iwwert de Mikro ophuelen"),
-             inputs=gr.inputs.Audio(source="upload", type='filepath', label="Load your own file..."),
-             #inputs = inputs,
+gr.Interface(fn=asr_pipe,
+             inputs=[
+                 gr.inputs.Audio(source="upload", type='filepath', label="Load your own file...", optional=True),
+                 gr.inputs.Audio(source="microphone", type="filepath", label="Record from microphone", optional=True),
+                 gr.Slider(minimum=3, maximum=32, value=30, step=1, label="Chunk Length")
+             ],
             outputs=gr.outputs.Textbox(label="Erkannten Text"),
             title="Sproocherkennung fir d'Lëtzebuergescht @uni.lu",
-             description="Dës App convertéiert Är geschwate Sprooch an de (méi oder manner richtegen ;-)) Text! Dir kënnt Iech selwer iwwer de Mikro ophuelen (am beschten 5 bis 10 Sekonnen), eng Datei eroplueden oder e Beispill ënnen auswielen.",
+             description="Dës App convertéiert Är geschwate Sprooch an de (méi oder manner richtegen ;-)) Text! Dir kënnt Iech selwer iwwer de Mikro ophuelen, eng Datei eroplueden oder e Beispill ënnen auswielen.",
             examples=[["Chamber2022_1.wav"], ["Chamber2022_2.wav"], ["Chamber2022_3.wav"], ["Chamber2022_4.wav"]],
             theme="default").launch()
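The commit settles on a single call into the transformers pipeline API, removing the last remnants of the commented-out wav2vec 2.0 + KenLM stack (the pyctcdecode/kenlm imports and the Wav2Vec2 tokenizer/model/processor lines). A minimal standalone sketch of the loading path that remains, assuming HUGGING_FACE_HUB_TOKEN is set and GPU 0 is available; "sample.wav" is a placeholder, not a file from the repo:

import os
from transformers import pipeline

# The model repo needs an access token; device=0 targets the first GPU.
token_key = os.environ.get("HUGGING_FACE_HUB_TOKEN")
p = pipeline(
    "automatic-speech-recognition",
    model="pgilles/whisper-large-v2-lb_cased_01",
    device=0,
    use_auth_token=token_key,
)

# chunk_length_s makes the pipeline split audio longer than one Whisper
# window into pieces and stitch the transcripts back together.
print(p("sample.wav", chunk_length_s=30)["text"])  # "sample.wav" is a placeholder path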
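load_data survives the commit but is now dead code: the old asr_pipe called it and discarded the result, and the new one does not call it at all, since the ASR pipeline decodes and resamples filepath inputs on its own. The hunk shows only the helper's docstring and return; a plausible body, inferred from the librosa import and the commented-out trim line rather than from the diff itself, would be:

import librosa

def load_data(input_file):
    """ Function for resampling to ensure that the speech input is sampled at 16KHz."""
    # librosa.load resamples to the requested rate when sr is passed explicitly
    speech, _sr = librosa.load(input_file, sr=16000)
    #speech = librosa.effects.trim(speech, top_db=10)
    return speech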
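The rewritten asr_pipe takes three positional parameters because Gradio passes component values in the order they appear in the inputs list; an optional Audio component hands over None when left empty, so the ternary prefers the microphone recording over an uploaded file. Roughly how a call looks when only the microphone was used (the temp path below is made up for illustration, and asr_pipe and p are the app's own objects):

# upload empty -> None, microphone -> a temporary .wav written by Gradio,
# slider -> its current value
text = asr_pipe(None, "/tmp/gradio/recording.wav", 30)

# which reduces to the direct pipeline call
text = p("/tmp/gradio/recording.wav", chunk_length_s=30)["text"]

Note that the interface mixes the legacy gr.inputs/gr.outputs namespace with the newer top-level gr.Slider; both still resolved in Gradio 3.x, where the legacy aliases were merely deprecated.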
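The new Chunk Length slider feeds chunk_length_s directly, replacing the hard-coded 29 of the old asr_pipe. Whisper consumes fixed windows of roughly 30 seconds, which is why the slider tops out just above that: the pipeline cuts longer audio into chunks of the chosen length, transcribes each, and merges the texts, so shorter chunks trade context for memory. A quick comparison using one of the repo's example files (the exact output depends on the model and may differ at chunk boundaries):

# transcribe the same recording at two chunk lengths, assuming the app's
# pipeline object p and the example file from the Space's working directory
for chunks in (10, 30):
    result = p("Chamber2022_1.wav", chunk_length_s=chunks)
    print(chunks, result["text"][:80])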