# from: https://gradio.app/real_time_speech_recognition/

from transformers import pipeline, Wav2Vec2CTCTokenizer, Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM
import pyctcdecode  # needed at runtime by Wav2Vec2ProcessorWithLM for CTC beam-search decoding
import kenlm  # n-gram language model backend used by pyctcdecode
import torch  # backend for the wav2vec2 model
import gradio as gr
import librosa
import os

# Load the model, the tokenizer, and the processor with the n-gram LM
token_key = os.environ.get("HUGGING_FACE_HUB_TOKEN")  # Hugging Face access token for the model repo
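# The token can be supplied via the environment before launching, e.g.:
#   export HUGGING_FACE_HUB_TOKEN=hf_xxx...   (placeholder value)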

model_name = "unilux/Wav2Vec2-large-xlsr-1b-LUXEMBOURGISH33-with-LM"

tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(model_name, use_auth_token=token_key)
model = Wav2Vec2ForCTC.from_pretrained(model_name, use_auth_token=token_key)
processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_name, use_auth_token=token_key)


pipe = pipeline("automatic-speech-recognition",
                model=model,
                tokenizer=tokenizer,
                feature_extractor=processor.feature_extractor,
                decoder=processor.decoder,
                use_auth_token=token_key)
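# For reference, a minimal sketch of what the pipeline does internally with
# LM-boosted decoding (illustrative only; `speech` would be a 16 kHz mono
# numpy array, e.g. the output of load_data() below):
#
#   inputs = processor(speech, sampling_rate=16_000, return_tensors="pt")
#   with torch.no_grad():
#       logits = model(inputs.input_values).logits
#   text = processor.batch_decode(logits.numpy()).text[0]  # beam search via pyctcdecode + kenlm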

def load_data(input_file):
  """Resample an audio file so the speech input is 16 kHz mono.

  Note: currently unused by the app; the ASR pipeline resamples
  filepath inputs itself.
  """
  sampling_rate = 16_000
  # read the file and resample to 16 kHz
  speech, _ = librosa.load(input_file, sr=sampling_rate, mono=True)
  # speech, _ = librosa.effects.trim(speech, top_db=10)  # optional: trim leading/trailing silence
  return speech
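# Example (uses one of the bundled sample files):
#   speech = load_data("Chamber2022_1.wav")  # 1-D numpy array sampled at 16 kHz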
    
def asr_pipe(input_file, input_file_microphone, chunks):
  """Transcribe the uploaded or recorded audio; the microphone recording wins if both are given."""
  input_file = input_file_microphone if input_file_microphone else input_file
  transcription = pipe(input_file, chunk_length_s=chunks)["text"]
  return transcription
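# Example call, mirroring what the interface does on submit
# (transcribes a bundled sample in 8-second chunks):
#   print(asr_pipe("Chamber2022_1.wav", None, 8))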

inputs = [gr.Audio(source="upload", type="filepath", label="Eng Audio-Datei eroplueden..."),  # "Upload an audio file..."
          gr.Audio(source="microphone", type="filepath", label="... oder direkt mam Mikro ophuelen"),  # "... or record directly with the microphone"
          gr.Slider(minimum=3, maximum=32, value=29, step=0.5, label="Chunk Length")]

outputs = [gr.Textbox(label="Erkannten Text")]  # "Recognized text"

samples = [["Chamber2022_1.wav", "Chamber2022_1.wav", 8],
           ["Chamber2022_2.wav", "Chamber2022_2.wav", 8],
           ["Chamber2022_3.wav", "Chamber2022_3.wav", 8],
           ["Erlieft-a-Verzielt.wav", "Erlieft-a-Verzielt.wav", 8]]

gr.Interface(fn=asr_pipe,
             inputs=inputs,
             outputs=outputs,
             # Title: "Speech recognition for Luxembourgish @uni.lu, based on wav2vec2 XLS-R-1B"
             title="Sproocherkennung fir d'Lëtzebuergescht @uni.lu, based on wav2vec2 XLS-R-1B",
             # Description: "This app converts your spoken language into (more or less correct ;-)) text!"
             description="Dës App convertéiert Är geschwate Sprooch an de (méi oder manner richtegen ;-)) Text!",
             examples=samples,
             examples_per_page=10,
             # Article: "Description: You can record yourself via the microphone, upload a file,
             # or pick an example. This model was trained with Meta's wav2vec 2.0 algorithm
             # with one billion parameters (wav2vec2-large-xls-r-1B)."
             article="Beschreiwung: Dir kënnt Iech selwer iwwer de Mikro ophuelen, eng Datei eroplueden oder e Beispill auswielen. Dëse Modell ass trainéiert mam wav2vec 2.0-Algorithmus vu Meta mat enger Milliard Parametern (wav2vec2-large-xls-r-1B).",
             theme="default").launch(share=False, show_error=True)