MheniDevs cahya committed on
Commit
e917798
·
0 Parent(s):

Duplicate from indonesian-nlp/luganda-asr

Browse files

Co-authored-by: Cahya Wirawan <[email protected]>

Files changed (6) hide show
  1. .gitattributes +27 -0
  2. 5gram.bin +3 -0
  3. README.md +39 -0
  4. app.py +71 -0
  5. packages.txt +2 -0
  6. requirements.txt +8 -0
.gitattributes ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
5
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.model filter=lfs diff=lfs merge=lfs -text
12
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
13
+ *.onnx filter=lfs diff=lfs merge=lfs -text
14
+ *.ot filter=lfs diff=lfs merge=lfs -text
15
+ *.parquet filter=lfs diff=lfs merge=lfs -text
16
+ *.pb filter=lfs diff=lfs merge=lfs -text
17
+ *.pt filter=lfs diff=lfs merge=lfs -text
18
+ *.pth filter=lfs diff=lfs merge=lfs -text
19
+ *.rar filter=lfs diff=lfs merge=lfs -text
20
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
21
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
22
+ *.tflite filter=lfs diff=lfs merge=lfs -text
23
+ *.tgz filter=lfs diff=lfs merge=lfs -text
24
+ *.xz filter=lfs diff=lfs merge=lfs -text
25
+ *.zip filter=lfs diff=lfs merge=lfs -text
26
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
27
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
5gram.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46e982596dbb0c7c225dd9b88ef89c733ba6d718befc3c3b833b1daddc60816a
3
+ size 11939611
README.md ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Luganda ASR
3
+ emoji: 🌍
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: 3.0.5
8
+ app_file: app.py
9
+ pinned: false
10
+ duplicated_from: indonesian-nlp/luganda-asr
11
+ ---
12
+
13
+ # Configuration
14
+
15
+ `title`: _string_
16
+ Display title for the Space
17
+
18
+ `emoji`: _string_
19
+ Space emoji (only emoji characters are allowed)
20
+
21
+ `colorFrom`: _string_
22
+ Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
23
+
24
+ `colorTo`: _string_
25
+ Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
26
+
27
+ `sdk`: _string_
28
+ Can be either `gradio` or `streamlit`
29
+
30
+ `sdk_version`: _string_
31
+ Version of the selected SDK (`gradio` or `streamlit`).
32
+ See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
33
+
34
+ `app_file`: _string_
35
+ Path to your main application file (which contains either `gradio` or `streamlit` Python code).
36
+ Path is relative to the root of the repository.
37
+
38
+ `pinned`: _boolean_
39
+ Whether the Space stays on top of your list.
app.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import soundfile as sf
2
+ import torch
3
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
4
+ from pyctcdecode import build_ctcdecoder
5
+ import gradio as gr
6
+ import librosa
7
+ import os
8
+ from multiprocessing import Pool
9
+
10
+
11
class KenLM:
    """CTC beam-search decoder backed by an n-gram language model file.

    Wraps a ``pyctcdecode`` decoder built from a tokenizer's vocabulary and
    decodes batches of logits in a pool of worker processes.
    """

    def __init__(self, tokenizer, model_name, num_workers=8, beam_width=128):
        """Build the decoder.

        Args:
            tokenizer: Object exposing ``get_vocab()`` that returns a
                token -> id mapping.
            model_name: Language-model path handed to ``build_ctcdecoder``.
            num_workers: Number of processes in the pool used by ``decode``.
            beam_width: Beam width forwarded to the decoder in ``decode``.
        """
        self.num_workers = num_workers
        self.beam_width = beam_width
        vocab_dict = tokenizer.get_vocab()
        # Tokens ordered by ascending id.
        self.vocabulary = sorted(vocab_dict, key=vocab_dict.get)
        # Workaround for a vocabulary-size mismatch: drop the last two
        # entries. NOTE(review): presumably these are extra special tokens
        # that have no matching logit column -- confirm against the model.
        self.vocabulary = self.vocabulary[:-2]
        self.decoder = build_ctcdecoder(self.vocabulary, model_name)

    @staticmethod
    def lm_postprocess(text):
        """Return *text* with single-character words removed.

        Words are separated by exactly one space in the result; dropping a
        word must not leave a doubled space behind.
        """
        return ' '.join(word for word in text.split() if len(word) > 1)

    def decode(self, logits):
        """Decode a batch of logits into post-processed transcripts.

        Args:
            logits: Tensor-like object supporting ``.cpu().numpy()``; it is
                passed to the decoder as the batch of per-utterance logits.

        Returns:
            A list with one cleaned-up transcript string per batch item.
        """
        probs = logits.cpu().numpy()
        with Pool(self.num_workers) as pool:
            # Forward the configured beam width; without it the decoder
            # silently falls back to its own default.
            text = self.decoder.decode_batch(pool, probs, beam_width=self.beam_width)
        return [KenLM.lm_postprocess(x) for x in text]
32
+
33
+
34
def convert(inputfile, outfile):
    """Write a 16 kHz copy of an audio file.

    Args:
        inputfile: Path of the audio file to read.
        outfile: Path the 16 kHz audio is written to.
    """
    target_sr = 16000
    # Ask librosa for the target rate directly. Without ``sr`` it first
    # resamples to its own default rate, and a second resample to 16 kHz
    # would then be needed -- twice the work and an extra quality loss.
    data, _ = librosa.load(inputfile, sr=target_sr)
    sf.write(outfile, data, target_sr)
39
+
40
+
41
# Optional access token taken from the environment; None when unset.
api_token = os.environ.get("API_TOKEN")

# Hub identifier shared by the processor and the acoustic model.
model_name = "indonesian-nlp/wav2vec2-luganda"

processor = Wav2Vec2Processor.from_pretrained(
    model_name,
    use_auth_token=api_token,
)
model = Wav2Vec2ForCTC.from_pretrained(
    model_name,
    use_auth_token=api_token,
)

# Language-model decoder built on the processor's tokenizer and the
# 5-gram file shipped next to this script.
kenlm = KenLM(processor.tokenizer, "5gram.bin")
46
+
47
+
48
def parse_transcription(wav_file):
    """Transcribe an uploaded or recorded audio file.

    Args:
        wav_file: File-like object whose ``name`` attribute is the path of
            the audio on disk.

    Returns:
        The transcript string for the audio.
    """
    source_path = wav_file.name
    # Strip only the final extension. Splitting on the first '.' would cut
    # the path short whenever a directory or the file name contains a dot.
    stem, _ = os.path.splitext(source_path)
    resampled_path = stem + "16k.wav"
    try:
        convert(source_path, resampled_path)
        speech, _ = sf.read(resampled_path)
    finally:
        # The 16 kHz copy is only an intermediate; remove it so repeated
        # requests do not pile up files on disk.
        if os.path.exists(resampled_path):
            os.remove(resampled_path)
    input_values = processor(speech, sampling_rate=16_000, return_tensors="pt").input_values
    with torch.no_grad():
        logits = model(input_values).logits
    # A single utterance is decoded, so take the only batch item.
    transcription = kenlm.decode(logits)[0]
    return transcription
57
+
58
+
59
# Text box that shows the transcript returned by parse_transcription.
output = gr.outputs.Textbox(label="The transcript")

# Microphone recording, handed to the callback as a file object.
input_ = gr.inputs.Audio(source="microphone", type="file")

# Build the interface first, then launch it as a separate step.
demo = gr.Interface(
    parse_transcription,
    inputs=input_,
    outputs=[output],
    analytics_enabled=False,
    title="Automatic Speech Recognition for Luganda",
    description="Speech Recognition Live Demo for Luganda",
    article=(
        "This demo was built for the "
        "<a href='https://zindi.africa/competitions/mozilla-luganda-automatic-speech-recognition' target='_blank'>Mozilla Luganda Automatic Speech Recognition Competition</a>. "
        "It uses the <a href='https://huggingface.co/indonesian-nlp/wav2vec2-luganda' target='_blank'>indonesian-nlp/wav2vec2-luganda</a> model "
        "which was fine-tuned on Luganda Common Voice speech datasets."
    ),
    enable_queue=True,
)
demo.launch(
    inline=False,
    server_name="0.0.0.0",
    show_tips=False,
    enable_queue=True,
)
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ libsndfile1
2
+ sox
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ soundfile
3
+ torch
4
+ transformers
5
+ librosa
6
+ sentencepiece
7
+ pyctcdecode==0.3.0
8
+ kenlm @ https://github.com/kpu/kenlm/archive/master.zip