ahmedJaafari committed · Commit f3c4afb · 1 Parent(s): cc028cf

Update app.py

Files changed (1): app.py +56 -33
app.py CHANGED
@@ -1,36 +1,59 @@
-import soundfile as sf
-import torch
-from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 import gradio as gr
-import sox
 import os
 
-def convert(inputfile, outfile):
-    sox_tfm = sox.Transformer()
-    sox_tfm.set_output_format(
-        file_type="wav", channels=1, encoding="signed-integer", rate=16000, bits=16
-    )
-    sox_tfm.build(inputfile, outfile)
-api_token = os.getenv("AnnarabicToken")
-model_name = "ahmedJaafari/Annarabic3.2"
-processor = Wav2Vec2Processor.from_pretrained(model_name, use_auth_token=api_token)
-model = Wav2Vec2ForCTC.from_pretrained(model_name, use_auth_token=api_token)
-def parse_transcription(wav_file):
-    filename = wav_file.name.split('.')[0]
-    convert(wav_file.name, filename + "16k.wav")
-    speech, _ = sf.read(filename + "16k.wav")
-    input_values = processor(speech, sampling_rate=16_000, return_tensors="pt").input_values
-    logits = model(input_values).logits
-    predicted_ids = torch.argmax(logits, dim=-1)
-    transcription = processor.decode(predicted_ids[0], skip_special_tokens=True)
-    return transcription
-output = gr.outputs.Textbox(label="The transcript")
-input_ = gr.inputs.Audio(source="microphone", type="file")
-gr.Interface(parse_transcription, inputs=input_, outputs=[output],
-             analytics_enabled=False,
-             show_tips=False,
-             theme='huggingface',
-             layout='vertical',
-             title="Speech Recognition for Darija",
-             description="Speech Recognition Live Demo for Darija",
-             enable_queue=True).launch(inline=False)
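
The substance of this update is the decoding step: the removed code above takes a greedy argmax over the CTC logits, while the added code below rescores them with a kenlm language model via Wav2Vec2ProcessorWithLM. A minimal sketch of the two decode paths side by side (the function names and the logits tensor are illustrative, not part of the commit):

import torch

def greedy_decode(logits, processor):
    # old path: pick the best token per frame, collapse via the CTC tokenizer
    pred_ids = torch.argmax(logits, dim=-1)
    return processor.decode(pred_ids[0], skip_special_tokens=True)

def lm_decode(logits, processor_with_lm):
    # new path: beam search over the frame posteriors, scored with a kenlm LM
    return processor_with_lm.decode(logits.numpy()[0]).text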
 
 import gradio as gr
+from transformers.file_utils import cached_path, hf_bucket_url
 import os
+from transformers import Wav2Vec2Processor, Wav2Vec2ProcessorWithLM, AutoModelForCTC
+from datasets import load_dataset
+import torch
+import kenlm
+import torchaudio
+import numpy as np
+
+cache_dir = './cache/'
+processor = Wav2Vec2ProcessorWithLM.from_pretrained("ahmedJaafari/Annarabic3.2", cache_dir=cache_dir, use_auth_token="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")
+processor2 = Wav2Vec2Processor.from_pretrained("ahmedJaafari/Annarabic3.2", cache_dir=cache_dir, use_auth_token="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")
+model = AutoModelForCTC.from_pretrained("ahmedJaafari/Annarabic3.2", cache_dir=cache_dir, use_auth_token="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")
+
+# read a sound file into a mono 16 kHz array, truncated to max_seconds
+def speech_file_to_array_fn(path, max_seconds=10):
+    batch = {"file": path}
+    speech_array, sampling_rate = torchaudio.load(batch["file"])
+    if sampling_rate != 16000:
+        transform = torchaudio.transforms.Resample(orig_freq=sampling_rate,
+                                                   new_freq=16000)
+        speech_array = transform(speech_array)
+    speech_array = speech_array[0]
+    if max_seconds > 0:
+        speech_array = speech_array[:max_seconds * 16000]
+    batch["speech"] = speech_array.numpy()
+    batch["sampling_rate"] = 16000
+    return batch
+
+# transcribe one uploaded audio file
+def inference(audio):
+    # read in the sound file, resampled to 16 kHz
+    ds = speech_file_to_array_fn(audio.name)
+    # run the acoustic model
+    input_values = processor(
+        ds["speech"],
+        sampling_rate=ds["sampling_rate"],
+        return_tensors="pt"
+    ).input_values
+    with torch.no_grad():
+        logits = model(input_values).logits
+
+    # decode the CTC output with the LM-boosted decoder; the logits are
+    # zero-padded so their vocabulary axis matches the decoder's alphabet
+    # pred_ids = torch.argmax(logits, dim=-1)  # plain greedy decode, unused
+    h = logits.numpy()[0, :, :]
+    v = np.pad(h, [0, 2], mode='constant')
+
+    output = processor.decode(v).text
+
+    # drop the trailing characters that the padded frames decode to
+    return output[:-4]
 
+inputs = gr.inputs.Audio(label="Input Audio", type="file")
+outputs = gr.outputs.Textbox(label="Output Text")
+title = "Annarabic Speech Recognition System"
+description = "Gradio demo for Annarabic ASR. To use it, simply upload your audio or click one of the examples to load it. Read more at the links below. Currently supports .wav files sampled at 16,000 Hz."
+article = "<p><a href='https://huggingface.co/ahmedJaafari' target='_blank'>Pretrained model</a></p>"
+# examples=[['t1_0001-00010.wav'], ['t1_utt000000042.wav'], ['t2_0000006682.wav']]
+gr.Interface(inference, inputs, outputs, title=title, description=description, article=article).launch()
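
For reviewers who want to exercise the new inference path without the web UI, a quick sketch; the file name sample.wav and the SimpleNamespace stand-in for Gradio's upload object are assumptions, not part of the commit, and note that importing app.py still calls launch() at the end:

from types import SimpleNamespace  # stand-in for Gradio's uploaded-file object

fake_upload = SimpleNamespace(name="sample.wav")  # any local wav; resampled to 16 kHz internally
print(inference(fake_upload))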