Spaces:
Runtime error
Commit f3c4afb · 1 Parent(s): cc028cf
Update app.py
app.py CHANGED
@@ -1,36 +1,59 @@
-import soundfile as sf
-import torch
-from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 import gradio as gr
-import
+from transformers.file_utils import cached_path, hf_bucket_url
 import os
+from transformers import Wav2Vec2Processor, AutoModelForCTC
+from datasets import load_dataset
+import torch
+import kenlm
+import torchaudio
+
+cache_dir = './cache/'
+processor = Wav2Vec2ProcessorWithLM.from_pretrained("ahmedJaafari/Annarabic3.2", cache_dir=cache_dir, use_auth_token="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")
+processor2 = Wav2Vec2Processor.from_pretrained("ahmedJaafari/Annarabic3.2", cache_dir=cache_dir, use_auth_token="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")
+model = AutoModelForCTC.from_pretrained("ahmedJaafari/Annarabic3.2", cache_dir=cache_dir, use_auth_token="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")
+
+# define function to read in sound file
+def speech_file_to_array_fn(path, max_seconds=10):
+    batch = {"file": path}
+    speech_array, sampling_rate = torchaudio.load(batch["file"])
+    if sampling_rate != 16000:
+        transform = torchaudio.transforms.Resample(orig_freq=sampling_rate,
+                                                   new_freq=16000)
+        speech_array = transform(speech_array)
+    speech_array = speech_array[0]
+    if max_seconds > 0:
+        speech_array = speech_array[:max_seconds*16000]
+    batch["speech"] = speech_array.numpy()
+    batch["sampling_rate"] = 16000
+    return batch
+
+# tokenize
+def inference(audio):
+    # read in sound file
+    # load dummy dataset and read soundfiles
+    ds = speech_file_to_array_fn(audio.name)
+    # infer model
+    input_values = processor(
+        ds["speech"],
+        sampling_rate=ds["sampling_rate"],
+        return_tensors="pt"
+    ).input_values
+    # decode ctc output
+    with torch.no_grad():
+        logits = model(input_values).logits
+
+    #pred_ids = torch.argmax(logits, dim=-1)
+    h = logits.numpy()[0,:,:]
+    v = np.pad(h, [0, 2], mode='constant')
+
+    output = processor.decode(v).text
+
+    return output[:-4]
 
-
-
-
-
-
-
-
-model_name = "ahmedJaafari/Annarabic3.2"
-processor = Wav2Vec2Processor.from_pretrained(model_name, use_auth_token=api_token)
-model = Wav2Vec2ForCTC.from_pretrained(model_name, use_auth_token=api_token)
-def parse_transcription(wav_file):
-    filename = wav_file.name.split('.')[0]
-    convert(wav_file.name, filename + "16k.wav")
-    speech, _ = sf.read(filename + "16k.wav")
-    input_values = processor(speech, sampling_rate=16_000, return_tensors="pt").input_values
-    logits = model(input_values).logits
-    predicted_ids = torch.argmax(logits, dim=-1)
-    transcription = processor.decode(predicted_ids[0], skip_special_tokens=True)
-    return transcription
-output = gr.outputs.Textbox(label="The transcript")
-input_ = gr.inputs.Audio(source="microphone", type="file")
-gr.Interface(parse_transcription, inputs=input_, outputs=[output],
-             analytics_enabled=False,
-             show_tips=False,
-             theme='huggingface',
-             layout='vertical',
-             title="Speech Recognition for Darija",
-             description="Speech Recognition Live Demo for Darija",
-             enable_queue=True).launch( inline=False)
+inputs = gr.inputs.Audio(label="Input Audio", type="file")
+outputs = gr.outputs.Textbox(label="Output Text")
+title = "Annarabic Speech Recognition System"
+description = "Gradio demo for Annarabic ASR. To use it, simply upload your audio, or click one of the examples to load them. Read more at the links below. Currently supports .wav 16_000hz files"
+article = "<a href='https://huggingface.co/ahmedJaafari' target='_blank'>Pretrained model</a></p>"
+#examples=[['t1_0001-00010.wav'], ['t1_utt000000042.wav'], ['t2_0000006682.wav']]
+gr.Interface(inference, inputs, outputs, title=title, description=description, article=article).launch()
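
A likely cause of the "Runtime error" badge shown above: the new app.py references Wav2Vec2ProcessorWithLM and np but never imports either (the diff imports only Wav2Vec2Processor, and NumPy is missing entirely). A minimal sketch of a corrected import block, assuming Wav2Vec2ProcessorWithLM comes from transformers (it additionally requires the pyctcdecode package, alongside kenlm) and NumPy is bound to its usual np alias:

    import os
    import kenlm                # backend for the LM-boosted decoder
    import numpy as np          # provides the np.pad call used in inference()
    import torch
    import torchaudio
    import gradio as gr
    from datasets import load_dataset
    from transformers import (
        AutoModelForCTC,
        Wav2Vec2Processor,
        Wav2Vec2ProcessorWithLM,  # LM beam-search decoding; needs pyctcdecode
    )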
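Of the two processors loaded, only the LM-boosted one is used: processor.decode(v) hands the padded logit matrix to the language-model decoder and takes the transcript from its .text field, while processor2 (the plain Wav2Vec2Processor) is loaded but never referenced. If it was meant as a no-language-model fallback, a hypothetical greedy CTC decode with it, in the spirit of the commented-out pred_ids line, could look like this:

    # Hypothetical fallback, not in the commit: greedy CTC decoding with the
    # plain processor (argmax per frame, then collapse repeats and blanks).
    pred_ids = torch.argmax(logits, dim=-1)
    text = processor2.batch_decode(pred_ids)[0]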
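Separately, the added code hardcodes a Hugging Face user access token (use_auth_token="hf_…") in three places; a token committed like this is public and should be treated as compromised and revoked. A common alternative on Spaces, sketched here under the assumption of a repository secret named HF_TOKEN (that name is not part of the commit), is to read the token from the environment:

    import os
    from transformers import Wav2Vec2ProcessorWithLM

    # HF_TOKEN is a hypothetical Space secret name, not part of the original commit.
    api_token = os.environ.get("HF_TOKEN")

    cache_dir = './cache/'
    processor = Wav2Vec2ProcessorWithLM.from_pretrained(
        "ahmedJaafari/Annarabic3.2", cache_dir=cache_dir, use_auth_token=api_token
    )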