Spaces:

yavuzkomecoglu
/

Turkish-Speech-Recognition

Runtime error

App Files Files Community

yavuzkomecoglu committed on Oct 28, 2021

Commit

8c25af7

1 Parent(s): 108e46c

added Turkish Automatic Speech Recognition demo

Browse files

Files changed (8) hide show

app.py +34 -0
assets/samples/baris_ozcan_sample_1.m4a +0 -0
assets/samples/baris_ozcan_sample_2.wav +0 -0
assets/samples/baris_ozcan_sample_3.m4a +0 -0
assets/samples/common_voice_sample_1378.flac +0 -0
assets/samples/common_voice_sample_1589.flac +0 -0
requirements.txt +7 -0
utils.py +97 -0

app.py ADDED Viewed

	@@ -0,0 +1,34 @@

+import gradio as gr
+from utils import SpeechRecognition
+# Build the recognizer once at import time; load_model() returns the instance
+# itself, so the module-level `sp` used by recognition() is ready to use.
+sp = SpeechRecognition().load_model()
+def recognition(audio_file):
+    """Transcribe an uploaded audio file and return the recognized text.
+
+    Args:
+        audio_file: File-like object supplied by the Gradio audio input; only
+            its ``name`` attribute (the path on disk) is read. May be ``None``.
+
+    Returns:
+        The transcription produced by ``sp.predict_audio_file``, or an empty
+        string when no audio was provided.
+    """
+    # Guard against a submit without any audio attached, which would otherwise
+    # fail with AttributeError on `.name`.
+    if audio_file is None:
+        return ""
+    audio_path = audio_file.name
+    print("audio_file", audio_path)
+    # The sampling rate returned alongside the samples is not needed here.
+    speech, _ = sp.load_speech_with_file(audio_path)
+    result = sp.predict_audio_file(speech)
+    print(result)
+    return result
+# --- Gradio interface configuration -----------------------------------------
+# The audio input hands the callback a file object (type="file"); the output
+# is rendered as plain text.
+inputs = gr.inputs.Audio(label="Input Audio", type="file")
+outputs = "text"
+title = "Turkish Automatic Speech Recognition"
+description = (
+    "Demo for Turkish Automatic Speech Recognition with Huggingface wav2vec Turkish Model. "
+    "To use it, simply upload your audio, or click one of the examples to load them."
+)
+# HTML footer shown under the demo. The closing tags are balanced: the
+# paragraph contains three links and ends with a single </p>.
+article = (
+    "<p style='text-align: center'>This is the model for "
+    "<a href='https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-turkish' target='_blank'>"
+    "m3hrdadfi/wav2vec2-large-xlsr-turkish</a>, a fine-tuned "
+    "<a href='https://huggingface.co/facebook/wav2vec2-large-xlsr-53' target='_blank'>"
+    "facebook/wav2vec2-large-xlsr-53</a> model on the "
+    "<a href='https://commonvoice.mozilla.org/en/datasets' target='_blank'>"
+    "Turkish Common Voice dataset</a>.<br/>"
+    "When using this model, make sure that your speech input is sampled at 16kHz.</p>"
+)
+# Each example is a one-element row matching the single audio input.
+examples = [
+    ['assets/samples/common_voice_sample_1378.flac'],
+    ['assets/samples/common_voice_sample_1589.flac'],
+    ['assets/samples/baris_ozcan_sample_1.m4a'],
+    ['assets/samples/baris_ozcan_sample_2.wav'],
+    ['assets/samples/baris_ozcan_sample_3.m4a'],
+]
+demo = gr.Interface(
+    recognition,
+    inputs,
+    outputs,
+    title=title,
+    description=description,
+    article=article,
+    examples=examples,
+)
+demo.launch()

assets/samples/baris_ozcan_sample_1.m4a ADDED Viewed

Binary file (83.4 kB). View file

assets/samples/baris_ozcan_sample_2.wav ADDED Viewed

Binary file (812 kB). View file

assets/samples/baris_ozcan_sample_3.m4a ADDED Viewed

Binary file (67.2 kB). View file

assets/samples/common_voice_sample_1378.flac ADDED Viewed

Binary file (70 kB). View file

assets/samples/common_voice_sample_1589.flac ADDED Viewed

Binary file (57.3 kB). View file

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+gradio==2.2.6
+transformers
+datasets
+torchaudio
+librosa
+jiwer
+numpy ==1.20

utils.py ADDED Viewed

	@@ -0,0 +1,97 @@

+import librosa
+import torch
+import torchaudio
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+from datasets import load_dataset
+import numpy as np
+import re
+# Characters stripped from transcripts during normalization. Order and
+# duplicates are kept as-is; note that '""' is a single two-character entry.
+chars_to_ignore = [
+    # Basic punctuation, quotes and the Unicode replacement character.
+    ",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "�",
+    # Hash, guillemets, parentheses and the Arabic semicolon.
+    "#", "!", "?", "«", "»", "(", ")", "؛",
+    # Repeat of the basic punctuation set.
+    ",", "?", ".", "!", "-", ";", ":", '"',
+    # Typographic quotes, en dash, ellipsis and underscore.
+    "“", "%", "‘", "�", "–", "…", "_", "”", "“", "„",
+]
+# Invisible formatting code points (U+200C..U+200F and U+FEFF), each mapped
+# to a plain space.
+chars_to_mapping = dict.fromkeys(
+    ("\u200c", "\u200d", "\u200e", "\u200f", "\ufeff"),
+    " ",
+)
+class SpeechRecognition:
+    """Speech-to-text helper built on a fine-tuned wav2vec2 CTC checkpoint.
+
+    ``load_model`` must be called before any ``predict*`` method: it is the
+    only place that sets ``self.device``, ``self.processor`` and
+    ``self.model``.
+    """
+
+    # Hub identifier used for both the processor and the acoustic model.
+    MODEL_NAME = "m3hrdadfi/wav2vec2-large-xlsr-turkish"
+    # Sampling rate (Hz) audio is resampled to and declared to the processor.
+    SAMPLING_RATE = 16_000
+
+    def __init__(self):
+        print("init SpeechRecognition")
+
+    def load_model(self):
+        """Load the processor and model onto CUDA when available, else CPU.
+
+        Returns:
+            ``self``, so the call can be chained after construction.
+        """
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.processor = Wav2Vec2Processor.from_pretrained(self.MODEL_NAME)
+        self.model = Wav2Vec2ForCTC.from_pretrained(self.MODEL_NAME).to(self.device)
+        return self
+
+    def multiple_replace(self, text, chars_to_mapping):
+        """Replace every key of ``chars_to_mapping`` found in ``text`` with its value.
+
+        Args:
+            text: Value to process; it is converted with ``str()`` first.
+            chars_to_mapping: Mapping of literal substrings to replacements.
+
+        Returns:
+            The text with all mapped substrings replaced in a single pass.
+        """
+        text = str(text)
+        # An empty mapping would yield an empty pattern that matches at every
+        # position and then fails the lookup, so return the text unchanged.
+        if not chars_to_mapping:
+            return text
+        pattern = "|".join(map(re.escape, chars_to_mapping.keys()))
+        return re.sub(pattern, lambda m: chars_to_mapping[m.group()], text)
+
+    def remove_special_characters(self, text, chars_to_ignore_regex):
+        """Delete regex matches from ``text``, lowercase it and append one space.
+
+        Args:
+            text: Input string.
+            chars_to_ignore_regex: Regular expression whose matches are removed.
+
+        Returns:
+            The cleaned, lowercased text followed by a single trailing space.
+        """
+        return re.sub(chars_to_ignore_regex, "", text).lower() + " "
+
+    def normalizer(self, batch, chars_to_ignore, chars_to_mapping):
+        """Normalize ``batch["sentence"]`` in place and return the batch.
+
+        Args:
+            batch: Mapping with a ``"sentence"`` string entry.
+            chars_to_ignore: Iterable of literal strings whose characters are
+                removed from the sentence.
+            chars_to_mapping: Mapping of literal substrings to replacements.
+
+        Returns:
+            The same ``batch`` object with ``"sentence"`` rewritten.
+        """
+        # Escape every entry so the characters are matched literally. Joining
+        # them raw turns sequences such as "!", "-", ";" into the range `!-;`,
+        # which also strips digits and other unrelated symbols.
+        if chars_to_ignore:
+            chars_to_ignore_regex = "[" + "".join(map(re.escape, chars_to_ignore)) + "]"
+        else:
+            # "[]" is not a valid pattern; use one that can never match.
+            chars_to_ignore_regex = r"(?!)"
+        text = batch["sentence"].lower().strip()
+        # NOTE(review): U+0307 (combining dot above) is replaced by a space,
+        # not removed; this may split words after lowercasing a dotted
+        # capital I. Kept as-is; confirm against the intended normalization.
+        text = text.replace("\u0307", " ").strip()
+        text = self.multiple_replace(text, chars_to_mapping)
+        text = self.remove_special_characters(text, chars_to_ignore_regex)
+        batch["sentence"] = text
+        return batch
+
+    def speech_file_to_array_fn(self, batch):
+        """Load ``batch["path"]``, resample it and store it in ``batch["speech"]``.
+
+        Args:
+            batch: Mapping with a ``"path"`` entry pointing to an audio file.
+
+        Returns:
+            The same ``batch`` with a ``"speech"`` NumPy array added.
+        """
+        speech_array, sampling_rate = torchaudio.load(batch["path"])
+        speech_array = speech_array.squeeze().numpy()
+        # Sample rates are passed by keyword: recent librosa releases reject
+        # them as positional arguments, while older ones accept both forms.
+        speech_array = librosa.resample(
+            np.asarray(speech_array),
+            orig_sr=sampling_rate,
+            target_sr=self.SAMPLING_RATE,
+        )
+        batch["speech"] = speech_array
+        return batch
+
+    def _predict_ids(self, speech):
+        """Run the model on ``speech`` and return the argmax token ids."""
+        features = self.processor(
+            speech,
+            sampling_rate=self.SAMPLING_RATE,
+            return_tensors="pt",
+            padding=True,
+        )
+        input_values = features.input_values.to(self.device)
+        attention_mask = features.attention_mask.to(self.device)
+        # Inference only: no gradients are needed.
+        with torch.no_grad():
+            logits = self.model(input_values, attention_mask=attention_mask).logits
+        return torch.argmax(logits, dim=-1)
+
+    def predict(self, batch):
+        """Transcribe ``batch["speech"]`` and store the text in ``batch["predicted"]``.
+
+        Args:
+            batch: Mapping with a ``"speech"`` entry accepted by the processor.
+
+        Returns:
+            The same ``batch`` with ``"predicted"`` set to the first decoded
+            sequence.
+        """
+        pred_ids = self._predict_ids(batch["speech"])
+        batch["predicted"] = self.processor.batch_decode(pred_ids)[0]
+        return batch
+
+    def predict_audio_file(self, speech):
+        """Transcribe already-loaded audio samples.
+
+        Args:
+            speech: Audio samples accepted by the processor.
+
+        Returns:
+            The decoded transcription of the first sequence.
+        """
+        pred_ids = self._predict_ids(speech)
+        return self.processor.decode(pred_ids[0])
+
+    def load_speech_with_file(self, audio_file):
+        """Load an audio file with librosa at the model's sampling rate.
+
+        Args:
+            audio_file: Path of the audio file to read.
+
+        Returns:
+            A ``(speech, rate)`` tuple as returned by ``librosa.load``.
+        """
+        speech, rate = librosa.load(audio_file, sr=self.SAMPLING_RATE)
+        return speech, rate