Spaces:

Finnish-NLP
/

Finnish-Automatic-Speech-Recognition

App Files Files Community

RasmusToivanen committed on May 17, 2022

Commit

af31d45

•

1 Parent(s): ab09d2c

add article, change to gradio 3, remove 300m model

Browse files

Files changed (1) hide show

app.py +19 -13

app.py CHANGED Viewed

@@ -19,18 +19,16 @@ from transformers import pipeline
-pipe_300m = pipeline(model="Finnish-NLP/wav2vec2-xlsr-300m-finnish-lm",chunk_length_s=20, stride_length_s=(3, 3))
-pipe_94m = pipeline(model="Finnish-NLP/wav2vec2-base-fi-voxpopuli-v2-finetuned",chunk_length_s=20, stride_length_s=(3, 3))
 pipe_1b = pipeline(model="Finnish-NLP/wav2vec2-xlsr-1b-finnish-lm-v2",chunk_length_s=20, stride_length_s=(3, 3))
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-model_checkpoint = 'Finnish-NLP/t5x-small-nl24-casing-punctuation-correction'
 tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_auth_token=os.environ.get('hf_token'))
 model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, from_flax=False, torch_dtype=torch.float32, use_auth_token=os.environ.get('hf_token')).to(device)
 # define speech-to-text function
 def asr_transcript(audio, audio_microphone, model_params):
@@ -38,16 +36,14 @@ def asr_transcript(audio, audio_microphone, model_params):
     audio = audio_microphone if audio_microphone else audio
     if audio == None and audio_microphone == None:
-        return "Please provide audio by uploading file or by recording audio with microphone by pressing Record (And allow usage of microphone)", "Please provide audio by uploading file or by recording audio with microphone by pressing Record (And allow usage of microphone)"
     text = ""
     if audio:
-        if model_params == "1 billion multi":
             text = pipe_1b(audio.name)
-        elif model_params == "94 million fi":
-            text = pipe_94m(audio.name)
-        elif model_params == "300 million multi":
-            text = pipe_300m(audio.name)
         input_ids = tokenizer(text['text'], return_tensors="pt").input_ids.to(device)
         outputs = model.generate(input_ids, max_length=128)
@@ -58,9 +54,19 @@ def asr_transcript(audio, audio_microphone, model_params):
 gradio_ui = gr.Interface(
     fn=asr_transcript,
-    title="Finnish automatic speech recognition",
-    description="Upload an audio clip, and let AI do the hard work of transcribing",
-    inputs=[gr.inputs.Audio(label="Upload Audio File", type="file", optional=True), gr.inputs.Audio(source="microphone", type="file", optional=True, label="Record from microphone"), gr.inputs.Dropdown(choices=["94 million fi", "300 million multi", "1 billion multi"], type="value", default="1 billion multi", label="Select speech recognition model parameter amount", optional=False)],
     outputs=[gr.outputs.Textbox(label="Recognized speech"),gr.outputs.Textbox(label="Recognized speech with case correction and punctuation")]
 )

+pipe_95m = pipeline(model="Finnish-NLP/wav2vec2-base-fi-voxpopuli-v2-finetuned",chunk_length_s=20, stride_length_s=(3, 3))
 pipe_1b = pipeline(model="Finnish-NLP/wav2vec2-xlsr-1b-finnish-lm-v2",chunk_length_s=20, stride_length_s=(3, 3))
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model_checkpoint = 'Finnish-NLP/t5-small-nl24-casing-punctuation-correction'
 tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_auth_token=os.environ.get('hf_token'))
 model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, from_flax=False, torch_dtype=torch.float32, use_auth_token=os.environ.get('hf_token')).to(device)
 # define speech-to-text function
 def asr_transcript(audio, audio_microphone, model_params):
     audio = audio_microphone if audio_microphone else audio
     if audio == None and audio_microphone == None:
+        return "Please provide audio by uploading a file or by recording audio using microphone by pressing Record (And allow usage of microphone)", "Please provide audio by uploading a file or by recording audio using microphone by pressing Record (And allow usage of microphone)"
     text = ""
     if audio:
+        if model_params == "1 billion":
             text = pipe_1b(audio.name)
+        elif model_params == "95 million":
+            text = pipe_95m(audio.name)
         input_ids = tokenizer(text['text'], return_tensors="pt").input_ids.to(device)
         outputs = model.generate(input_ids, max_length=128)
 gradio_ui = gr.Interface(
     fn=asr_transcript,
+    title="Finnish Automatic Speech Recognition",
+    description="Upload an audio clip or record from browser using microphone, and let AI do the hard work of transcribing.",
+    article = """
+    This demo includes 2 kinds of models that are run together. First selected ASR model does speech recognition which produces lowercase text without punctuation.
+    After that we run a sequence-to-sequence model which tries to correct casing and punctuation which produces the final output.
+    You can select one of two speech recognition models listed below
+    1. 1 billion, best accuracy but slowest by big margin. Based on multilingual wav2vec2-xlsr model by Meta. More info here https://huggingface.co/Finnish-NLP/wav2vec2-xlsr-1b-finnish-lm-v2
+    2. 95 million, almost as accurate as 1. but really much faster. Based on finnish wav2vec2-xlsr model by Meta. More info here https://huggingface.co/Finnish-NLP/wav2vec2-base-fi-voxpopuli-v2-finetuned
+    More info about the casing+punctuation correction model can be found here https://huggingface.co/Finnish-NLP/t5-small-nl24-casing-punctuation-correction
+    """,
+    inputs=[gr.inputs.Audio(label="Upload Audio File", type="file", optional=True), gr.inputs.Audio(source="microphone", type="file", optional=True, label="Record from microphone"), gr.inputs.Dropdown(choices=["95 million","1 billion"], type="value", default="1 billion", label="Select speech recognition model parameter amount", optional=False)],
     outputs=[gr.outputs.Textbox(label="Recognized speech"),gr.outputs.Textbox(label="Recognized speech with case correction and punctuation")]
 )