import gradio as gr import librosa import soundfile as sf import torch import warnings import os from transformers import Wav2Vec2ProcessorWithLM, Wav2Vec2CTCTokenizer warnings.filterwarnings("ignore") #load wav2vec2 tokenizer and model from transformers import AutoTokenizer, AutoModelForSeq2SeqLM from fastapi import FastAPI, HTTPException, File from transformers import pipeline pipe_95m = pipeline(model="Finnish-NLP/wav2vec2-base-fi-voxpopuli-v2-finetuned",chunk_length_s=20, stride_length_s=(3, 3)) pipe_300m = pipeline(model="Finnish-NLP/wav2vec2-large-uralic-voxpopuli-v2-finnish",chunk_length_s=20, stride_length_s=(3, 3)) pipe_1b = pipeline(model="Finnish-NLP/wav2vec2-xlsr-1b-finnish-lm-v2",chunk_length_s=20, stride_length_s=(3, 3)) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model_checkpoint = 'Finnish-NLP/t5-small-nl24-casing-punctuation-correction' tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)#, use_auth_token=os.environ.get('hf_token')) model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, from_flax=False, torch_dtype=torch.float32, use_auth_token=os.environ.get('hf_token')).to(device) # define speech-to-text function def asr_transcript(audio, audio_microphone, model_params): audio = audio_microphone if audio_microphone else audio if audio == None and audio_microphone == None: return "Please provide audio (wav or mp3) by uploading a file or by recording audio using microphone by pressing Record (And allow usage of microphone)", "Please provide audio by uploading a file or by recording audio using microphone by pressing Record (And allow usage of microphone)" text = "" if audio: if model_params == "1 billion": text = pipe_1b(audio.name) elif model_params == "300 million": text = pipe_300m(audio.name) elif model_params == "95 million": text = pipe_95m(audio.name) input_ids = tokenizer(text['text'], return_tensors="pt").input_ids.to(device) outputs = model.generate(input_ids, max_length=128) case_corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True) return text['text'], case_corrected_text else: return "File not valid" gradio_ui = gr.Interface( fn=asr_transcript, title="Finnish Automatic Speech Recognition", description="Upload an audio clip or record from browser using microphone, and let AI do the hard work of transcribing.", article = """ This demo includes 2 kinds of models that are run together. First selected ASR model does speech recognition which produces lowercase text without punctuation. After that we run a sequence-to-sequence model which tries to correct casing and punctuation which produces the final output. You can select one of two speech recognition models listed below 1. 1 billion, best accuracy but slowest by big margin. Based on multilingual wav2vec2-xlsr model by Meta. More info here https://huggingface.co/Finnish-NLP/wav2vec2-xlsr-1b-finnish-lm-v2 3. 300 million, at bar in accuracy as 1. but a lot faster. Based on Uralic wav2vec2 model by Meta. More info here https://huggingface.co/Finnish-NLP/wav2vec2-large-uralic-voxpopuli-v2-finnish 3. 95 million, almost as accurate as 1. but really much faster. Based on Finnish wav2vec2 model by Meta. More info here https://huggingface.co/Finnish-NLP/wav2vec2-base-fi-voxpopuli-v2-finetuned More info about the casing+punctuation correction model can be found here https://huggingface.co/Finnish-NLP/t5-small-nl24-casing-punctuation-correction """, inputs=[gr.inputs.Audio(label="Upload Audio File", type="file", optional=True), gr.inputs.Audio(source="microphone", type="file", optional=True, label="Record from microphone"), gr.inputs.Dropdown(choices=["95 million","300 million", "1 billion"], type="value", default="300 million", label="Select speech recognition model parameter amount", optional=False)], outputs=[gr.outputs.Textbox(label="Recognized speech"),gr.outputs.Textbox(label="Recognized speech with case correction and punctuation")] ) gradio_ui.launch()