File size: 4,201 Bytes
da95d3d
 
 
 
 
ef1a65c
da95d3d
 
 
 
 
 
 
ef1a65c
da95d3d
ef1a65c
da95d3d
 
 
0f4c2e3
 
af31d45
89a238b
3dd368d
ef1a65c
da95d3d
 
 
af31d45
98b2006
0f4c2e3
da95d3d
 
28b63dc
da95d3d
0f4c2e3
28b63dc
 
 
3d187fe
da95d3d
 
 
af31d45
3dd368d
89a238b
 
af31d45
 
da95d3d
 
 
 
3dd368d
da95d3d
 
 
 
 
af31d45
 
 
 
 
 
 
 
89a238b
464dfe8
af31d45
 
 
00c84fb
3dd368d
da95d3d
 
28b63dc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import gradio as gr
import librosa
import soundfile as sf
import torch
import warnings
import os 
from transformers import Wav2Vec2ProcessorWithLM, Wav2Vec2CTCTokenizer

warnings.filterwarnings("ignore")

#load wav2vec2 tokenizer and model

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

from fastapi import FastAPI, HTTPException, File

from transformers import pipeline




# All three ASR pipelines are created with the same chunk_length_s /
# stride_length_s settings, so those keyword arguments are kept in one place.
_CHUNKING = {"chunk_length_s": 20, "stride_length_s": (3, 3)}

pipe_95m = pipeline(
    model="Finnish-NLP/wav2vec2-base-fi-voxpopuli-v2-finetuned",
    **_CHUNKING,
)
pipe_300m = pipeline(
    model="Finnish-NLP/wav2vec2-large-uralic-voxpopuli-v2-finnish",
    **_CHUNKING,
)
pipe_1b = pipeline(
    model="Finnish-NLP/wav2vec2-xlsr-1b-finnish-lm-v2",
    **_CHUNKING,
)

# Casing/punctuation correction model; placed on the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_checkpoint = 'Finnish-NLP/t5-small-nl24-casing-punctuation-correction'

# The tokenizer is loaded without an auth token; only the model load below
# passes the (optional) `hf_token` environment variable.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_checkpoint,
    from_flax=False,
    torch_dtype=torch.float32,
    use_auth_token=os.environ.get('hf_token'),
)
model = model.to(device)

# define speech-to-text function
def asr_transcript(audio, audio_microphone, model_params):
    """Transcribe an audio clip and restore casing and punctuation.

    Args:
        audio: Uploaded audio file object, or None. Its ``name`` attribute
            (a file path) is what gets handed to the ASR pipeline.
        audio_microphone: Microphone recording in the same form, or None.
            When truthy it takes precedence over ``audio``.
        model_params: Which ASR model to run: "95 million", "300 million"
            or "1 billion".

    Returns:
        A 2-tuple ``(recognized_text, case_corrected_text)``. When no usable
        audio is given or ``model_params`` is not recognised, both elements
        carry a human-readable message instead, so the caller always
        receives exactly two values.
    """
    # A microphone recording wins over an uploaded file.
    audio = audio_microphone if audio_microphone else audio

    if audio is None and audio_microphone is None:
        return (
            "Please provide audio (wav or mp3) by uploading a file or by recording audio using microphone by pressing Record (And allow usage of microphone)",
            "Please provide audio by uploading a file or by recording audio using microphone by pressing Record (And allow usage of microphone)",
        )

    if not audio:
        # Return the message for both outputs; a single string here would not
        # match the two-value shape every other path returns.
        return "File not valid", "File not valid"

    # Resolve the pipeline before touching the audio so an unexpected
    # selection yields a message instead of a TypeError further down.
    if model_params == "1 billion":
        asr_pipeline = pipe_1b
    elif model_params == "300 million":
        asr_pipeline = pipe_300m
    elif model_params == "95 million":
        asr_pipeline = pipe_95m
    else:
        message = f"Unknown speech recognition model: {model_params!r}"
        return message, message

    recognized_text = asr_pipeline(audio.name)["text"]

    # Second stage: seq2seq model that rewrites the raw transcript with
    # casing and punctuation. Generation is limited by max_length=128.
    input_ids = tokenizer(recognized_text, return_tensors="pt").input_ids.to(device)
    outputs = model.generate(input_ids, max_length=128)
    case_corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return recognized_text, case_corrected_text
    
# Web UI: two optional audio inputs (file upload and microphone) plus a model
# selector, wired to asr_transcript, which fills the two output text boxes.
gradio_ui = gr.Interface(
    fn=asr_transcript,
    title="Finnish Automatic Speech Recognition",
    description="Upload an audio clip or record from browser using microphone, and let AI do the hard work of transcribing.",
    # The article text previously said "one of two" models and numbered the
    # list 1/3/3; it now matches the three choices offered in the dropdown.
    article = """
    This demo includes 2 kinds of models that are run together. First selected ASR model does speech recognition which produces lowercase text without punctuation.
    After that we run a sequence-to-sequence model which tries to correct casing and punctuation which produces the final output.
    You can select one of three speech recognition models listed below
    
    1. 1 billion, best accuracy but slowest by big margin. Based on multilingual wav2vec2-xlsr model by Meta. More info here https://huggingface.co/Finnish-NLP/wav2vec2-xlsr-1b-finnish-lm-v2
    2. 300 million, on par in accuracy with 1. but a lot faster. Based on Uralic wav2vec2 model by Meta. More info here https://huggingface.co/Finnish-NLP/wav2vec2-large-uralic-voxpopuli-v2-finnish
    3. 95 million, almost as accurate as 1. but really much faster. Based on Finnish wav2vec2  model by Meta. More info here https://huggingface.co/Finnish-NLP/wav2vec2-base-fi-voxpopuli-v2-finetuned
    
    More info about the casing+punctuation correction model can be found here https://huggingface.co/Finnish-NLP/t5-small-nl24-casing-punctuation-correction
    """,
    inputs=[
        gr.inputs.Audio(label="Upload Audio File", type="file", optional=True),
        gr.inputs.Audio(
            source="microphone",
            type="file",
            optional=True,
            label="Record from microphone",
        ),
        gr.inputs.Dropdown(
            choices=["95 million", "300 million", "1 billion"],
            type="value",
            default="300 million",
            label="Select speech recognition model parameter amount",
            optional=False,
        ),
    ],
    outputs=[
        gr.outputs.Textbox(label="Recognized speech"),
        gr.outputs.Textbox(label="Recognized speech with case correction and punctuation"),
    ],
)

gradio_ui.launch()