|
import gradio as gr |
|
import librosa |
|
import soundfile as sf |
|
import torch |
|
import warnings |
|
import os |
|
from transformers import Wav2Vec2ProcessorWithLM, Wav2Vec2CTCTokenizer |
|
|
|
# Install a blanket "ignore" filter so Python warnings are not shown.
warnings.filterwarnings("ignore")
|
|
|
|
|
|
|
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM |
|
|
|
from fastapi import FastAPI, HTTPException, File |
|
|
|
from transformers import pipeline |
|
|
|
|
|
# Speech-recognition pipeline. Audio is processed in 20 s chunks with
# stride_length_s=(4, 2) so long recordings do not have to fit in one pass.
pipe = pipeline(
    model="Finnish-NLP/wav2vec2-xlsr-1b-finnish-lm-v2",
    chunk_length_s=20,
    stride_length_s=(4, 2),
)

# Hugging Face access token, read from the `hf_token` environment variable
# (None when the variable is unset). os.getenv() requires the variable name;
# calling it with no argument raises TypeError at import time.
token = os.getenv("hf_token")

# Run the case-correction model on the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The tokenizer is loaded from a different checkpoint than the model weights.
model_checkpoint = 'Finnish-NLP/t5x-small-nl24-finnish'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_auth_token=token)
model = AutoModelForSeq2SeqLM.from_pretrained(
    'Finnish-NLP/case_correction_model',
    from_flax=False,
    torch_dtype=torch.float32,
    use_auth_token=token,
).to(device)
|
|
|
|
|
|
|
def asr_transcript(audio):
    """Transcribe an uploaded audio file and run case correction on the result.

    Args:
        audio: Object whose ``.name`` attribute is passed to the ASR pipeline
            (presumably the temporary file handed over by the Gradio audio
            input -- confirm against the UI configuration), or a falsy value
            when nothing usable was provided.

    Returns:
        A dict with ``"text_asr"`` (the pipeline's transcript) and
        ``"text_case_corrected"`` (the decoded seq2seq output), or the string
        ``"File not valid"`` when ``audio`` is falsy.
    """
    if not audio:
        return "File not valid"

    # The pipeline result is indexed by "text" to obtain the transcript.
    transcript = pipe(audio.name)["text"]

    input_ids = tokenizer(transcript, return_tensors="pt").input_ids.to(device)
    # Generation is capped at 128 tokens, so the corrected text for a long
    # transcript may be shorter than the raw one.
    outputs = model.generate(input_ids, max_length=128)
    case_corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return {"text_asr": transcript, "text_case_corrected": case_corrected_text}
|
|
|
# Web front end: a single audio upload goes in, a single text box comes out.
_audio_input = gr.inputs.Audio(label="Upload Audio File", type="file")
_transcript_output = gr.outputs.Textbox(label="Auto-Transcript")

gradio_ui = gr.Interface(
    fn=asr_transcript,
    inputs=_audio_input,
    outputs=_transcript_output,
    title="Speech-to-Text with HuggingFace+Wav2Vec2",
    description="Upload an audio clip, and let AI do the hard work of transcribing",
)

# Launched unconditionally when this module is executed.
gradio_ui.launch()