File size: 1,773 Bytes
da95d3d ef1a65c da95d3d ef1a65c da95d3d ef1a65c da95d3d ef1a65c da95d3d ef1a65c da95d3d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
import gradio as gr
import librosa
import soundfile as sf
import torch
import warnings
import os
from transformers import Wav2Vec2ProcessorWithLM, Wav2Vec2CTCTokenizer
warnings.filterwarnings("ignore")
# Load the wav2vec2 speech-recognition pipeline and the case-correction tokenizer and model
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from fastapi import FastAPI, HTTPException, File
from transformers import pipeline
# Speech-recognition pipeline; long recordings are processed in 20 s chunks
# with (4, 2) s of stride around each chunk.
pipe = pipeline(
    model="Finnish-NLP/wav2vec2-xlsr-1b-finnish-lm-v2",
    chunk_length_s=20,
    stride_length_s=(4, 2),
)

# Hugging Face access token, read once from the environment (None when unset).
# os.getenv() requires the variable name; calling it without one raises
# TypeError at import time.
token = os.getenv("hf_token")

# Run the case-correction model on the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The tokenizer comes from the base T5 checkpoint, while the weights come
# from the case-correction checkpoint.
model_checkpoint = "Finnish-NLP/t5x-small-nl24-finnish"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_auth_token=token)
model = AutoModelForSeq2SeqLM.from_pretrained(
    "Finnish-NLP/case_correction_model",
    from_flax=False,
    torch_dtype=torch.float32,
    use_auth_token=token,
).to(device)
# define speech-to-text function
def asr_transcript(audio):
    """Transcribe an uploaded audio clip and case-correct the transcript.

    Args:
        audio: The uploaded audio. Either an object exposing the file path
            as ``.name`` or the path itself as a string. Falsy values
            (e.g. ``None``) mean nothing was uploaded.

    Returns:
        A dict with the raw transcript under ``"text_asr"`` and the
        case-corrected transcript under ``"text_case_corrected"``, or the
        string ``"File not valid"`` when no audio was supplied.
    """
    if not audio:
        return "File not valid"

    # Accept both file-like uploads (path in ``.name``) and plain path strings.
    audio_path = getattr(audio, "name", audio)
    transcript = pipe(audio_path)["text"]

    # Run the raw transcript through the seq2seq model; output is capped at
    # 128 tokens.
    input_ids = tokenizer(transcript, return_tensors="pt").input_ids.to(device)
    outputs = model.generate(input_ids, max_length=128)
    case_corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return {"text_asr": transcript, "text_case_corrected": case_corrected_text}
# Web UI: one audio-upload input wired to asr_transcript, with the result
# shown in a single text box. The callback reads the upload's path from
# ``.name``, so the input is requested as a file object (type="file").
gradio_ui = gr.Interface(
    fn=asr_transcript,
    title="Speech-to-Text with HuggingFace+Wav2Vec2",
    description="Upload an audio clip, and let AI do the hard work of transcribing",
    inputs=gr.inputs.Audio(label="Upload Audio File", type="file"),
    outputs=gr.outputs.Textbox(label="Auto-Transcript"),
)

# Start the Gradio server.
gradio_ui.launch()