"""Gradio demo: speech recognition followed by case correction of the transcript."""

import gradio as gr
import librosa
import soundfile as sf
import torch
import warnings
import os
from transformers import Wav2Vec2ProcessorWithLM, Wav2Vec2CTCTokenizer

warnings.filterwarnings("ignore")

# Load the speech-recognition pipeline and the case-correction tokenizer/model.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from fastapi import FastAPI, HTTPException, File
from transformers import pipeline

# Audio is processed in 20 s chunks with (4 s, 2 s) left/right strides.
pipe = pipeline(
    model="Finnish-NLP/wav2vec2-xlsr-1b-finnish-lm-v2",
    chunk_length_s=20,
    stride_length_s=(4, 2),
)

# Hub access token, read once from the environment; None when the variable is
# unset. (The original called os.getenv() without a key, which raises
# TypeError at import time.)
token = os.getenv('hf_token')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_checkpoint = 'Finnish-NLP/t5x-small-nl24-finnish'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_auth_token=token)
model = AutoModelForSeq2SeqLM.from_pretrained(
    'Finnish-NLP/case_correction_model',
    from_flax=False,
    torch_dtype=torch.float32,
    use_auth_token=token,
).to(device)


# define speech-to-text function
def asr_transcript(audio):
    """Transcribe an uploaded audio file and case-correct the transcript.

    Args:
        audio: The value supplied by the Gradio audio input (configured with
            ``type="file"``). Only its ``name`` attribute is used, and it is
            passed straight to the speech-recognition pipeline.

    Returns:
        A dict with the raw transcript under ``"text_asr"`` and the
        case-corrected transcript under ``"text_case_corrected"``; or the
        string ``"File not valid"`` when ``audio`` is falsy.
    """
    if not audio:
        return "File not valid"

    transcription = pipe(audio.name)
    raw_text = transcription['text']

    # Run the transcript through the seq2seq model; output is capped at
    # 128 tokens.
    input_ids = tokenizer(raw_text, return_tensors="pt").input_ids.to(device)
    outputs = model.generate(input_ids, max_length=128)
    case_corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return {"text_asr": raw_text, "text_case_corrected": case_corrected_text}


gradio_ui = gr.Interface(
    fn=asr_transcript,
    title="Speech-to-Text with HuggingFace+Wav2Vec2",
    description="Upload an audio clip, and let AI do the hard work of transcribing",
    inputs=gr.inputs.Audio(label="Upload Audio File", type="file"),
    outputs=gr.outputs.Textbox(label="Auto-Transcript"),
)

gradio_ui.launch()