"""Gradio app that turns an audio clip into a phoneme transcription."""

import os
import shutil
import subprocess
from typing import Optional

import gradio as gr
import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Checkpoint used for both the processor and the CTC model.
MODEL_ID = "facebook/wav2vec2-lv-60-espeak-cv-ft"

# Audio is resampled to this rate on load, and the same rate is reported
# to the processor so it can validate its input.
SAMPLE_RATE = 16000


def _ensure_espeak() -> None:
    """Install the ``espeak`` system package unless it is already on PATH.

    Raises:
        subprocess.CalledProcessError: If an ``apt-get`` command fails.
        FileNotFoundError: If ``apt-get`` itself is not available.
    """
    # Skipping the install when the binary is present avoids a slow (and,
    # without root, failing) apt run on every start.
    if shutil.which("espeak") is not None:
        return
    subprocess.run(["apt-get", "update"], check=True)
    subprocess.run(["apt-get", "install", "-y", "espeak"], check=True)


# Install system dependencies before loading the processor.
# NOTE(review): presumably the phoneme tokenizer needs espeak at load time;
# confirm against the model card.
_ensure_espeak()

# Load model and processor once at start-up so requests only run inference.
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)


def audio2phoneme(audio_path: Optional[str]) -> str:
    """Transcribe the audio file at ``audio_path`` into phonemes.

    Args:
        audio_path: Path to an audio file, as supplied by the Gradio
            ``Audio`` component (``type="filepath"``). ``None`` or an empty
            string means no audio was provided.

    Returns:
        The decoded transcription(s) joined with single spaces.

    Raises:
        gr.Error: If no audio was provided.
    """
    if not audio_path:
        # Fail with a readable message instead of an opaque loader error.
        raise gr.Error("Please upload or record an audio clip first.")

    audio, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
    input_values = processor(
        audio,
        sampling_rate=sr,
        return_tensors="pt",
        padding=True,
    ).input_values

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy decoding: pick the highest-scoring token at every frame.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    return " ".join(transcription)


app = gr.Interface(
    fn=audio2phoneme,
    inputs=gr.Audio(sources=["upload", "microphone"], type="filepath"),
    outputs=gr.Textbox(
        label="Phoneme Transcription",
        show_copy_button=True,
        show_label=True,
    ),
    description="Get phonemes from audio",
    title="Audio to Phoneme Transcription using facebook/wav2vec2-lv-60-espeak-cv",
)

# Start the space.
app.launch(share=True)