Spaces:
Sleeping
Sleeping
File size: 1,516 Bytes
44f2969 7df6e8c 44f2969 1808ded 44f2969 e473647 1808ded e473647 1808ded 44f2969 1808ded 44f2969 1808ded 7df6e8c 1808ded 7df6e8c 1808ded e473647 44f2969 1808ded 44f2969 1808ded 44f2969 1808ded 44f2969 1808ded 44f2969 1808ded |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 |
import gradio as gr
import soundfile as sf
import torch
import numpy as np
import librosa
from transformers import AutoProcessor, Wav2Vec2BertForCTC
# Hugging Face Hub id of the fine-tuned checkpoint served by this demo.
MODEL_NAME = "mikr/w2v-bert-2.0-czech-colab-cv16"

# Run on CUDA device index 0 when a GPU is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"
print("device:", device)

# Load the processor and the CTC model once at start-up so that every
# transcription request reuses the same objects.
processor = AutoProcessor.from_pretrained(MODEL_NAME)
model = Wav2Vec2BertForCTC.from_pretrained(MODEL_NAME)
model = model.to(device)
def transcribe(audio_path):
    """Transcribe one uploaded audio file with the module-level CTC model.

    The file is loaded and resampled to 16 kHz with ``librosa.load``, turned
    into model inputs by ``processor``, run through ``model`` without
    gradient tracking, and decoded greedily (arg-max over the logits).

    Args:
        audio_path: Filesystem path of the audio file to transcribe. Gradio
            passes ``None`` when the form is submitted without a file.

    Returns:
        The decoded transcription of the file as a string.

    Raises:
        gr.Error: If no file was provided.
    """
    if not audio_path:
        # Without this guard the missing path reaches librosa.load and the
        # user only sees an opaque failure; gr.Error shows a readable message.
        raise gr.Error("Please upload an audio file first.")

    waveform, sample_rate = librosa.load(audio_path, sr=16_000)
    features = processor(
        waveform, sampling_rate=sample_rate, return_tensors="pt"
    ).input_features

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        logits = model(features.to(device)).logits

    # Greedy CTC decoding: take the most likely token id per frame, then let
    # the processor turn the id sequence into text.
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(predicted_ids)[0]
# Text shown under the title; contains a Markdown link to the checkpoint page.
_DESCRIPTION = (
    "Transcribe audio inputs with the click of a button! Demo uses the fine-tuned"
    f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) from Facebook W2v-BERT 2.0 speech encoder "
    "and 🤗 Transformers to transcribe audio files of arbitrary length."
)

# Single input widget: a file upload whose value is handed to `transcribe`
# as a filesystem path.
_audio_upload = gr.File(type="filepath", label="Upload Audio File")

# Web UI: one file input, one text output, flagging disabled.
iface = gr.Interface(
    fn=transcribe,
    inputs=[_audio_upload],
    outputs="text",
    title="Czech W2v-BERT 2.0 speech encoder demo - transcribe Czech Audio",
    description=_DESCRIPTION,
    theme="huggingface",
    allow_flagging="never",
)

# Listen on all network interfaces rather than only on localhost.
iface.launch(server_name="0.0.0.0")
|