File size: 1,516 Bytes
44f2969
7df6e8c
44f2969
1808ded
 
 
44f2969
 
 
 
e473647
1808ded
e473647
1808ded
 
44f2969
 
1808ded
 
44f2969
1808ded
 
7df6e8c
1808ded
 
 
 
7df6e8c
1808ded
 
 
e473647
44f2969
 
1808ded
44f2969
1808ded
44f2969
 
 
1808ded
44f2969
1808ded
 
44f2969
 
 
 
 
1808ded
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import gradio as gr
import soundfile as sf
import torch
import numpy as np
import librosa
from transformers import AutoProcessor, Wav2Vec2BertForCTC

# Hugging Face Hub id of the fine-tuned checkpoint used for both the
# processor and the CTC model below.
MODEL_NAME = "mikr/w2v-bert-2.0-czech-colab-cv16"

# Use CUDA device index 0 when a GPU is visible, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"

print("device:", device)

# Loaded once at import time; transcribe() reuses these module-level objects.
processor = AutoProcessor.from_pretrained(MODEL_NAME)
model = Wav2Vec2BertForCTC.from_pretrained(MODEL_NAME)
model = model.to(device)


def transcribe(audio_path):
    """Transcribe a single audio file with the module-level CTC model.

    Args:
        audio_path: Filesystem path of the audio file to transcribe. The
            Gradio file input passes ``None`` when the user submits the form
            without uploading anything.

    Returns:
        The decoded transcription of the file as a string.

    Raises:
        gr.Error: If no file was provided or the file contains no audio
            samples, so the UI shows a readable message instead of an
            opaque failure from the audio loader or the model.
    """
    if not audio_path:
        raise gr.Error("Please upload an audio file before submitting.")

    # librosa decodes the file and resamples it to 16 kHz, the rate that is
    # then declared to the processor below.
    waveform, sample_rate = librosa.load(audio_path, sr=16_000)
    if waveform.size == 0:
        raise gr.Error("The uploaded file does not contain any audio samples.")

    input_features = processor(
        waveform, sampling_rate=sample_rate, return_tensors="pt"
    ).input_features

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        logits = model(input_features.to(device)).logits

    # Greedy decoding: keep the highest-scoring token id at every position.
    predicted_ids = torch.argmax(logits, dim=-1)

    # The batch holds exactly one file, so return its single transcription.
    return processor.batch_decode(predicted_ids)[0]


# Description text for the interface; it embeds a Markdown-style link to the
# checkpoint page built from MODEL_NAME.
_description = (
    "Transcribe audio inputs with the click of a button! Demo uses the fine-tuned"
    f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) from Facebook W2v-BERT 2.0 speech encoder "
    "and 🤗 Transformers to transcribe audio files of arbitrary length."
)

# Single file-upload input; type="filepath" hands transcribe() a path string.
_audio_upload = gr.File(type="filepath", label="Upload Audio File")

iface = gr.Interface(
    fn=transcribe,
    inputs=[_audio_upload],
    outputs="text",
    title="Czech W2v-BERT 2.0 speech encoder demo - transcribe Czech Audio",
    description=_description,
    theme="huggingface",
    allow_flagging="never",
)

# Listen on all network interfaces rather than only on localhost.
iface.launch(server_name="0.0.0.0")