File size: 1,123 Bytes
94ac2ac
f9a3e58
186a3b7
c1dd4e9
186a3b7
 
f9a3e58
62e68d5
f9a3e58
62e68d5
f9a3e58
62e68d5
f9a3e58
 
 
62e68d5
f9a3e58
 
 
 
 
06c4ac4
f9a3e58
c1dd4e9
f9a3e58
 
 
 
06c4ac4
f9a3e58
94ac2ac
f9a3e58
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import streamlit as st
import librosa
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq



# Load the Ukrainian Whisper checkpoint (feature processor + seq2seq model)
# at startup; Streamlit re-executes this script on every interaction.
processor = AutoProcessor.from_pretrained("Yehor/whisper-small-ukrainian")
model = AutoModelForSpeechSeq2Seq.from_pretrained("Yehor/whisper-small-ukrainian")

# Upload widget restricted to WAV files; evaluates to None until a file is chosen.
uploaded_file = st.file_uploader("上传文件", type="wav")

def map_to_pred(file_path):
    """Transcribe a WAV file with the Ukrainian Whisper model.

    Args:
        file_path: Path to a WAV audio file on disk.

    Returns:
        str: The normalized transcription text.
    """
    # Load AND resample to 16 kHz: Whisper expects 16 kHz audio, but
    # librosa.load defaults to sr=22050, which contradicted the
    # sampling_rate=16_000 declared to the processor below (bug fix).
    audio, _ = librosa.load(file_path, sr=16_000)

    # Convert the waveform into the model's log-mel input features.
    input_features = processor([audio], return_tensors="pt", sampling_rate=16_000).input_features
    generated_ids = model.generate(inputs=input_features)
    # Decode generated token ids back to text with normalization.
    transcription = processor.batch_decode(generated_ids, normalize=True, skip_special_tokens=True)
    # NOTE(review): _normalize is a private tokenizer API — works today but
    # may break on transformers upgrades; confirm before pinning versions.
    text = processor.tokenizer._normalize(transcription[0])

    return text
# Once the user has uploaded a file, persist it and run transcription.
if uploaded_file is not None:
    # Streamlit hands us an in-memory buffer, but librosa wants a path on
    # disk, so spill the uploaded bytes to a temporary WAV file first.
    temp_path = './temp.wav'
    with open(temp_path, 'wb') as out:
        out.write(uploaded_file.getbuffer())

    transcript = map_to_pred(temp_path)

    # Show the original filename next to the model's prediction.
    st.write('Input audio:', uploaded_file.name)
    st.write('Predicted standard:', transcript)