File size: 1,562 Bytes
ca62577
1358486
 
 
 
 
 
 
 
 
ca62577
 
 
 
 
 
1358486
ca62577
1358486
ca62577
1358486
 
ca62577
1358486
 
 
ca62577
1358486
ca62577
1358486
ca62577
 
 
 
 
 
 
 
 
 
1358486
 
 
 
 
 
 
 
 
ca62577
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import os
import tempfile

import gradio as gr
import soundfile as sf
from transformers import AutoModelForCTC
from transformers import Wav2Vec2Processor

from conversationalnlp.models.wav2vec2 import ModelLoader
from conversationalnlp.models.wav2vec2 import Wav2Vec2Predict
from conversationalnlp.utils import *

"""
run gradio with 
>>python app.py
"""

audiosavepath = os.getcwd()

pretrained_model = "codenamewei/speech-to-text"

processor = Wav2Vec2Processor.from_pretrained(
    pretrained_model, use_auth_token=True)

model = AutoModelForCTC.from_pretrained(
    pretrained_model,
    use_auth_token=True)

modelloader = ModelLoader(model, processor)

predictor = Wav2Vec2Predict(modelloader)


def greet(audioarray):
    """Transcribe a recorded audio clip to text.

    Parameters
    ----------
    audioarray : tuple
        Gradio "audio" input of the form ``(sample_rate, samples)``, e.g.
        ``(16000, array([-5277184, 326400, ...], dtype=int32))``.

    Returns
    -------
    str
        The predicted transcription and its corrected form, separated by a
        newline.
    """
    samplerate, samples = audioarray

    # WORKAROUND: the predictor expects a file on disk, so persist the raw
    # samples and hand it the path.  A unique temp name (instead of the old
    # fixed "temp.wav") keeps concurrent requests from clobbering each
    # other's audio.
    fd, audioabspath = tempfile.mkstemp(suffix=".wav", dir=audiosavepath)
    os.close(fd)
    try:
        sf.write(audioabspath, samples, samplerate)

        print(f"Audio at path {audioabspath}")
        predictiontexts = predictor.predictfiles([audioabspath])
        outputtext = predictiontexts["predicted_text"][-1] + \
            "\n" + predictiontexts["corrected_text"][-1]
    finally:
        # Best-effort cleanup of the scratch file (the original leaked it).
        try:
            os.remove(audioabspath)
        except OSError:
            pass

    return outputtext


# Build the Gradio UI: microphone/file audio in, transcription text out.
demo = gr.Interface(
    fn=greet,
    inputs="audio",
    outputs="text",
    title="Speech-to-Text",
)

# Serve locally; pass share=True to launch() to expose a public link.
demo.launch()