File size: 4,648 Bytes
653cae7
 
 
d0f88d8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
647230d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
710b824
647230d
 
 
 
 
 
d0f88d8
647230d
d0f88d8
 
 
647230d
 
 
 
 
 
d0f88d8
647230d
d0f88d8
 
 
 
 
67cc85c
d0f88d8
 
 
 
 
 
 
 
 
 
647230d
9f1b9c0
 
 
d0f88d8
647230d
 
 
 
 
 
 
 
 
 
 
 
 
d0f88d8
eed141f
d0f88d8
4fd2e4a
647230d
 
 
 
d0f88d8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# Force-downgrade Gradio before it is imported: the rest of this script uses
# the gradio 3.x API (gr.inputs.Audio, source=, optional=, enable_queue=),
# which was removed in later major versions. This runs at import time as a
# deliberate environment side effect (Hugging Face Spaces startup hack).
import os
import subprocess
import sys

# Use this interpreter's own pip (`sys.executable -m pip`) rather than a bare
# `pip` on PATH, which may belong to a different Python installation.
# check=False: best effort, same as the original os.system() calls which
# ignored the exit status.
subprocess.run([sys.executable, "-m", "pip", "uninstall", "-y", "gradio"], check=False)
subprocess.run([sys.executable, "-m", "pip", "install", "gradio==3.5"], check=False)
# import gradio as gr
# import torch
# import librosa
# import soundfile
# import nemo.collections.asr as nemo_asr
# import tempfile
# import os
# import uuid
# import wget
# model_url = 'https://storage.googleapis.com/vakyansh-open-models/conformer_models/hindi/filtered_v1_ssl_2022-07-08_19-43-25/Conformer-CTC-BPE-Large.nemo'
# wget.download(model_url)

# SAMPLE_RATE = 16000

# # Load pre-trained model
# model = nemo_asr.models.EncDecRNNTBPEModel.restore_from("./Conformer-CTC-BPE-Large.nemo")
# model.change_decoding_strategy(None)
# model.eval()


# def process_audio_file(file_path):
#     print(file_path)
#     print(SAMPLE_RATE)
#     # Load audio file
#     data, sr = librosa.load(file_path, sr=SAMPLE_RATE)

#     # # Resample if necessary
#     # if sr != SAMPLE_RATE:
#     #     data = librosa.resample(data, sr, SAMPLE_RATE)

#     # Convert to mono channel
#     data = librosa.to_mono(data)
#     return data


# def transcribe(audio):
#     # Handle warning message
    
#     # Process audio file
#     sr, data = audio

#     if sr != SAMPLE_RATE:
#         data = librosa.resample(data, sr, SAMPLE_RATE)

#     with tempfile.TemporaryDirectory() as tmpdir:
#         # Save audio data to a temporary file
#         audio_path = os.path.join(tmpdir, f'audio_{uuid.uuid4()}.wav')
#         soundfile.write(audio_path, data, SAMPLE_RATE)

#         # Transcribe audio
#         transcriptions = model.transcribe([audio_path])

#         # Extract best hypothesis if transcriptions form a tuple (from RNNT)
#         if isinstance(transcriptions, tuple) and len(transcriptions) == 2:
#             transcriptions = transcriptions[0]

#     return warn_output + transcriptions[0]


# iface = gr.Interface(
#     fn=transcribe,
#     inputs=gr.Audio(sources=["microphone"]),
#     outputs="textbox",
    
    
#     title="NeMo Conformer Transducer Large - English",
#     description="Demo for English speech recognition using Conformer Transducers",
#     allow_flagging='never',
# )
# iface.queue(max_size=10)
# iface.launch()




import gradio as gr
import torch
import librosa
import soundfile
import nemo.collections.asr as nemo_asr
import tempfile
import os
import uuid
import wget

# Pre-trained Hindi Conformer checkpoint (Vakyansh open models).
model_url = 'https://storage.googleapis.com/vakyansh-open-models/conformer_models/hindi/filtered_v1_ssl_2022-07-08_19-43-25/Conformer-CTC-BPE-Large.nemo'
# Download once. Unconditional wget.download() re-fetches the large checkpoint
# on every launch and, when the file already exists, saves a duplicate with a
# " (1)" suffix — which restore_from() below would never pick up.
if not os.path.exists('./Conformer-CTC-BPE-Large.nemo'):
    wget.download(model_url)

# Sample rate (Hz) the audio is resampled to before transcription.
SAMPLE_RATE = 16000

# Load pre-trained model
# NOTE(review): the checkpoint filename says "CTC-BPE" but it is restored via
# the RNNT (transducer) model class — presumably the checkpoint really is an
# RNNT model; confirm, since transcribe() relies on the RNNT-style tuple output.
model = nemo_asr.models.EncDecRNNTBPEModel.restore_from("./Conformer-CTC-BPE-Large.nemo")
model.change_decoding_strategy(None)  # keep the decoding strategy baked into the checkpoint
model.eval()  # inference mode (disables dropout etc.)


def process_audio_file(file_path):
    """Load an audio file as a mono waveform at ``SAMPLE_RATE``.

    Parameters
    ----------
    file_path : str
        Path to an audio file readable by librosa.

    Returns
    -------
    numpy.ndarray
        1-D float mono signal resampled to ``SAMPLE_RATE``.
    """
    # Load audio file (librosa's default target rate, 22050 Hz, applies here;
    # the explicit resample below brings it to SAMPLE_RATE).
    data, sr = librosa.load(file_path)

    # Resample if necessary. orig_sr/target_sr must be passed by keyword:
    # the positional form (data, sr, SAMPLE_RATE) was removed in librosa 0.10
    # and raises a TypeError there.
    if sr != SAMPLE_RATE:
        data = librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)

    # Convert to mono channel
    data = librosa.to_mono(data)
    return data


def transcribe(microphone_audio, uploaded_audio):
    """Transcribe speech from a microphone recording or an uploaded file.

    Both arguments are file paths (or a falsy value when the input was not
    provided). The microphone recording takes precedence when both are given,
    in which case a warning is prepended to the returned transcript.

    Returns the transcript string, or an error message when neither input
    was supplied.
    """
    # Guard clause: nothing to transcribe.
    if not microphone_audio and not uploaded_audio:
        return "ERROR: You have to either use the microphone or upload an audio file"

    warn_output = ""
    if microphone_audio and uploaded_audio:
        warn_output = ("WARNING: You've uploaded an audio file and used the microphone. "
                       "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n")

    # Microphone wins over the upload whenever it is present.
    chosen_file = microphone_audio or uploaded_audio

    # Normalize to a mono waveform at the model's sample rate.
    waveform = process_audio_file(chosen_file)

    with tempfile.TemporaryDirectory() as tmpdir:
        # Write the processed audio to a uniquely-named temporary WAV file.
        wav_path = os.path.join(tmpdir, f'audio_{uuid.uuid4()}.wav')
        soundfile.write(wav_path, waveform, SAMPLE_RATE)

        results = model.transcribe([wav_path])

        # RNNT models return a (best_hypotheses, all_hypotheses) pair;
        # keep only the best hypotheses.
        if isinstance(results, tuple) and len(results) == 2:
            results = results[0]

    return warn_output + results[0]


# Build the web UI: two optional audio inputs (microphone + file upload);
# transcribe() arbitrates when both are provided. The gr.inputs.* classes and
# the source=/optional= keywords are the gradio 3.x API, matching the
# gradio==3.5 pin installed at the top of this script.
# NOTE(review): title/description say "English" while the downloaded checkpoint
# URL is a Hindi Conformer model — confirm which is intended.
iface = gr.Interface(
    fn=transcribe,
    inputs=[gr.inputs.Audio(source="microphone", type='filepath', optional=True),
        gr.inputs.Audio(source="upload", type='filepath', optional=True)],
    outputs="text",
    
    title="NeMo Conformer Transducer Large - English",
    description="Demo for English speech recognition using Conformer Transducers",
    allow_flagging='never',
)
# enable_queue=True is the gradio 3.x way to serialize requests.
iface.launch(enable_queue=True)