vakyansh commited on
Commit
647230d
·
1 Parent(s): 27cd801

Adding hindi conformer model

Browse files
Files changed (4) hide show
  1. app.py +81 -0
  2. packages.txt +2 -0
  3. pre-requirements.txt +2 -0
  4. requirements.txt +2 -0
app.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import librosa
4
+ import soundfile
5
+ import nemo.collections.asr as nemo_asr
6
+ import tempfile
7
+ import os
8
+ import uuid
9
+ import wget
10
model_url = 'https://storage.googleapis.com/vakyansh-open-models/conformer_models/hindi/filtered_v1_ssl_2022-07-08_19-43-25/Conformer-CTC-BPE-Large.nemo'
model_path = os.path.basename(model_url)  # 'Conformer-CTC-BPE-Large.nemo'

# Only fetch the checkpoint if it is not already on disk — it is large and
# wget.download() would otherwise re-download it on every app restart.
if not os.path.exists(model_path):
    wget.download(model_url)

SAMPLE_RATE = 16000  # Hz; the rate the Conformer model was trained on

# Load the local .nemo checkpoint.
# BUG FIX: from_pretrained() expects the *name* of a pretrained model hosted
# on NGC/HF, not a local file path — a downloaded .nemo archive must be loaded
# with restore_from(). ASRModel.restore_from() dispatches to the concrete
# model class recorded in the checkpoint config (here a Conformer-CTC model,
# not the RNNT class the original code used).
model = nemo_asr.models.ASRModel.restore_from(model_path)
model.change_decoding_strategy(None)  # use the checkpoint's default decoding
model.eval()  # inference mode: disable dropout etc.
19
+
20
+
21
def process_audio_file(file_path):
    """Load *file_path* as a mono float32 waveform at SAMPLE_RATE.

    Args:
        file_path: path to any audio file librosa/soundfile can decode.

    Returns:
        1-D numpy array of samples at SAMPLE_RATE, down-mixed to mono.
    """
    # BUG FIX: the original called librosa.resample(data, sr, SAMPLE_RATE)
    # with positional arguments, which were deprecated in librosa 0.8 and
    # removed in 0.10 (raises TypeError: resample() takes 0 positional
    # arguments). Passing sr= and mono= to librosa.load performs the
    # resampling and mono down-mix in a single, version-safe call.
    data, _ = librosa.load(file_path, sr=SAMPLE_RATE, mono=True)
    return data
32
+
33
+
34
def transcribe(microphone_audio, uploaded_audio):
    """Transcribe speech from the microphone recording or the uploaded file.

    When both inputs are supplied, the microphone recording takes precedence
    and a warning is prepended to the returned transcript. When neither is
    supplied, an error message is returned instead of a transcript.
    """
    warn_output = ""
    if microphone_audio and uploaded_audio:
        warn_output = ("WARNING: You've uploaded an audio file and used the microphone. "
                       "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n")

    # Guard clause: nothing to transcribe at all.
    if not microphone_audio and not uploaded_audio:
        return "ERROR: You have to either use the microphone or upload an audio file"

    # Microphone wins whenever it is present; otherwise fall back to the upload.
    audio_file = microphone_audio or uploaded_audio

    audio_data = process_audio_file(audio_file)

    with tempfile.TemporaryDirectory() as tmpdir:
        # Write the normalized waveform to a uniquely-named temporary WAV file
        # so the model can read it from disk.
        wav_path = os.path.join(tmpdir, f'audio_{uuid.uuid4()}.wav')
        soundfile.write(wav_path, audio_data, SAMPLE_RATE)

        transcriptions = model.transcribe([wav_path])

    # RNNT models return a (best_hypotheses, all_hypotheses) pair; keep only
    # the best hypotheses so indexing below works for both model families.
    if isinstance(transcriptions, tuple) and len(transcriptions) == 2:
        transcriptions = transcriptions[0]

    return warn_output + transcriptions[0]
66
+
67
+
68
# Build and launch the Gradio demo UI.
# BUG FIX: the title/description claimed an *English Conformer Transducer*,
# but the checkpoint downloaded above is Vakyansh's *Hindi Conformer-CTC*
# model (see model_url and the commit message) — labels corrected to match.
iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.inputs.Audio(source="microphone", type='filepath', optional=True),
        gr.inputs.Audio(source="upload", type='filepath', optional=True),
    ],
    outputs="text",
    layout="horizontal",
    theme="huggingface",
    title="Vakyansh Conformer CTC Large - Hindi",
    description="Demo for Hindi speech recognition using a Conformer CTC model",
    allow_flagging='never',
)
iface.launch(enable_queue=True)
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ ffmpeg
2
+ libsndfile1
pre-requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Cython
2
+ torch
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ nemo_toolkit[asr]
2
+ wget