Update README.md

README.md
## How to Use the Model:

To train, fine-tune, or play with the model, you will need to install [NVIDIA NeMo](https://github.com/NVIDIA/NeMo).

```bash
pip install -U nemo_toolkit['asr']
```

The model is available for use in the NeMo toolkit [2], and can be used as a pre-trained checkpoint for inference.

### Automatically load the model

```python
import torch
import nemo.collections.asr as nemo_asr

vad_model = nemo_asr.models.EncDecFrameClassificationModel.from_pretrained(model_name="nvidia/frame_vad_multilingual_marblenet_v2.0")

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vad_model = vad_model.to(device)
vad_model.eval()
```
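If the checkpoint has already been downloaded as a local `.nemo` file, the model can also be restored from that path instead of the model hub. A minimal sketch (the filename below is a placeholder, not a file shipped with this card):

```python
# Sketch: load from a locally downloaded .nemo checkpoint.
# The path is a placeholder; point it at wherever you saved the checkpoint.
local_vad_model = nemo_asr.models.EncDecFrameClassificationModel.restore_from(
    restore_path="frame_vad_multilingual_marblenet_v2.0.nemo"
)
```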

### Inference with PyTorch

First, let's get a sample:

```bash
wget https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav
```

Then run the following:

```python
import librosa

# Load the audio
input_signal = librosa.load("2086-149220-0033.wav", sr=16000, mono=True)[0]
input_signal = torch.tensor(input_signal).unsqueeze(0).float()
input_signal_length = torch.tensor([input_signal.shape[1]]).long()

# Perform inference
with torch.no_grad():
    torch_outputs = vad_model(
        input_signal=input_signal.to(device),
        input_signal_length=input_signal_length.to(device)
    ).cpu()

# Check output dimensions
B, T, C = torch_outputs.shape
assert C == 2, "Output channels should be 2"
```
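The forward pass returns per-frame output of shape `[B, T, 2]`. For a quick look at the predictions without the full NeMo post-processing pipeline, the sketch below applies a softmax and thresholds the speech-class probability. It rests on assumptions not stated in this card: that the forward pass returns raw logits, that class index 1 is the speech class, that the output frame shift is 20 ms (matching `vad.parameters.shift_length_in_sec=0.02` used in the RTTM example further down), and that 0.5 is a reasonable illustrative threshold.

```python
# Sketch only: simple thresholding of the frame-level VAD output.
# Assumptions: raw logits, class 1 = speech, 20 ms per output frame, 0.5 threshold.
FRAME_SHIFT_SEC = 0.02  # assumed frame shift in seconds

probs = torch.softmax(torch_outputs, dim=-1)  # [B, T, 2] class probabilities
speech_probs = probs[0, :, 1]                 # per-frame speech probability
is_speech = (speech_probs > 0.5).tolist()     # illustrative threshold

# Merge contiguous speech frames into (start_sec, end_sec) segments
segments, start = [], None
for i, flag in enumerate(is_speech):
    if flag and start is None:
        start = i
    elif not flag and start is not None:
        segments.append((start * FRAME_SHIFT_SEC, i * FRAME_SHIFT_SEC))
        start = None
if start is not None:
    segments.append((start * FRAME_SHIFT_SEC, len(is_speech) * FRAME_SHIFT_SEC))

print(segments)
```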

### Export to ONNX

```python
import onnx
from nemo.core import typecheck

typecheck.set_typecheck_enabled(False)

# Output file path for ONNX export
ONNX_EXPORT_PATH = "frame_vad_multilingual_marblenet_v2.0.onnx"

# Move everything to CPU
vad_model = vad_model.cpu()
input_signal = input_signal.cpu()
input_signal_length = input_signal_length.cpu()

# Preprocess input signal
processed_signal, processed_signal_length = vad_model.preprocessor(
    input_signal=input_signal,
    length=input_signal_length
)

# Define input example for ONNX export
inputs = {
    "processed_signal": processed_signal,
    "processed_signal_length": processed_signal_length
}

# Export
torch.onnx.export(
    model=vad_model,
    args=inputs,
    f=ONNX_EXPORT_PATH,
    input_names=list(inputs.keys()),
    output_names=["output"],
    dynamic_axes={
        "processed_signal": {0: "batch_size", 2: "sequence_length"},
        "processed_signal_length": {0: "batch_size"},
        "output": {0: "batch_size", 1: "sequence_length"}
    }
)

# Validate ONNX model
onnx.checker.check_model(onnx.load(ONNX_EXPORT_PATH))
```

### Inference with ONNX Runtime

```python
import onnxruntime

# Load the ONNX model
session = onnxruntime.InferenceSession(
    ONNX_EXPORT_PATH,
    providers=["CPUExecutionProvider"]
)

# Prepare input for ONNX Runtime
ort_inputs = {
    input.name: inputs[input.name].numpy()
    for input in session.get_inputs()
}

# Run inference
onnx_outputs = session.run(None, ort_inputs)[0]

# Compare with PyTorch output
for torch_out, onnx_out in zip(torch_outputs, onnx_outputs):
    torch.testing.assert_close(torch_out, torch.from_numpy(onnx_out), atol=1e-3, rtol=1e-3)
print("✅ PyTorch and ONNX Runtime outputs match!")
```

### RTTM Output from Frame-Level Speech Predictions

To generate RTTM (Rich Transcription Time Marked) files from audio using the pretrained model:

```bash
python <NEMO_ROOT>/examples/asr/speech_classification/frame_vad_infer.py \
    --config-path="../conf/vad" \
    --config-name="frame_vad_infer_postprocess.yaml" \
    vad.model_path="nvidia/frame_vad_multilingual_marblenet_v2.0" \
    vad.parameters.shift_length_in_sec=0.02 \
    prepare_manifest.auto_split=True \
    prepare_manifest.split_duration=7200 \
    input_manifest=<Path of manifest file of evaluation data, where audio files should have unique names> \
    out_manifest_filepath=<Path of output manifest file>
```
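The `input_manifest` is a JSON-lines file with one entry per audio file. Below is a minimal sketch for building one, assuming the common NeMo audio-manifest fields (`audio_filepath`, `offset`, `duration`, `label`, `text`); the paths are placeholders, and the exact schema expected by `frame_vad_infer.py` should be checked against the NeMo documentation for your version.

```python
import json

# Sketch: write a JSON-lines manifest for VAD inference.
# Field layout follows the common NeMo manifest convention; verify against
# the frame_vad_infer.py docs for your NeMo version. Paths are placeholders.
audio_files = ["/data/sample1.wav", "/data/sample2.wav"]

with open("vad_input_manifest.json", "w") as fout:
    for path in audio_files:
        entry = {
            "audio_filepath": path,
            "offset": 0,
            "duration": None,  # null is often used for "whole file"; otherwise set seconds
            "label": "infer",
            "text": "-",
        }
        fout.write(json.dumps(entry) + "\n")
```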

## Software Integration:
**Runtime Engine(s):**
* NeMo-2.0.0 <br>