|
import gradio as gr |
|
import numpy as np |
|
from vad_utils import get_speech_probs, make_visualization, probs2speech_timestamps |
|
|
|
|
|
def process_audio(audio_input, model):
    """Compute speech probabilities for an audio input and visualize them.

    The input is converted with ``np.array``, handed to
    ``get_speech_probs`` together with ``model`` at a sampling rate of
    16 000, and the resulting probabilities are passed to
    ``make_visualization`` with a step of ``512 / 16_000``.

    Args:
        audio_input: Value converted via ``np.array``.
            NOTE(review): presumably a sequence of audio samples; a file
            path string would not be decoded here -- confirm with callers.
        model: Object forwarded unchanged to ``get_speech_probs``.

    Returns:
        Whatever ``make_visualization`` returns for the probabilities.
    """
    sample_rate = 16_000
    samples = np.array(audio_input)
    speech_probs = get_speech_probs(samples, model, sampling_rate=sample_rate)
    # 512 / 16_000 -- presumably window size in samples over the sampling
    # rate, i.e. seconds per probability value; TODO confirm in vad_utils.
    step = 512 / sample_rate
    return make_visualization(speech_probs, step)
|
|
|
def process_parameters(probs, threshold, min_speech_duration_ms, min_silence_duration_ms, window_size_samples, speech_pad_ms):
    """Forward probabilities and tuning parameters to ``probs2speech_timestamps``.

    All arguments are passed positionally, in the order they are
    declared here, and the helper's result is returned unchanged.

    Args:
        probs: Speech probabilities (presumably the output of
            ``get_speech_probs`` -- confirm against the caller).
        threshold: Passed through as the second positional argument.
        min_speech_duration_ms: Passed through unchanged.
        min_silence_duration_ms: Passed through unchanged.
        window_size_samples: Passed through unchanged.
        speech_pad_ms: Passed through unchanged.

    Returns:
        The value returned by ``probs2speech_timestamps``.
    """
    tuning = (
        threshold,
        min_speech_duration_ms,
        min_silence_duration_ms,
        window_size_samples,
        speech_pad_ms,
    )
    return probs2speech_timestamps(probs, *tuning)
|
|
|
def main():
    """Build and launch the Gradio demo.

    The UI has two rows:

    * an audio input, a "Process Audio" button and an image output; the
      button calls ``process_audio`` with the audio value and ``model``;
    * a set of numeric controls, a "Process Parameters" button and a text
      output; the button calls ``process_parameters`` with the ``probs``
      state and the control values.

    The app is started with ``demo.launch()``.
    """
    # NOTE(review): no model is loaded here; this placeholder is forwarded
    # to process_audio as-is -- confirm where the VAD model should come from.
    model = None

    def run_process_audio(audio_value):
        # Event inputs must be Gradio components, so the (non-component)
        # model is bound through this closure instead of being listed in
        # `inputs`, which fails while the UI is being built.
        return process_audio(audio_value, model)

    with gr.Blocks() as demo:
        with gr.Row():
            # Top-level components replace the legacy gr.inputs/gr.outputs
            # namespaces, which are deprecated/removed in current Gradio.
            # NOTE(review): type="filepath" delivers a path string, while
            # process_audio feeds its input to np.array -- confirm which
            # audio format is intended.
            audio_input = gr.Audio(type="filepath")
            button1 = gr.Button("Process Audio")
            figure = gr.Image()

        # Listeners have to be registered inside the Blocks context.
        button1.click(run_process_audio, inputs=[audio_input], outputs=figure)

        with gr.Row():
            # NOTE(review): nothing writes to this state, so it still holds
            # None when button2 fires -- the audio step probably needs to
            # store its probabilities here.
            probs = gr.State(None)
            # A slider expresses the bounded 0..1 range; the legacy Number
            # input did not accept minimum/maximum.
            threshold = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                value=0.5,
                step=0.01,
                label="Threshold",
            )
            min_speech_duration_ms = gr.Number(label="Min Speech Duration (ms)", value=250)
            min_silence_duration_ms = gr.Number(label="Min Silence Duration (ms)", value=100)
            window_size_samples = gr.Dropdown(
                label="Window Size Samples",
                choices=[512, 1024, 1536],
                value=1536,
            )
            speech_pad_ms = gr.Number(label="Speech Pad (ms)", value=30)
            button2 = gr.Button("Process Parameters")
            output_text = gr.Textbox()

        button2.click(
            process_parameters,
            inputs=[
                probs,
                threshold,
                min_speech_duration_ms,
                min_silence_duration_ms,
                window_size_samples,
                speech_pad_ms,
            ],
            outputs=output_text,
        )

    demo.launch()
|
|
|
# Start the demo only when this file is executed as a script, so that
# importing the module does not launch the UI.
if __name__ == "__main__":
    main()
|
|