# Gradio front end for Orai NLP Technologies' Whisper-based speech
# recognition demo. Audio is captured or uploaded locally and forwarded to a
# remote transcription API configured via the api_url environment variable.

import os
import time

import gradio as gr
import requests
import soundfile as sf


def _return_yt_html_embed(yt_url):
    # Build a YouTube iframe embed for a given video URL (unused in this app).
    video_id = yt_url.split("?v=")[-1]
    HTML_str = (
        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
        " </center>"
    )
    return HTML_str


def transcribe_base(audio, language):
    """Read an audio file, POST it to the transcription API, return the text."""
    start_time = time.time()
    d, sr = sf.read(audio)
    if d.ndim == 2:  # stereo: keep only the first channel
        d = d[:, 0]
    data = {"audio": d.tolist(), "sampling_rate": sr, "language": language}
    print(f"Audio: {len(d)} samples at {sr} Hz, language={language}")
    response = requests.post(os.getenv("api_url"), json=data).json()
    result = response["text"]
    end_time = time.time()
    print("-" * 50)
    print(len(data["audio"]) / float(sr))  # audio duration (s)
    print(end_time - start_time)           # request latency (s)
    print("-" * 50)
    return result


def transcribe(audio_microphone, audio_upload, language):
    """Prefer the microphone recording when both inputs are provided."""
    print("Transcription request")
    print(audio_microphone, audio_upload, language)
    audio = audio_microphone if audio_microphone is not None else audio_upload
    if audio is None:
        return "No audio provided."
    return transcribe_base(audio, language)


demo = gr.Blocks()

with demo:
    gr.Markdown("# Speech recognition using Whisper models")
    gr.Markdown("Orai NLP Technologies")
    with gr.Tab("Transcribe Audio"):
        iface = gr.Interface(
            fn=transcribe,
            inputs=[
                gr.Audio(sources=["microphone"], type="filepath"),
                gr.Audio(sources=["upload"], type="filepath"),
                gr.Dropdown(
                    choices=[
                        ("Basque", "eu"),
                        ("Spanish", "es"),
                        ("English", "en"),
                        # ("French", "fr"),
                        # ("Italian", "it"),
                    ],
                    value="eu",
                ),
            ],
            outputs=[gr.Textbox(label="Transcription", autoscroll=False)],
            allow_flagging="never",
        )

demo.queue(max_size=1)
demo.launch(
    share=False,
    max_threads=3,
    auth=(os.getenv("username"), os.getenv("password")),
    auth_message="Please provide a username and a password.",
)