# Sermas / app.py — Hugging Face Space (author: xsarasola, commit 3b71270)
# NOTE: the lines above were web-page scrape residue ("raw / history / blame",
# file size); converted to comments so the module is valid Python.
import gradio as gr
import re
import subprocess
import math
import shutil
import soundfile as sf
import tempfile
import os
import requests
import time
def _return_yt_html_embed(yt_url):
video_id = yt_url.split("?v=")[-1]
HTML_str = (
f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
" </center>"
)
return HTML_str
def transcribe_base(audio, language):
    """Send an audio file to the remote ASR service and return its results.

    Parameters
    ----------
    audio : str
        Path to an audio file readable by ``soundfile``.
    language : str
        Language code forwarded to the service (e.g. ``"eu"``, ``"es"``, ``"en"``).

    Returns
    -------
    tuple[str, str]
        ``(transcription, speaker_class_string)`` as reported by the service.

    Raises
    ------
    requests.HTTPError
        If the service responds with an error status.
    """
    start_time = time.time()
    d, sr = sf.read(audio)
    data = {'audio': d.tolist(),
            'sampling_rate': sr,
            'language': language}
    # The endpoint URL comes from the "api_url" env var (Space secret).
    # timeout guards against the request hanging forever; raise_for_status
    # surfaces HTTP errors instead of a confusing JSON decode failure.
    response = requests.post(os.getenv("api_url"), json=data, timeout=600)
    response.raise_for_status()
    payload = response.json()
    transcription = payload["text"]
    speaker_class_string = payload["speaker_class_string"]
    end_time = time.time()
    # Log audio duration (seconds) vs. wall-clock processing time.
    print("-"*50)
    print(len(data["audio"])/float(sr))
    print(end_time-start_time)
    print("-"*50)
    return transcription, speaker_class_string
def transcribe(audio_microphone, audio_upload, language):
    """Pick the available audio source and delegate to ``transcribe_base``.

    A microphone recording takes precedence over an uploaded file.
    """
    print("Transcription request")
    print(audio_microphone, audio_upload, language)
    selected = audio_upload if audio_microphone is None else audio_microphone
    return transcribe_base(selected, language)
# --- Gradio UI -------------------------------------------------------------
demo = gr.Blocks()
with demo:
    gr.Markdown("# Speech recognition using Whisper models")
    gr.Markdown("Orai NLP Technologies")
    # Fixed user-facing typo: "Trancribe" -> "Transcribe".
    with gr.Tab("Transcribe Audio"):
        iface = gr.Interface(
            fn=transcribe,
            inputs=[
                # Microphone takes precedence over upload (see transcribe()).
                gr.Audio(sources="microphone", type="filepath"),
                gr.Audio(sources="upload", type="filepath"),
                gr.Dropdown(choices=[("Basque", "eu"),
                                     ("Spanish", "es"),
                                     ("English", "en")],
                            #("French", "fr"),
                            #("Italian", "it"),
                            value="eu")
            ],
            outputs=[
                gr.Textbox(label="Transcription", autoscroll=False),
                gr.Textbox(label="Speaker Identification", autoscroll=False)
            ],
            allow_flagging="never",
        )
# One request at a time: the backend service is the bottleneck.
demo.queue(max_size=1)
# Credentials come from Space secrets; NOTE(review): if either env var is
# unset, auth becomes (None, None) — confirm secrets are configured.
demo.launch(share=False, max_threads=3, auth=(os.getenv("username"), os.getenv("password")), auth_message="Please provide a username and a password.")