import os

# Install Whisper from source at startup; the package is not assumed to be preinstalled.
os.system("pip install git+https://github.com/openai/whisper.git")

import gradio as gr  # only used by the commented-out Gradio demo below
import whisper
from flask import Flask, jsonify, request
import requests
import numpy as np  # only used by the commented-out code below
import streamlit as st  # only used by the commented-out API-key check below

app = Flask(__name__)


@app.route("/")
def indexApi():
    return jsonify({"output": "okay"})


@app.route("/run", methods=["POST"])
def runApi():
    audio_url = request.form.get("audio_url")
    # key = request.form.get("key")
    modelSelection = request.form.get("model")
    print(audio_url)

    if modelSelection is None:
        modelSelection = "small"

    model = whisper.load_model(modelSelection)
    print(model)

    # # reject if key not the same
    # apiKey = st.secrets["Api-Key"]
    # if apiKey != key:
    #     return jsonify({
    #         "image_url": image_url,
    #         "model": model,
    #         "result": "Invalid API Key",
    #     }), 400

    response = requests.get(audio_url)
    if response.status_code == requests.codes.ok:
        # Only attempt to save the file if the request was successful
        with open("audio.mp3", "wb") as f:
            f.write(response.content)
    else:
        return jsonify({
            "result": f"Unable to save file, status code: {response.status_code}",
        }), 400

    # arr = np.asarray(bytearray(response.content), dtype=np.uint8)
    result = model.transcribe("audio.mp3")

    return jsonify({
        "audio_url": audio_url,
        "model": modelSelection,
        "result": result["text"],
    })


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)


# def inference(audio):
#     audio = whisper.load_audio(audio)
#     audio = whisper.pad_or_trim(audio)
#     mel = whisper.log_mel_spectrogram(audio).to(model.device)
#
#     _, probs = model.detect_language(mel)
#
#     options = whisper.DecodingOptions(fp16=False)
#     result = whisper.decode(model, mel, options)
#
#     # print(result.text)
#     return result.text, gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)
#
#
# css = """
#         .gradio-container {
#             font-family: 'IBM Plex Sans', sans-serif;
#         }
#         .gr-button {
#             color: white;
#             border-color: black;
#             background: black;
#         }
#         input[type='range'] {
#             accent-color: black;
#         }
#         .dark input[type='range'] {
#             accent-color: #dfdfdf;
#         }
#         .container {
#             max-width: 730px;
#             margin: auto;
#             padding-top: 1.5rem;
#         }
#         .details:hover {
#             text-decoration: underline;
#         }
#         .gr-button {
#             white-space: nowrap;
#         }
#         .gr-button:focus {
#             border-color: rgb(147 197 253 / var(--tw-border-opacity));
#             outline: none;
#             box-shadow: var(--tw-ring-offset-shadow), var(--tw-ring-shadow), var(--tw-shadow, 0 0 #0000);
#             --tw-border-opacity: 1;
#             --tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color);
#             --tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(3px + var(--tw-ring-offset-width)) var(--tw-ring-color);
#             --tw-ring-color: rgb(191 219 254 / var(--tw-ring-opacity));
#             --tw-ring-opacity: .5;
#         }
#         .footer {
#             margin-bottom: 45px;
#             margin-top: 35px;
#             text-align: center;
#             border-bottom: 1px solid #e5e5e5;
#         }
#         .footer>p {
#             font-size: .8rem;
#             display: inline-block;
#             padding: 0 10px;
#             transform: translateY(10px);
#             background: white;
#         }
#         .dark .footer {
#             border-color: #303030;
#         }
#         .dark .footer>p {
#             background: #0b0f19;
#         }
#         .prompt h4 {
#             margin: 1.25em 0 .25em 0;
#             font-weight: bold;
#             font-size: 115%;
#         }
#         .animate-spin {
#             animation: spin 1s linear infinite;
#         }
#         @keyframes spin {
#             from { transform: rotate(0deg); }
#             to { transform: rotate(360deg); }
#         }
#         #share-btn-container {
#             display: flex; margin-top: 1.5rem !important; padding-left: 0.5rem !important;
#             padding-right: 0.5rem !important; background-color: #000000;
#             justify-content: center; align-items: center;
#             border-radius: 9999px !important; width: 13rem;
#         }
#         #share-btn {
#             all: initial; color: #ffffff; font-weight: 600; cursor: pointer;
#             font-family: 'IBM Plex Sans', sans-serif; margin-left: 0.5rem !important;
#             padding-top: 0.25rem !important; padding-bottom: 0.25rem !important;
#         }
#         #share-btn * {
#             all: unset;
#         }
# """
#
# block = gr.Blocks(css=css)
#
# with block:
#     gr.HTML(
#         """
#             <div class="container">
#                 <h1>Whisper</h1>
#                 <p>
#                     Whisper is a general-purpose speech recognition model. It is trained on a
#                     large dataset of diverse audio and is also a multi-task model that can
#                     perform multilingual speech recognition as well as speech translation and
#                     language identification. This demo cuts audio after around 30 secs.
#                 </p>
#                 <p>You can skip the queue by using google colab for the space: Open In Colab</p>
#             </div>
# """ # ) # with gr.Group(): # with gr.Box(): # with gr.Row().style(mobile_collapse=False, equal_height=True): # audio = gr.Audio( # label="Input Audio", # show_label=False, # source="microphone", # type="filepath" # ) # btn = gr.Button("Transcribe") # text = gr.Textbox(show_label=False, elem_id="result-textarea") # with gr.Group(elem_id="share-btn-container"): # community_icon = gr.HTML(community_icon_html, visible=False) # loading_icon = gr.HTML(loading_icon_html, visible=False) # share_button = gr.Button("Share to community", elem_id="share-btn", visible=False) # btn.click(inference, inputs=[audio], outputs=[text, community_icon, loading_icon, share_button]) # share_button.click(None, [], [], _js=share_js) # gr.HTML(''' # # ''') # block.launch()