import time
import random
import whisper
import gradio as gr
from transformers import pipeline
transcription_model = whisper.load_model("base")
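# Note: "base" favors speed over accuracy; Whisper also ships "tiny", "small",
# "medium", and "large" checkpoints if the Space has the compute for them.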
def transcribe(audio):
    # time.sleep(3)
    # load audio and pad/trim it to fit 30 seconds
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    # make log-Mel spectrogram and move to the same device as the transcription_model
    mel = whisper.log_mel_spectrogram(audio).to(transcription_model.device)

    # detect the spoken language
    _, probs = transcription_model.detect_language(mel)
    print(f"Detected language: {max(probs, key=probs.get)}")

    # decode the audio
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(transcription_model, mel, options)
    return result.text
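# A quick sanity check for transcribe(), kept as a comment so it does not run on
# the Space ("sample.wav" is a hypothetical local file; whisper.load_audio needs
# ffmpeg available on PATH):
#     print(transcribe("sample.wav"))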
# Alternative ASR backend via a transformers pipeline (kept for reference):
# p = pipeline("automatic-speech-recognition", "facebook/wav2vec2-base-960h")
# def transcribe(audio):
#     text = p(audio)["text"]
#     return text
def user(user_message, history):
    # Clear the textbox and append the user's message with a pending bot reply.
    return "", history + [[user_message, None]]
def bot(history):
    # Pick a canned reply and stream it character by character to mimic typing.
    bot_message = random.choice(
        ["How are you?", "I wanted to tell you that...", "hehehe", "huihuihuihui", "I'm very hungry"]
    )
    history[-1][1] = ""
    for character in bot_message:
        history[-1][1] += character
        time.sleep(0.05)
        yield history
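# Because bot() is a generator, Gradio streams each yielded history state to the
# Chatbot component; this relies on demo.queue() being enabled at the bottom of
# the file.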
css = """
.gradio-container {
font-family: 'IBM Plex Sans', sans-serif;
}
.gr-button {
color: white;
border-color: black;
background: black;
}
.container {
max-width: 730px;
margin: auto;
padding-top: 1.5rem;
}
#chatbot {
min-height: 30rem;
margin-bottom: 15px;
margin-left: auto;
margin-right: auto;
}
#prompt-container {
margin-bottom: 15px;
margin-left: auto;
margin-right: auto;
}
"""
with gr.Blocks(css=css) as demo:
    gr.HTML(
        """
        <div style="text-align: center; margin: 0 auto;">
            <div
                style="
                    display: inline-flex;
                    align-items: center;
                    gap: 0.8rem;
                    font-size: 1.75rem;
                "
            >
                <h1 style="font-weight: 900; margin-bottom: 7px; margin-top: 5px">
                    Interview with AI (Really?)
                </h1>
            </div>
        </div>
        """
    )
    with gr.Box():
        chatbot = gr.Chatbot([], show_label=False, elem_id="chatbot").style(height="auto")
        with gr.Row(elem_id="prompt-container").style(mobile_collapse=False, equal_height=True):
            with gr.Column(scale=0.8):
                txt = gr.Textbox(
                    show_label=False,
                    placeholder="Type and press enter, or record your response...",
                ).style(container=False)
            with gr.Column(scale=0.2, min_width=0):
                send = gr.Button("Send")
        with gr.Row(elem_id="audio-container").style(equal_height=True):
            with gr.Column(scale=0.8):
                recorder = gr.Audio(source="microphone", type="filepath", show_label=False).style(container=False)
            with gr.Column(scale=0.2, min_width=0):
                speech = gr.Button("Submit speech").style(height="auto")

    # Wire events: transcription fills the textbox; enter/Send update the chat.
    speech.click(transcribe, inputs=recorder, outputs=txt)
    txt.submit(user, [txt, chatbot], [txt, chatbot], queue=False).then(
        bot, chatbot, chatbot
    )
    send.click(user, [txt, chatbot], [txt, chatbot], queue=False).then(
        bot, chatbot, chatbot
    )

demo.queue()
demo.launch()
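# Assumption: on Hugging Face Spaces the default launch() settings suffice; to
# reach the app from other machines when running locally, Gradio also accepts
# e.g. demo.launch(server_name="0.0.0.0").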