Spaces:
Sleeping
Sleeping
File size: 5,766 Bytes
e1e27eb 90301be e1e27eb 4d45f42 90301be 4d45f42 90301be 4d45f42 90301be e1e27eb 326a994 e1e27eb e9daf29 90301be e9daf29 90301be a8a920e 90301be e1e27eb 90301be a8a920e 90301be e1e27eb e9daf29 e1e27eb e9daf29 4d45f42 e1e27eb 90301be e9daf29 90301be e9daf29 326a994 e9daf29 e1e27eb 90301be e1e27eb 90301be e1e27eb e9daf29 e1e27eb 90301be e1e27eb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 |
import gradio as gr
# from gradio import ChatMessage
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import numpy as np
import librosa
import random
import json
import os
from huggingface_hub import InferenceClient
# Hugging Face API token, taken from the "HF_Token" environment variable
# (None when the variable is not set).
hf_token = os.environ.get("HF_Token")

# Alternative kept for reference: read the token from a local JSON file.
# def get_token():
#     with open("credentials.json","r") as f:
#         credentials = json.load(f)
#     return credentials['token']
# hf_token = get_token()

# Pool of secret words for the guessing game.
words_to_guess = [
    "elephant", "rainbow", "mountain", "ocean", "butterfly",
    "guitar", "volcano", "chocolate", "kangaroo", "spaceship",
    "whisper", "pyramid", "sunflower", "unicorn", "jungle",
    "diamond", "castle", "galaxy", "wizard", "tornado",
]

# The secret word is drawn once, when the module is loaded.
RANDOM_WORD = random.choice(words_to_guess)

# Remote chat model that plays the "bot" side of the game.
client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct", token=hf_token)

# Speech-recognition model and its processor; whisper-tiny.en is loaded first
# and both globals are replaced later by load_model().
model = WhisperForConditionalGeneration.from_pretrained(
    "openai/whisper-tiny.en"
)
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-tiny.en"
)
def chat(audio, chat: list, asr_model: str):
    """Run one game turn: transcribe the recording, query the bot, judge the round.

    Args:
        audio: ``(sample_rate, samples)`` tuple as handed to the transcription
            helpers, or ``None`` when no recording is available.
        chat: Message history as a list of ``{'role', 'content'}`` dicts. It is
            extended in place with the user's transcription and the bot's reply.
        asr_model: Name of the speech-recognition model selected in the UI.

    Returns:
        A ``(chat, status)`` tuple. ``status`` is a Markdown string announcing
        a win or a loss, or ``""`` while the round is still open.

    Raises:
        ValueError: If ``asr_model`` is not one of the supported model names.
    """
    status = ""

    if asr_model not in ("openai/whisper-large-v2", "openai/whisper-tiny.en"):
        raise ValueError(f"No Model found with the given choice: {asr_model}")

    # Nothing was recorded: leave the conversation untouched instead of
    # crashing while unpacking the audio tuple.
    if audio is None:
        return chat, status

    if asr_model == "openai/whisper-large-v2":
        transcription = transcribe_whisper_large_v2(audio)
    else:
        transcription = transcribe_whisper_tiny_en(audio)

    secret = RANDOM_WORD.lower()
    user_won = secret in transcription.lower()
    if user_won:
        status = f"# YOU WON !! ππ\nThe Word was: {RANDOM_WORD}\n"

    chat.append({'role': 'user', 'content': transcription})
    response = client.chat_completion(
        messages=chat,
        max_tokens=500,
        stream=False,
    ).choices[0].message.content
    chat.append({'role': 'assistant', 'content': response})

    # The bot only "wins" if the user has not already said the word this turn;
    # otherwise a reply that confirms the guess would overwrite the win.
    # The comparison ignores case because the model may capitalise the word.
    if not user_won and secret in response.lower():
        status = f"# YOU LOST !! ββ\nThe Word was: {RANDOM_WORD}\n"

    return chat, status
def transcribe_whisper_large_v2(audio):
    """Transcribe a recording with the currently loaded Whisper model.

    Uses the module-level ``processor`` and ``model``; special tokens are kept
    during decoding and the text is then passed through the tokenizer's
    ``normalize``.

    Args:
        audio: ``(sample_rate, samples)`` tuple where ``samples`` is a NumPy
            array. Multi-channel input is assumed to be laid out as
            ``(samples, channels)`` — TODO confirm against the audio source.

    Returns:
        The normalized transcription string.
    """
    sr, samples = audio

    # Integer PCM is scaled to [-1, 1]; float input is only cast.
    # NOTE(review): the Whisper feature extractor presumably expects
    # waveforms in that range — confirm against the transformers docs.
    pcm_peak = None
    if np.issubdtype(samples.dtype, np.integer):
        pcm_peak = np.iinfo(samples.dtype).max
    samples = samples.astype(np.float32)
    if pcm_peak is not None:
        samples /= pcm_peak

    # Downmix to mono. The previous guard (`len(shape) > 2`) was never true
    # for 2-D input, so stereo audio was resampled along the channel axis.
    if samples.ndim > 1:
        samples = np.mean(samples, axis=1)

    samples = librosa.resample(samples, orig_sr=sr, target_sr=16000)
    input_features = processor(
        samples, sampling_rate=16000, return_tensors="pt"
    ).input_features
    predicted_ids = model.generate(input_features)
    decoded = processor.batch_decode(predicted_ids, skip_special_tokens=False)
    return processor.tokenizer.normalize(decoded[0])
def transcribe_whisper_tiny_en(audio):
    """Transcribe a recording with the currently loaded Whisper model.

    Uses the module-level ``processor`` and ``model``; special tokens are
    skipped during decoding and the text is then passed through the
    tokenizer's ``normalize``.

    Args:
        audio: ``(sample_rate, samples)`` tuple where ``samples`` is a NumPy
            array. Multi-channel input is assumed to be laid out as
            ``(samples, channels)`` — TODO confirm against the audio source.

    Returns:
        The normalized transcription string.
    """
    sr, samples = audio

    # Integer PCM is scaled to [-1, 1]; float input is only cast.
    # NOTE(review): the Whisper feature extractor presumably expects
    # waveforms in that range — confirm against the transformers docs.
    pcm_peak = None
    if np.issubdtype(samples.dtype, np.integer):
        pcm_peak = np.iinfo(samples.dtype).max
    samples = samples.astype(np.float32)
    if pcm_peak is not None:
        samples /= pcm_peak

    # Downmix to mono. The previous guard (`len(shape) > 2`) was never true
    # for 2-D input, so stereo audio was resampled along the channel axis.
    if samples.ndim > 1:
        samples = np.mean(samples, axis=1)

    samples = librosa.resample(samples, orig_sr=sr, target_sr=16000)
    input_features = processor(
        samples, sampling_rate=16000, return_tensors="pt"
    ).input_features
    predicted_ids = model.generate(input_features)
    decoded = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return processor.tokenizer.normalize(decoded[0])
def load_model(asr_model_choice: str):
    """Swap the module-level Whisper ``processor`` and ``model``.

    Args:
        asr_model_choice: Either ``"openai/whisper-large-v2"`` or
            ``"openai/whisper-tiny.en"``.

    Raises:
        ValueError: If ``asr_model_choice`` is not a supported model name.
            Previously this fell through to the final ``print`` and failed
            with a ``NameError`` (or reported a stale model name).
    """
    global processor
    global model
    global model_flag

    supported = ("openai/whisper-large-v2", "openai/whisper-tiny.en")
    if asr_model_choice not in supported:
        raise ValueError(f"No Model found with the given choice: {asr_model_choice}")

    # Load into locals first so a failed download cannot leave the globals
    # holding a processor and a model from two different checkpoints.
    new_processor = WhisperProcessor.from_pretrained(asr_model_choice)
    new_model = WhisperForConditionalGeneration.from_pretrained(asr_model_choice)
    if asr_model_choice == "openai/whisper-large-v2":
        # Only the large-v2 checkpoint has its forced decoder ids cleared.
        new_model.config.forced_decoder_ids = None

    processor = new_processor
    model = new_model
    model_flag = asr_model_choice
    print("Model Loaded: ", model_flag)
# The App
# Layout: intro text, the chat window seeded with the game prompt, a
# microphone input, a status line and a collapsible model selector.
with gr.Blocks() as app:
    gr.Markdown("# VoiceBot Game πΉοΈ")
    gr.Markdown("Welcome to VoiceBot π, here is how it works")
    gr.Markdown("This Bot can only be interacted with through your voice. Press record and say something, after stopping the recoding your audio will be processed directly. You have the option to choose between different models. The model you choose influences the Bot's perfomance to understand what you have said. A better perfomance also comes with longer waiting time. π")
    gr.Markdown("The Game works as follows: The Bot get's an initial word, you have to guess it. You can ask questions. If the bot says the word before you, You Lose! If you say the word first you Win!")
    gr.Markdown("Have fun playing arround π")
    gr.Markdown("If you have any wishes for models or a general idea, feel free to let me know π")

    # The first history entry carries the instructions and the secret word;
    # the whole history is later sent to the chat model by chat().
    # NOTE(review): the role is spelled 'System' with a capital S — chat
    # APIs conventionally use lowercase 'system'; confirm this is intended.
    chatbot = gr.Chatbot(
        value=[{
            'role':'System',
            'content':f"The User tries to guess a word. The User asks you questions about the word and you answer those questions. Try to help the user to find the word by giving very short descriptions. THE WORD TO GUESS IS: {RANDOM_WORD}"
        }],
        bubble_full_width=False,
        type="messages"
    )

    audio_input = gr.Audio(
        sources=['microphone'],
        interactive=True,
        scale=8
    )

    # Filled with the win/lose message returned by chat().
    status = gr.Markdown()

    with gr.Accordion(label="Settings", open=False):
        asr_model_choice = gr.Radio(
            label="Select ASR Model",
            choices=["openai/whisper-large-v2","openai/whisper-tiny.en"],
            value="openai/whisper-tiny.en"
        )

    # Reload the Whisper processor/model whenever the selection changes.
    asr_model_choice.change(load_model, asr_model_choice)

    # Event listener for when the audio recording stops
    audio_input.stop_recording(fn=chat, inputs=[audio_input, chatbot, asr_model_choice], outputs=[chatbot, status])

app.launch()