Spaces:

justus-tobias
/

VoiceBot

Sleeping

VoiceBot / app.py

j-tobias

small bug fix

4d45f42 11 months ago

5.77 kB

	import gradio as gr
	# from gradio import ChatMessage
	from transformers import WhisperProcessor, WhisperForConditionalGeneration
	import numpy as np
	import librosa
	import random
	import json
	import os


	from huggingface_hub import InferenceClient

	# Access token handed to the InferenceClient below; read from the
	# environment variable "HF_Token" (None when the variable is unset).
	hf_token = os.environ.get("HF_Token")

	# Alternative kept for reference: read the token from a local credentials
	# file instead of the environment.
	# def get_token():
	#     with open("credentials.json","r") as f:
	#         credentials = json.load(f)
	#     return credentials['token']
	#
	# hf_token = get_token()


	# Pool of secret words; one of them is drawn once per process start.
	words_to_guess = [
	    "elephant", "rainbow", "mountain", "ocean", "butterfly",
	    "guitar", "volcano", "chocolate", "kangaroo", "spaceship",
	    "whisper", "pyramid", "sunflower", "unicorn", "jungle",
	    "diamond", "castle", "galaxy", "wizard", "tornado",
	]

	# The word the player has to guess (module-wide, fixed until restart).
	RANDOM_WORD = random.choice(words_to_guess)

	# Repository id of the hosted chat model that answers the player.
	_CHAT_MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
	# Whisper checkpoint loaded at start-up (same value as the default of the
	# "Select ASR Model" radio further down).
	_STARTUP_ASR_ID = "openai/whisper-tiny.en"

	# Remote inference client, authenticated with the token read above.
	client = InferenceClient(_CHAT_MODEL_ID, token=hf_token)

	# Module-level speech-recognition model and its processor; load_model()
	# rebinds both when the user picks another checkpoint.
	model = WhisperForConditionalGeneration.from_pretrained(_STARTUP_ASR_ID)
	processor = WhisperProcessor.from_pretrained(_STARTUP_ASR_ID)


	def chat(audio, chat:list, asr_model:str):
	    """Play one turn: transcribe the recording, ask the LLM, judge the outcome.

	    Registered as the ``stop_recording`` handler of the audio input.

	    Args:
	        audio: Value of the ``gr.Audio`` component, unpacked by the
	            transcribe functions as ``(sample_rate, samples)``. ``None``
	            (nothing recorded) is tolerated and leaves the chat untouched.
	        chat: Message history as a list of ``{'role', 'content'}`` dicts.
	            The user transcription and the assistant reply are appended
	            to this list in place.
	        asr_model: Name of the selected speech-recognition model.

	    Returns:
	        ``(chat, status)`` where ``status`` is a Markdown string: a win
	        banner, a loss banner, or ``""`` while the game is still open.

	    Raises:
	        ValueError: If ``asr_model`` is not one of the supported names.
	    """
	    if asr_model == "openai/whisper-large-v2":
	        transcribe = transcribe_whisper_large_v2
	    elif asr_model == "openai/whisper-tiny.en":
	        transcribe = transcribe_whisper_tiny_en
	    else:
	        raise ValueError(f"No Model found with the given choice: {asr_model}")

	    # No recording available: nothing to transcribe or to send to the LLM.
	    if audio is None:
	        return chat, ""

	    transcription = transcribe(audio)

	    # Compare case-insensitively; the LLM reply in particular may
	    # capitalise the word (e.g. at the start of a sentence).
	    target = RANDOM_WORD.lower()
	    user_won = target in transcription.lower()

	    chat.append({'role':'user','content':transcription})
	    response = client.chat_completion(
	        messages=chat,
	        max_tokens=500,
	        stream=False,
	    ).choices[0].message.content
	    chat.append({'role':'assistant','content':response})

	    # The player spoke before the bot replied, so a win in this turn must
	    # not be overwritten when the reply repeats the word as confirmation.
	    # The banners are built from explicit "\n" so that no source
	    # indentation leaks into the rendered Markdown.
	    if user_won:
	        status = f"# YOU WON !! 🎉🎊\nThe Word was: {RANDOM_WORD}\n"
	    elif target in response.lower():
	        status = f"# YOU LOST !! ❌❌\nThe Word was: {RANDOM_WORD}\n"
	    else:
	        status = ""
	    return chat, status

	def transcribe_whisper_large_v2(audio):
	    """Transcribe a recording with the currently loaded Whisper model.

	    The function does not load a checkpoint itself; it uses the
	    module-level ``processor`` and ``model`` (see ``load_model``).
	    Special tokens are kept during decoding and the text is then passed
	    through the tokenizer's ``normalize``.

	    Args:
	        audio: ``(sample_rate, samples)`` tuple. ``samples`` is a NumPy
	            array, 1-D for mono or 2-D with the channels on axis 1
	            (assumed to be the ``(samples, channels)`` layout delivered
	            by ``gr.Audio`` -- TODO confirm for other callers).

	    Returns:
	        The normalised transcription string.
	    """
	    sr, samples = audio

	    # Integer PCM is scaled into [-1, 1]; a bare cast would keep the raw
	    # integer amplitude range.
	    if np.issubdtype(samples.dtype, np.integer):
	        samples = samples.astype(np.float32) / np.iinfo(samples.dtype).max
	    else:
	        samples = samples.astype(np.float32)

	    # Down-mix multi-channel audio to mono *before* resampling. The
	    # resampler works along the last axis, which for 2-D input would be
	    # the channel axis rather than time.
	    if samples.ndim > 1:
	        samples = samples.mean(axis=1)

	    samples = librosa.resample(samples, orig_sr=sr, target_sr=16000)
	    input_features = processor(samples, sampling_rate=16000, return_tensors="pt").input_features
	    predicted_ids = model.generate(input_features)
	    decoded = processor.batch_decode(predicted_ids, skip_special_tokens=False)
	    return processor.tokenizer.normalize(decoded[0])

	def transcribe_whisper_tiny_en(audio):
	    """Transcribe a recording with the currently loaded Whisper model.

	    The function does not load a checkpoint itself; it uses the
	    module-level ``processor`` and ``model`` (see ``load_model``).
	    Special tokens are skipped during decoding and the text is then
	    passed through the tokenizer's ``normalize``.

	    Args:
	        audio: ``(sample_rate, samples)`` tuple. ``samples`` is a NumPy
	            array, 1-D for mono or 2-D with the channels on axis 1
	            (assumed to be the ``(samples, channels)`` layout delivered
	            by ``gr.Audio`` -- TODO confirm for other callers).

	    Returns:
	        The normalised transcription string.
	    """
	    sr, samples = audio

	    # Integer PCM is scaled into [-1, 1]; a bare cast would keep the raw
	    # integer amplitude range.
	    if np.issubdtype(samples.dtype, np.integer):
	        samples = samples.astype(np.float32) / np.iinfo(samples.dtype).max
	    else:
	        samples = samples.astype(np.float32)

	    # Down-mix multi-channel audio to mono *before* resampling. The
	    # resampler works along the last axis, which for 2-D input would be
	    # the channel axis rather than time.
	    if samples.ndim > 1:
	        samples = samples.mean(axis=1)

	    samples = librosa.resample(samples, orig_sr=sr, target_sr=16000)
	    input_features = processor(samples, sampling_rate=16000, return_tensors="pt").input_features
	    predicted_ids = model.generate(input_features)
	    decoded = processor.batch_decode(predicted_ids, skip_special_tokens=True)
	    return processor.tokenizer.normalize(decoded[0])

	def load_model(asr_model_choice:str):
	    """Load the chosen Whisper checkpoint into the module-level globals.

	    Rebinds ``processor``, ``model`` and ``model_flag``. Both objects are
	    loaded into locals first and only then published, so a failure while
	    loading leaves the previously active pair untouched.

	    Args:
	        asr_model_choice: ``"openai/whisper-large-v2"`` or
	            ``"openai/whisper-tiny.en"``.

	    Raises:
	        ValueError: If ``asr_model_choice`` is not a supported model name.
	    """
	    global processor
	    global model
	    global model_flag

	    supported = ("openai/whisper-large-v2", "openai/whisper-tiny.en")
	    # Reject unknown names up front instead of falling through to the
	    # print below with ``model_flag`` possibly never assigned.
	    if asr_model_choice not in supported:
	        raise ValueError(f"No Model found with the given choice: {asr_model_choice}")

	    new_processor = WhisperProcessor.from_pretrained(asr_model_choice)
	    new_model = WhisperForConditionalGeneration.from_pretrained(asr_model_choice)
	    if asr_model_choice == "openai/whisper-large-v2":
	        # Only the large-v2 checkpoint has its forced decoder ids cleared.
	        new_model.config.forced_decoder_ids = None

	    processor, model = new_processor, new_model
	    model_flag = asr_model_choice

	    print("Model Loaded: ",model_flag)







	# The App
	# Layout: intro text, the chat history, a microphone input, a status
	# banner, and a collapsed settings panel for choosing the ASR model.
	with gr.Blocks() as app:

	    gr.Markdown("# VoiceBot Game 🕹️")
	    gr.Markdown("Welcome to VoiceBot 👋, here is how it works")
	    gr.Markdown("This Bot can only be interacted with through your voice. Press record and say something, after stopping the recoding your audio will be processed directly. You have the option to choose between different models. The model you choose influences the Bot's perfomance to understand what you have said. A better perfomance also comes with longer waiting time. 😕")
	    gr.Markdown("The Game works as follows: The Bot get's an initial word, you have to guess it. You can ask questions. If the bot says the word before you, You Lose! If you say the word first you Win!")
	    gr.Markdown("Have fun playing arround 🎉")
	    gr.Markdown("If you have any wishes for models or a general idea, feel free to let me know 🙌")

	    # The history is seeded with the instruction message that carries the
	    # secret word. The role is the lower-case 'system' used by the
	    # chat-completion message format; this list is forwarded unchanged as
	    # ``messages`` by chat().
	    chatbot = gr.Chatbot(
	        value=[{
	            'role':'system',
	            'content':f"The User tries to guess a word. The User asks you questions about the word and you answer those questions. Try to help the user to find the word by giving very short descriptions. THE WORD TO GUESS IS: {RANDOM_WORD}"
	        }],
	        bubble_full_width=False,
	        type="messages"
	    )

	    audio_input = gr.Audio(
	        sources=['microphone'],
	        interactive=True,
	        scale=8
	    )

	    # Receives the win/lose banner returned by chat().
	    status = gr.Markdown()

	    with gr.Accordion(label="Settings", open=False):

	        asr_model_choice = gr.Radio(
	            label="Select ASR Model",
	            choices=["openai/whisper-large-v2","openai/whisper-tiny.en"],
	            value="openai/whisper-tiny.en"
	        )
	        # Swap the module-level Whisper model whenever the selection changes.
	        asr_model_choice.change(load_model, asr_model_choice)

	    # Event listener for when the audio recording stops
	    audio_input.stop_recording(fn=chat, inputs=[audio_input, chatbot, asr_model_choice], outputs=[chatbot, status])

	app.launch()