Spaces:

DurreSudoku
/

Whisper_Swedish

Sleeping

App Files Files

Whisper_Swedish / app.py

DurreSudoku

Update app.py

61aa5a5 verified 4 months ago

raw

history blame

3.53 kB

	import gradio as gr
	from PIL import Image
	import os
	import random
	from transformers import pipeline
	from difflib import SequenceMatcher
	import logging

	# File names found in the "assets" directory. open_image() removes each name
	# as it is picked and re-reads the directory once this list is empty.
	all_images = os.listdir("assets")
	# File name of the image most recently picked by open_image(); transcribe()
	# derives the expected answer from it. None until open_image() has run.
	current_image = None
	# Speech-recognition pipeline that transcribe() calls on the submitted audio.
	pipe = pipeline(task="automatic-speech-recognition", model="DurreSudoku/whisper-small-sv") # change to "your-username/the-name-you-picked"

	def test_func():
	    """Return "Test successful" immediately followed by a random integer in 1..100."""
	    return f"Test successful{random.randint(1, 100)}"


	def empty_string() -> str:
	    """Return an empty string; takes no input and has no side effects."""
	    return ""


	def open_image():
	    """Pick a random, not-yet-used image from "assets" and return it opened.

	    The chosen file name is removed from the module-level ``all_images`` pool
	    so it is not picked again until the pool runs out; an empty pool is
	    refilled by listing the "assets" directory again. The chosen file name is
	    also stored in the module-level ``current_image``.

	    Returns:
	        The object returned by ``Image.open`` for the chosen file.
	    """
	    global all_images, current_image

	    # Every image has been used: start a new cycle from the directory listing.
	    if not all_images:
	        all_images = os.listdir("assets")

	    chosen = random.choice(all_images)
	    all_images.remove(chosen)
	    current_image = chosen

	    return Image.open(os.path.join("assets", chosen))




	def transcribe(audio_input):
	    """Transcribe a recording and grade it against the current image's name.

	    The recording is passed to the module-level ``pipe``. The resulting text
	    has the characters ``, . ! ?`` removed, is lower-cased and is stripped of
	    surrounding whitespace. The expected answer is the file name held in the
	    module-level ``current_image`` without its extension; it is compared
	    case-insensitively but reported with its original spelling.

	    Args:
	        audio_input: Value forwarded unchanged to ``pipe`` (the UI wires the
	            audio component, configured with ``type="filepath"``, to it).

	    Returns:
	        A message for the user: an error notice if the pipeline raises,
	        "Correct! ..." when the whole transcript is at least 75% similar to
	        the answer or contains it as a word, "The answer is ... I heard
	        <word>." when a single word is at least 75% similar, and otherwise
	        the correct answer together with the full transcript.
	    """
	    try:
	        transcribed_audio = pipe(audio_input)["text"]
	    except Exception as e:
	        # UI boundary: log the failure and show a friendly message instead of
	        # a traceback. The message suggests the expected cause is a missing
	        # recording.
	        logging.exception(e)
	        return "Encountered an error. Are you sure that you recorded audio before submitting?"

	    # Remove punctuation in a single pass, then normalise case and outer
	    # whitespace so a stray leading/trailing space neither lowers the
	    # similarity score nor shows up in the "I heard ..." message.
	    transcribed_audio = transcribed_audio.translate(str.maketrans("", "", ",.!?"))
	    transcribed_audio = transcribed_audio.lower().strip()

	    # splitext drops whatever extension the file has, not only ".png".
	    correct_answer = os.path.splitext(current_image)[0]
	    # The transcript is lower-cased, so compare against a lower-cased answer.
	    answer_key = correct_answer.lower()
	    # split() without arguments collapses repeated whitespace and never
	    # yields empty strings.
	    text_list = transcribed_audio.split()

	    ratio = SequenceMatcher(None, transcribed_audio, answer_key).ratio()
	    if ratio >= 0.75 or answer_key in text_list:
	        return f"Correct! The answer is {correct_answer}."

	    # Check for a partial match, in case the model mistakes a letter or two.
	    for text in text_list:
	        if SequenceMatcher(None, text, answer_key).ratio() >= 0.75:
	            return f"The answer is {correct_answer}. I heard {text}."

	    # No match found.
	    return f"The correct answer is {correct_answer}. I heard {transcribed_audio}."


	# Build the user interface, then start the app. The rows below are created
	# in this order: instructions, recorder + image, answer box, buttons.
	with gr.Blocks(title="Interactive Language Learning") as demo:
	    # Row 1: introductory instructions.
	    with gr.Row():
	        gr.Markdown(
	            """
	# Interactive Language Learning Prototype

	Hello!

	This is a prototype app that is meant to help you learn some basic Swedish words. Observe the image,
	record a one word answer and press the "Submit Answer" button! For a new image, press the "New Image" button.
	""")
	    # Row 2: microphone recorder in one column, the image to name in the other.
	    with gr.Row():
	        with gr.Column():
	            audio = gr.Audio(sources="microphone", type="filepath", label="Record your answer here")
	        with gr.Column():
	            # open_image() is called once here, while the UI is being built,
	            # to supply the first image (and to set current_image).
	            image = gr.Image(value=open_image(),type="pil", interactive=False)
	    # Row 3: read-only text box that shows the grading message.
	    with gr.Row():
	        answer_box = gr.Text(label="Answer appears here", interactive=False)
	    # Row 4: action buttons.
	    with gr.Row():
	        with gr.Column():
	            process_input = gr.Button("Submit Answer")
	            # Pass the recording to transcribe() and show its message in the answer box.
	            process_input.click(fn=transcribe, inputs=audio, outputs=answer_box)
	            # Alternative wiring, disabled; presumably for checking the UI without the model:
	            # process_input.click(fn=test_func, inputs=None, outputs=answer_box)
	        with gr.Column():
	            refresh = gr.Button("New Image")
	            # Two handlers on the same click: load a new image and clear the answer box.
	            refresh.click(fn=open_image, inputs=None, outputs=image)
	            refresh.click(fn=empty_string, inputs=None, outputs=answer_box)
	# Start the app with debug mode enabled.
	demo.launch(debug=True)