Whisper_Swedish / app.py
DurreSudoku's picture
Upload app.py
1e7d3ee
raw
history blame
3.32 kB
import gradio as gr
from PIL import Image
import os
import random
from transformers import pipeline
from difflib import SequenceMatcher
all_images = os.listdir("assets")
current_image = None
pipe = pipeline(model="DurreSudoku/whisper-small-sv") # change to "your-username/the-name-you-picked"
def test_func():
random_int = random.randint(1, 100)
string = "Test successful" + str(random_int)
return string
def empty_string():
return ""
def open_image():
# Open a random image
global all_images
global current_image
if len(all_images) == 0:
all_images = os.listdir("assets")
img_name = random.choice(all_images)
all_images.remove(img_name)
current_image = img_name
img = Image.open(os.path.join(r"assets", img_name))
# print(img.filename)
return img
def transcribe(audio):
# Transcribe the audio and split the string into a list of words
try:
transcribed_audio = pipe(audio)["text"]
except:
return "Encountered an error. Are you sure that you recorded audio before submitting?"
transcribed_audio = transcribed_audio.replace(",", "")
transcribed_audio = transcribed_audio.replace(".", "")
transcribed_audio = transcribed_audio.replace("!", "")
transcribed_audio = transcribed_audio.replace("?", "")
transcribed_audio = transcribed_audio.lower()
text_list = transcribed_audio.split(" ")
correct_answer = current_image.split(".png")[0]
# Check for a perfect match.
if correct_answer in text_list:
return f"Correct! The answer is {correct_answer}."
# Check for partial match, in case the model mistakes a letter or two.
for text in text_list:
match_ratio = SequenceMatcher(None, text, correct_answer).ratio()
if match_ratio > 0.8:
return f"Partially correct. The answer is {correct_answer}. I heard {text}."
# If no match is found.
return f"Incorrect. The correct answer is {correct_answer}. I heard {transcribed_audio}."
with gr.Blocks(title="Interactive Language Learning") as demo:
with gr.Row():
gr.Markdown(
"""
# Interactive Language Learning Prototype
Hello!
This is a prototype app that is meant to help you learn some basic Swedish words. Observe the image,
record a one word answer and press the "Submit Answer" button! For a new image, press the "New Image" button.
""")
with gr.Row():
with gr.Column():
audio = gr.Audio(sources="microphone", type="filepath", label="Record your answer here")
with gr.Column():
image = gr.Image(value=open_image(),type="pil", interactive=False)
with gr.Row():
answer_box = gr.Text(label="Answer appears here", interactive=False)
with gr.Row():
with gr.Column():
process_input = gr.Button("Submit Answer")
process_input.click(fn=transcribe, inputs=audio, outputs=answer_box)
# process_input.click(fn=test_func, inputs=None, outputs=answer_box)
with gr.Column():
refresh = gr.Button("New Image")
refresh.click(fn=open_image, inputs=None, outputs=image)
refresh.click(fn=empty_string, inputs=None, outputs=answer_box)
demo.launch(debug=True)