Spaces:

DurreSudoku
/

Whisper_Swedish

Sleeping

File size: 3,532 Bytes

87274a6
0f9125a
 
 
 
 
87b4ae0
0f9125a
1e7d3ee
d555d0f
d5e5cc3
0f9125a
 
 
 
 
 
 
 
 
 
 
 
 
1e7d3ee
 
d555d0f
1e7d3ee
 
 
 
 
 
d555d0f
 
0f9125a
 
 
 
 
 
 
f993f54
61aa5a5
0f9125a
2236a27
61aa5a5
87b4ae0
 
1e7d3ee
c58f484
 
 
 
 
a5ceb81
0f9125a
d5e5cc3
0f9125a
d555d0f
d5e5cc3
 
0f9125a
d5e5cc3
 
 
0f9125a
 
 
 
 
 
1ff72eb
09dea42
0f9125a
09dea42
87274a6
 
0f9125a
 
 
 
 
 
 
 
 
 
 
 
 
d5e5cc3
0f9125a
d555d0f
0f9125a
a5ceb81
0f9125a
 
 
d555d0f
 
0f9125a
 
 
 
1e7d3ee

import gradio as gr
from PIL import Image
import os
import random
from transformers import pipeline
from difflib import SequenceMatcher
import logging

# Pool of image file names that have not been shown yet in the current round.
# open_image() removes each name as it is used and refills the pool from the
# "assets" directory once it is empty.
all_images = os.listdir("assets")
# File name of the image most recently returned by open_image(); transcribe()
# derives the expected answer from it.
# NOTE(review): this is module-level state, so it is shared by everything
# running in this process — confirm that is acceptable if several users can
# be connected at once.
current_image = None
# Speech-recognition pipeline, created once at import time and reused by
# transcribe() for every submission.
pipe = pipeline(task="automatic-speech-recognition", model="DurreSudoku/whisper-small-sv")  # model id to load; replace it to use a different checkpoint

def test_func():
    """Return "Test successful" immediately followed by a random integer in 1..100.

    The changing suffix makes it visible that the function actually ran again
    when it is wired to a UI event for debugging.
    """
    suffix = random.randint(1, 100)
    return f"Test successful{suffix}"


def empty_string():
    """Return an empty string; used as an event handler to blank a text output."""
    return str()


def open_image():
    """Pick a random not-yet-shown image, record it as current, and return it.

    Side effects:
        * Removes the chosen file name from the module-level ``all_images``
          pool, refilling the pool from the ``assets`` directory first if it
          is empty.
        * Stores the chosen file name in the module-level ``current_image``.

    Returns:
        The image opened with ``Image.open`` from the ``assets`` directory.
    """
    global all_images, current_image

    # Every image has been used: start a new round with a fresh listing.
    if not all_images:
        all_images = os.listdir("assets")

    chosen = random.choice(all_images)
    all_images.remove(chosen)
    current_image = chosen

    return Image.open(os.path.join("assets", chosen))




def transcribe(audio_input):
    """Transcribe a recording and grade it against the current image's name.

    The expected answer is the file name stored in the module-level
    ``current_image`` with its extension removed. The comparison is
    case-insensitive and ignores the punctuation marks ``, . ! ?``.

    Args:
        audio_input: Passed unchanged to ``pipe``. The UI supplies the file
            path of the recorded audio.

    Returns:
        A feedback message for the user:
        * a "Correct!" message when the whole transcript is close enough to
          the answer or one of its words equals the answer;
        * "The answer is ... I heard <word>." when a single word is a near
          match;
        * "The correct answer is ... I heard <transcript>." otherwise;
        * a generic error message when transcription fails.
    """
    # Minimum SequenceMatcher ratio that still counts as a match, to tolerate
    # the model getting a letter or two wrong.
    match_threshold = 0.75

    try:
        transcribed_audio = pipe(audio_input)["text"]
    except Exception as e:
        # UI boundary: log the full traceback and show a friendly message
        # instead of surfacing the exception to the user.
        logging.exception(e)
        return "Encountered an error. Are you sure that you recorded audio before submitting?"

    # Drop punctuation in one pass, lowercase, and split on any whitespace so
    # that leading or repeated spaces neither create empty "words" nor leak
    # into the feedback message.
    text_list = transcribed_audio.translate(str.maketrans("", "", ",.!?")).lower().split()
    transcribed_audio = " ".join(text_list)

    # Strip whatever extension the image file has (not only ".png").
    correct_answer = os.path.splitext(current_image)[0]
    # The transcript is lowercased, so compare against a lowercased answer;
    # the original spelling is kept for display.
    expected = correct_answer.lower()

    whole_ratio = SequenceMatcher(None, transcribed_audio, expected).ratio()
    if whole_ratio >= match_threshold or expected in text_list:
        return f"Correct! The answer is {correct_answer}."

    # Check for a partial match on individual words.
    for text in text_list:
        if SequenceMatcher(None, text, expected).ratio() >= match_threshold:
            return f"The answer is {correct_answer}. I heard {text}."

    # If no match is found.
    return f"The correct answer is {correct_answer}. I heard {transcribed_audio}."


# Build the UI: an intro text, a microphone recorder next to the quiz image,
# a read-only answer box, and two buttons (submit / new image).
with gr.Blocks(title="Interactive Language Learning") as demo:
    with gr.Row():
        # Introductory instructions shown at the top of the page.
        gr.Markdown(
    """
    # Interactive Language Learning Prototype
    
    Hello!
    
    This is a prototype app that is meant to help you learn some basic Swedish words. Observe the image, 
    record a one word answer and press the "Submit Answer" button! For a new image, press the "New Image" button.
    """)
    with gr.Row():
        with gr.Column():
            # type="filepath": transcribe() receives the recording as a file path.
            audio = gr.Audio(sources="microphone", type="filepath", label="Record your answer here")
        with gr.Column():
            # open_image() is called once here, while the layout is being built,
            # so the first image is chosen (and current_image is set) before any
            # button is pressed.
            image = gr.Image(value=open_image(),type="pil", interactive=False)
    with gr.Row():
        # Read-only box that displays the feedback returned by transcribe().
        answer_box = gr.Text(label="Answer appears here", interactive=False)
    with gr.Row():
        with gr.Column():
            process_input = gr.Button("Submit Answer")
            # Grade the recording and show the result in the answer box.
            process_input.click(fn=transcribe, inputs=audio, outputs=answer_box)
            # Debug wiring kept for reference: shows test_func() output instead.
            # process_input.click(fn=test_func, inputs=None, outputs=answer_box)
        with gr.Column():
            refresh = gr.Button("New Image")
            # Two separate handlers are registered on the same button: one swaps
            # in a new image, the other clears the previous feedback.
            refresh.click(fn=open_image, inputs=None, outputs=image)
            refresh.click(fn=empty_string, inputs=None, outputs=answer_box)
# Start the app as soon as this file is executed.
demo.launch(debug=True)