File size: 3,290 Bytes
87274a6
0f9125a
 
 
 
 
 
1e7d3ee
d555d0f
0f9125a
 
 
 
 
 
 
 
 
 
 
 
 
 
1e7d3ee
 
d555d0f
1e7d3ee
 
 
 
 
 
d555d0f
 
0f9125a
 
 
 
 
 
 
d555d0f
0f9125a
2236a27
1e7d3ee
2236a27
1e7d3ee
c58f484
 
 
 
 
a5ceb81
0f9125a
 
 
d555d0f
0f9125a
 
 
 
 
 
 
 
 
09dea42
 
0f9125a
09dea42
87274a6
 
0f9125a
 
 
 
 
 
 
 
 
 
 
 
 
c55ef1c
0f9125a
d555d0f
0f9125a
a5ceb81
0f9125a
 
 
d555d0f
 
0f9125a
 
 
 
1e7d3ee
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import gradio as gr
from PIL import Image
import os
import random
from transformers import pipeline
from difflib import SequenceMatcher

# Pool of file names that have not been shown yet. open_image() removes the
# name it picks and refills the pool from the "assets" directory once empty.
all_images = os.listdir("assets")
# File name of the image currently on display; written by open_image() and
# read by transcribe() to derive the expected answer.
current_image = None
# Speech-recognition pipeline called by transcribe(). The model id looks like a
# Swedish ("-sv") Whisper-small checkpoint -- NOTE(review): confirm on the hub.
pipe = pipeline(model="DurreSudoku/whisper-small-sv")  # swap the model id to use another checkpoint

def test_func():
    """Return "Test successful" followed by a random integer from 1 to 100.

    Debug helper: the random suffix makes it visible that a fresh call
    happened each time the result is displayed.
    """
    return f"Test successful{random.randint(1, 100)}"


def empty_string():
    """Return an empty string (wired to the "New Image" button to blank the answer box)."""
    return str()


def open_image():
    """Open a random, not-yet-shown image from the "assets" directory.

    Works on two module-level variables: ``all_images`` is the pool of
    remaining file names (refilled from the directory listing once it is
    empty, and shrunk by the name picked here), and ``current_image`` is set
    to the picked file name.

    Returns:
        The picked file opened with ``Image.open``.
    """
    global all_images, current_image

    # Every image has been shown once: start a new round with the full listing.
    if not all_images:
        all_images = os.listdir("assets")

    picked = random.choice(all_images)
    all_images.remove(picked)
    current_image = picked

    return Image.open(os.path.join("assets", picked))




def transcribe(audio):
    """Transcribe a recorded answer and grade it against the current image.

    The expected answer is the file name stored in the module-level
    ``current_image`` with its extension removed. The transcription is
    stripped of ``,.!?`` and lower-cased, then compared word by word,
    case-insensitively, with the expected answer.

    Args:
        audio: The recording, passed unchanged to the module-level ``pipe``.

    Returns:
        A feedback string, one of:
        * an error notice if the pipeline call raised,
        * "Correct! ..." when one transcribed word equals the answer,
        * "The answer is ... I heard <word>." when a word is a near match
          (similarity ratio of at least 0.8),
        * "The correct answer is ... I heard <transcription>." otherwise.
    """
    # Report pipeline failures to the user instead of crashing the handler.
    # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through.
    try:
        transcribed_audio = pipe(audio)["text"]
    except Exception:
        return "Encountered an error. Are you sure that you recorded audio before submitting?"

    # Drop the punctuation marks in a single pass, then normalise the case.
    transcribed_audio = transcribed_audio.translate(str.maketrans("", "", ",.!?")).lower()

    # split() without an argument ignores leading/repeated whitespace, so no
    # empty "words" end up in the list.
    text_list = transcribed_audio.split()

    # splitext handles any image extension, not only ".png".
    correct_answer = os.path.splitext(current_image)[0]
    # The transcription is lower-cased, so compare against a lower-cased answer;
    # the original spelling is kept for the messages shown to the user.
    expected = correct_answer.lower()

    # Check for a perfect match.
    if expected in text_list:
        return f"Correct! The answer is {correct_answer}."

    # Check for partial match, in case the model mistakes a letter or two.
    for text in text_list:
        if SequenceMatcher(None, text, expected).ratio() >= 0.8:
            return f"The answer is {correct_answer}. I heard {text}."

    # If no match is found.
    return f"The correct answer is {correct_answer}. I heard {transcribed_audio}."


# Build the UI: an introduction, an audio recorder next to the image to name,
# a read-only answer box, and two buttons (submit the answer / load a new image).
with gr.Blocks(title="Interactive Language Learning") as demo:
    # Row 1: introduction and usage instructions.
    with gr.Row():
        gr.Markdown(
    """
    # Interactive Language Learning Prototype
    
    Hello!
    
    This is a prototype app that is meant to help you learn some basic Swedish words. Observe the image, 
    record a one word answer and press the "Submit Answer" button! For a new image, press the "New Image" button.
    """)
    # Row 2: one column with the microphone recorder, one with the image.
    with gr.Row():
        with gr.Column():
            # The recording is the input handed to transcribe() on submit.
            audio = gr.Audio(sources="microphone", type="filepath", label="Record your answer here")
        with gr.Column():
            # open_image() is called once here, while the UI is being built, so
            # the first image is chosen (and current_image set) before launch.
            image = gr.Image(value=open_image(),type="pil", interactive=False)
    # Row 3: read-only box that shows the feedback returned by transcribe().
    with gr.Row():
        answer_box = gr.Text(label="Answer appears here", interactive=False)
    # Row 4: action buttons.
    with gr.Row():
        with gr.Column():
            process_input = gr.Button("Submit Answer")
            # Grade the recording and show the result in the answer box.
            process_input.click(fn=transcribe, inputs=audio, outputs=answer_box)
            # Debug alternative that bypasses the speech model:
            # process_input.click(fn=test_func, inputs=None, outputs=answer_box)
        with gr.Column():
            refresh = gr.Button("New Image")
            # Two listeners on the same click: one swaps in a new image, the
            # other clears the feedback left over from the previous image.
            refresh.click(fn=open_image, inputs=None, outputs=image)
            refresh.click(fn=empty_string, inputs=None, outputs=answer_box)
# Runs whenever this module is executed or imported (there is no __main__ guard).
demo.launch(debug=True)