import base64
import difflib
import io
import tempfile

import gradio as gr
import speech_recognition as sr
from gtts import gTTS

# Step 1: Transcribe the audio file
def transcribe_audio(audio):
    recognizer = sr.Recognizer()

    # Open the recorded file (Gradio passes a filepath) as a SpeechRecognition source
    audio_file = sr.AudioFile(audio)
    
    with audio_file as source:
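        # record() reads the whole file into a single AudioData object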
        audio_data = recognizer.record(source)

    try:
        # Recognize the audio using Google Web Speech API
        transcription = recognizer.recognize_google(audio_data)
        return transcription
    except sr.UnknownValueError:
        return "Google Speech Recognition could not understand the audio"
    except sr.RequestError as e:
        return f"Error with Google Speech Recognition service: {e}"

# Step 2: Create pronunciation audio for incorrect words
def create_pronunciation_audio(word):
    tts = gTTS(word)
    audio_buffer = io.BytesIO()
    # gTTS.save() expects a filename, so stream the MP3 bytes into the buffer instead
    tts.write_to_fp(audio_buffer)
    audio_buffer.seek(0)
    return audio_buffer

# Step 3: Compare the transcribed text with the input paragraph
def compare_texts(reference_text, transcribed_text):
    reference_words = reference_text.split()
    transcribed_words = transcribed_text.split()
    incorrect_words_audios = []  # Store audio buffers for incorrect words

    sm = difflib.SequenceMatcher(None, reference_text, transcribed_text)
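    # SequenceMatcher.ratio() gives a similarity in [0, 1]; scale it to a percentage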
    similarity_score = round(sm.ratio() * 100, 2)

    # Construct HTML output
    html_output = f"<strong>Fidelity Class:</strong> {'CORRECT' if similarity_score > 50 else 'INCORRECT'}<br>"
    html_output += f"<strong>Quality Score:</strong> {similarity_score}<br>"
    html_output += f"<strong>Transcribed Text:</strong> {transcribed_text}<br>"
    html_output += "<strong>Word Score List:</strong><br>"

    # Generate colored word score list
    for i, word in enumerate(reference_words):
        try:
            if word.lower() == transcribed_words[i].lower():
                html_output += f'<span style="color: green;">{word}</span> '  # Correct words in green
            elif difflib.get_close_matches(word, transcribed_words):
                html_output += f'<span style="color: yellow;">{word}</span> '  # Close matches in yellow
            else:
                # Incorrect words in red
                html_output += f'<span style="color: red;">{word}</span> '
                # Create pronunciation audio for the incorrect word
                audio_buffer = create_pronunciation_audio(word)
                incorrect_words_audios.append((word, audio_buffer))
        except IndexError:
            html_output += f'<span style="color: red;">{word}</span> '  # Words in reference that were not transcribed

    # Provide audio for incorrect words
    if incorrect_words_audios:
        html_output += "<br><strong>Pronunciation for Incorrect Words:</strong><br>"
        for word, audio in incorrect_words_audios:
            html_output += f'{word}: '
            # Embed the MP3 as a base64 data URI so it plays inline in the HTML output
            encoded_audio = base64.b64encode(audio.getvalue()).decode("ascii")
            html_output += f'<audio controls><source src="data:audio/mpeg;base64,{encoded_audio}" type="audio/mpeg">Your browser does not support the audio tag.</audio><br>'

    return html_output

# Step 4: Text-to-Speech Function
def text_to_speech(paragraph):
    tts = gTTS(paragraph)
    # gr.Audio output expects a filepath (or raw numpy audio), so save the MP3 to a temporary file
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    tmp.close()
    tts.save(tmp.name)
    return tmp.name

# Gradio Interface Function
def gradio_function(paragraph, audio):
    # Transcribe the audio
    transcribed_text = transcribe_audio(audio)

    # Compare the original paragraph with the transcribed text
    comparison_result = compare_texts(paragraph, transcribed_text)

    # Return comparison result
    return comparison_result

# Gradio Interface using the updated API
interface = gr.Interface(
    fn=gradio_function, 
    inputs=[
        gr.Textbox(lines=5, label="Input Paragraph"),
        gr.Audio(type="filepath", label="Record Audio")
    ], 
    outputs="html",
    title="Speech Recognition Comparison",
    description="Input a paragraph, record your audio, and compare the transcription to the original text."
)

# Gradio Interface for Text-to-Speech
tts_interface = gr.Interface(
    fn=text_to_speech,
    inputs=gr.Textbox(lines=5, label="Input Paragraph to Read Aloud"),
    outputs=gr.Audio(label="Text-to-Speech Output"),
    title="Text-to-Speech",
    description="This tool will read your input paragraph aloud."
)

# Combine both interfaces into one
demo = gr.TabbedInterface([interface, tts_interface], ["Speech Recognition", "Text-to-Speech"])

# Launch Gradio app
demo.launch()
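# launch() serves the app locally (typically at http://127.0.0.1:7860);
# pass share=True to demo.launch() if a temporary public link is needed.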