File size: 7,154 Bytes
46e19ef
0d0b31f
d558c26
 
 
85d956d
0d0b31f
3b3baf1
0d0b31f
46e19ef
 
 
2afc1aa
8ef9310
 
0f433ab
 
 
d558c26
3e9568e
 
 
 
 
 
 
 
 
 
 
 
d558c26
8ef9310
0498d1c
3e9568e
8ef9310
 
 
 
 
 
 
 
 
 
 
 
91a2ea1
75a05dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91a2ea1
443bdb5
3e9568e
0d0b31f
75a05dd
0f433ab
290e8e0
75a05dd
145903a
 
91a2ea1
 
d558c26
 
 
0f433ab
d558c26
 
 
 
37a3491
859be22
 
f634650
859be22
f634650
859be22
f634650
859be22
f634650
37a3491
3b3baf1
61c9f90
 
 
 
d558c26
3e9568e
d558c26
61c9f90
37a3491
61c9f90
d558c26
91a2ea1
 
 
0f433ab
40cb73b
3e9568e
37a3491
 
61c9f90
9586c71
 
91a2ea1
37a3491
 
 
 
 
0d0b31f
37a3491
d558c26
91a2ea1
85d956d
290e8e0
 
79b4e39
3e9568e
290e8e0
 
 
85d956d
d558c26
8ef9310
d558c26
8ef9310
d558c26
 
 
 
 
0d0b31f
58f3405
d558c26
 
8ef9310
58f3405
0498d1c
8ef9310
0d0b31f
d558c26
 
 
 
85d956d
 
 
 
0d0b31f
85d956d
 
 
 
 
 
8a9dc3b
d558c26
0d0b31f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
import difflib
import html
import io
import os
import time

import gradio as gr
import requests
import speech_recognition as sr
from gtts import gTTS
from pydub import AudioSegment
# Make sure the directory that receives generated audio exists.
# exist_ok=True replaces the separate exists() check, which could race with
# another process creating the directory between the check and the mkdir.
os.makedirs('audio', exist_ok=True)

# Step 1: Transcribe the audio file
def transcribe_audio(audio):
    """Transcribe an audio file to text with the Google Web Speech API.

    Any file whose extension is not ``.wav`` (case-insensitive) is first
    converted to WAV with pydub; the converted file is written next to the
    original and used for recognition.

    Args:
        audio: Path of the audio file to transcribe, or ``None``.

    Returns:
        The transcription on success. Otherwise a human-readable message:
        no audio supplied, conversion failure, speech not understood, or a
        request error from the recognition service.
    """
    if audio is None:
        return "No audio file provided."  # Handle the case when no audio is uploaded

    recognizer = sr.Recognizer()

    # Split off the real extension rather than str.replace()-ing the
    # extension text: replace() rewrites every occurrence in the path
    # (e.g. "/tmp/mp3s/a.mp3"), does nothing for "a.MP3", and mis-parses
    # paths that have no extension at all.
    root, ext = os.path.splitext(audio)

    if ext.lower() != '.wav':
        try:
            wav_path = f"{root}.wav"
            AudioSegment.from_file(audio).export(wav_path, format='wav')
            audio = wav_path  # Continue with the converted file
        except Exception as e:
            return f"Error converting audio: {e}"

    # Read the whole file into an AudioData object for the recognizer
    with sr.AudioFile(audio) as source:
        audio_data = recognizer.record(source)

    try:
        # Recognize the audio using Google Web Speech API
        return recognizer.recognize_google(audio_data)
    except sr.UnknownValueError:
        return "Google Speech Recognition could not understand the audio"
    except sr.RequestError as e:
        return f"Error with Google Speech Recognition service: {e}"

# Step 2: Upload generated audio and create pronunciation audio for incorrect words
def upfilepath(local_filename):
    """Upload a local file to the Space's Gradio upload endpoint.

    Args:
        local_filename: Path of the file to upload.

    Returns:
        The first element of the server's JSON response when the server
        answers HTTP 200 (callers append it to the file-serving URL);
        ``None`` for any other status code.

    Raises:
        requests.exceptions.RequestException: If the request itself fails,
            including when the timeout expires.
    """
    # URL used to upload the audio file
    upload_url = "https://mr2along-speech-recognize.hf.space/gradio_api/upload?upload_id=yw08d344te"

    # Open the file in a context manager so the handle is closed as soon as
    # the request finishes, and bound the request with a timeout so a stalled
    # server cannot block the caller indefinitely.
    with open(local_filename, 'rb') as file_handle:
        response = requests.post(
            upload_url,
            files={'files': file_handle},
            timeout=30,
        )

    # Inspect the server's answer
    if response.status_code == 200:
        print("Upload thành công!")
        result = response.json()
        extracted_path = result[0]
        print(extracted_path)  # Show the path returned by the server
        return extracted_path

    print(f"Lỗi: {response.status_code}")
    print(response.text)  # Show the error message returned by the server
    return None
        
def create_pronunciation_audio(word):
    """Synthesize *word* with gTTS, upload the MP3, and return its location.

    Args:
        word: The word to pronounce.

    Returns:
        The file-serving URL built from the uploaded path when the upload
        succeeds; otherwise the path of the locally saved MP3, so the
        caller never receives a URL ending in ``None``.
    """
    time.sleep(1)  # Wait 1 second (presumably to throttle TTS requests)
    tts = gTTS(word)
    main_url = "https://mr2along-speech-recognize.hf.space/gradio_api/file="

    # Words come straight from user text; keep only alphanumerics in the file
    # name so characters such as "/" or ".." cannot break or escape audio/.
    safe_name = "".join(ch if ch.isalnum() else "_" for ch in word)
    audio_file_path = f"audio/{safe_name}.mp3"  # Save the audio to a file
    tts.save(audio_file_path)

    word_audio = upfilepath(audio_file_path)
    if not word_audio:
        # Upload failed (upfilepath returned None): fall back to the local file
        return audio_file_path
    return f"{main_url}{word_audio}"  # URL of the uploaded audio

# Step 3: Compare the transcribed text with the input paragraph
def compare_texts(reference_text, transcribed_text):
    """Score a transcription against the reference text and render an HTML report.

    The overall score is ``difflib.SequenceMatcher.ratio()`` over the two raw
    strings, expressed as a percentage rounded to two decimals. Words are then
    compared position by position: an exact case-insensitive match is green,
    a difflib close match is yellow, anything else (including reference words
    with no transcribed counterpart) is red. Pronunciation audio is requested
    for red words that do have a transcribed counterpart.

    Args:
        reference_text: The paragraph the speaker was supposed to read.
        transcribed_text: The text produced by speech recognition.

    Returns:
        A two-item list ``[html_output, audio_paths]`` where ``html_output``
        is the report markup and ``audio_paths`` holds one entry per word for
        which pronunciation audio was created.
    """
    reference_words = reference_text.split()
    transcribed_words = transcribed_text.split()
    incorrect_words_audios = []  # (word, audio location) pairs for incorrect words

    sm = difflib.SequenceMatcher(None, reference_text, transcribed_text)
    similarity_score = round(sm.ratio() * 100, 2)

    if similarity_score >= 85:
        fidelity = "GOOD (>=85%)"
    elif similarity_score >= 70:
        fidelity = "ACCEPTABLE (70% - 85%)"
    elif similarity_score >= 50:
        fidelity = "NEEDS IMPROVEMENT (50% - 70%)"
    else:
        fidelity = "POOR (<50%)"

    # Collect fragments and join once at the end. All text that originates
    # from the user or the recognizer is HTML-escaped before being embedded.
    parts = [
        "<strong>Fidelity Class:</strong> ",
        f"<strong>{fidelity}</strong><br>",
        f"<strong>Quality Score:</strong> {similarity_score}%<br>",
        f"<strong>Transcribed Text:</strong> {html.escape(transcribed_text)}<br>",
        "<strong>Word Score List:</strong><br>",
    ]

    # Generate colored word score list
    transcribed_count = len(transcribed_words)
    for i, word in enumerate(reference_words):
        safe_word = html.escape(word)

        if i >= transcribed_count:
            # Word in reference that was not transcribed
            parts.append(f'<span style="color: red;">{safe_word}</span> ')
            continue

        heard = transcribed_words[i]
        if word.lower() == heard.lower():
            parts.append(f'<span style="color: green;">{safe_word}</span> ')  # Correct
        elif difflib.get_close_matches(word, [heard]):
            parts.append(f'<span style="color: yellow;">{safe_word}</span> ')  # Close match
        else:
            parts.append(f'<span style="color: red;">{safe_word}</span> ')  # Incorrect
            # Audio generation depends on external services; a failure there
            # should not discard the whole report, so log it and carry on.
            try:
                audio_file_path = create_pronunciation_audio(word)
            except Exception as exc:
                print(f"Could not create pronunciation audio for {word!r}: {exc}")
            else:
                incorrect_words_audios.append((word, audio_file_path))

    # Provide audio for incorrect words
    if incorrect_words_audios:
        parts.append("<br><strong>Pronunciation for Incorrect Words:</strong><br>")
        for word, audio in incorrect_words_audios:
            # NOTE(review): word is itself taken from reference_words, so this
            # lookup returns the word itself; kept as-is pending a decision on
            # what the suggestion should show.
            suggestion = difflib.get_close_matches(word, reference_words, n=1)
            suggestion_text = (
                f" (Did you mean: <em>{html.escape(suggestion[0])}</em>?)"
                if suggestion
                else ""
            )
            parts.append(f'{html.escape(word)}: ')
            # The generated files are MP3, so advertise audio/mpeg.
            parts.append(
                f'<audio controls><source src="{html.escape(audio)}" type="audio/mpeg">'
                f'Your browser does not support the audio tag.</audio>{suggestion_text}<br>'
            )

    html_output = "".join(parts)
    return [html_output, [audio for _, audio in incorrect_words_audios]]

# Step 4: Text-to-Speech Function
def text_to_speech(paragraph):
    """Render *paragraph* as speech and return the path of the saved MP3.

    Returns ``None`` when no text is supplied; otherwise the audio is written
    to ``audio/paragraph.mp3`` and that path is returned.
    """
    if paragraph:
        destination = "audio/paragraph.mp3"
        gTTS(paragraph).save(destination)
        return destination

    # Nothing to read aloud
    return None

# Gradio Interface Function
def gradio_function(paragraph, audio):
    """Gradio callback: transcribe *audio* and compare it with *paragraph*.

    Returns whatever ``compare_texts`` returns for the paragraph and the
    transcription of the recording.
    """
    return compare_texts(paragraph, transcribe_audio(audio))
    
# Speech-recognition tab: reference paragraph + recording in, HTML report + files out
_paragraph_input = gr.Textbox(lines=5, label="Input Paragraph")
_recording_input = gr.Audio(type="filepath", label="Record Audio")

interface = gr.Interface(
    fn=gradio_function,
    inputs=[_paragraph_input, _recording_input],
    outputs=["html", "files"],
    title="Speech Recognition Comparison",
    description="Input a paragraph, record your audio, and compare the transcription to the original text.",
)

# Text-to-speech tab: paragraph in, synthesized audio out
_tts_text_input = gr.Textbox(lines=5, label="Input Paragraph to Read Aloud")
_tts_audio_output = gr.Audio(label="Text-to-Speech Output")

tts_interface = gr.Interface(
    fn=text_to_speech,
    inputs=_tts_text_input,
    outputs=_tts_audio_output,
    title="Text-to-Speech",
    description="This tool will read your input paragraph aloud.",
)

# Present the two interfaces as tabs of a single app, then start it
_tab_interfaces = [interface, tts_interface]
_tab_titles = ["Speech Recognition", "Text-to-Speech"]
demo = gr.TabbedInterface(_tab_interfaces, _tab_titles)

demo.launch()