"""Gradio app: score a spoken reading against a reference paragraph.

Tab 1 transcribes a recording with the Google Web Speech API and renders an
HTML report comparing it with the paragraph the user typed.  Tab 2 reads a
paragraph aloud with gTTS.
"""

import base64
import difflib
import html
import io
import string
import tempfile

import gradio as gr
import speech_recognition as sr
from gtts import gTTS

# Colours used for the per-word report.
_COLOR_CORRECT = "green"
_COLOR_CLOSE = "yellow"
_COLOR_INCORRECT = "red"


# Step 1: Transcribe the audio file
def _recognize_speech(audio):
    """Transcribe *audio* and report failures separately from the text.

    Args:
        audio: Path to an audio file readable by ``sr.AudioFile``.

    Returns:
        A ``(text, error)`` pair.  Exactly one of the two is ``None``: ``text``
        on failure, ``error`` on success.
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio) as source:
        audio_data = recognizer.record(source)
    try:
        # Recognize the audio using the Google Web Speech API.
        return recognizer.recognize_google(audio_data), None
    except sr.UnknownValueError:
        return None, "Google Speech Recognition could not understand the audio"
    except sr.RequestError as e:
        return None, f"Error with Google Speech Recognition service: {e}"


def transcribe_audio(audio):
    """Return the transcription of *audio*, or an error message string.

    Kept for backward compatibility: on failure the returned string is the
    human-readable error message rather than a transcription.
    """
    text, error = _recognize_speech(audio)
    return text if error is None else error


# Step 2: Create pronunciation audio for incorrect words
def _synthesize_mp3(text):
    """Return MP3 bytes of *text* spoken by gTTS."""
    buffer = io.BytesIO()
    # gTTS.save() expects a file *path*; write_to_fp() is the call that
    # accepts a file-like object such as BytesIO.
    gTTS(text).write_to_fp(buffer)
    return buffer.getvalue()


def create_pronunciation_audio(word):
    """Return an in-memory MP3 (``io.BytesIO`` positioned at 0) of *word*."""
    return io.BytesIO(_synthesize_mp3(word))


# Step 3: Compare the transcribed text with the input paragraph
def _normalize_word(word):
    """Lower-case *word* and strip surrounding punctuation for comparison."""
    return word.strip(string.punctuation).lower()


def _colored_word(word, color):
    """Return *word* HTML-escaped inside a coloured ``<span>``."""
    return f'<span style="color: {color};">{html.escape(word)}</span> '


def compare_texts(reference_text, transcribed_text):
    """Build an HTML report comparing the transcription with the reference.

    The quality score is the character-level ``difflib.SequenceMatcher`` ratio
    of the two raw strings, as a percentage rounded to two decimals; the
    reading is classed CORRECT when that score exceeds 50.  Reference words
    are then compared position by position with the transcription, ignoring
    case and surrounding punctuation:

    * same word at the same position -> green
    * a close match anywhere in the transcription -> yellow
    * otherwise, or no transcribed word at that position -> red

    Words that were transcribed but have no close match also get an embedded
    audio player with their pronunciation.

    Args:
        reference_text: The paragraph the speaker was supposed to read.
        transcribed_text: What the speech recognizer heard.

    Returns:
        The report as an HTML string.
    """
    reference_words = reference_text.split()
    spoken_words = [_normalize_word(w) for w in transcribed_text.split()]

    matcher = difflib.SequenceMatcher(None, reference_text, transcribed_text)
    similarity_score = round(matcher.ratio() * 100, 2)
    fidelity = "CORRECT" if similarity_score > 50 else "INCORRECT"

    word_spans = []
    words_to_pronounce = []  # unique, in order of first appearance
    for i, word in enumerate(reference_words):
        target = _normalize_word(word)
        if i >= len(spoken_words):
            # Words in the reference that were not transcribed at all.
            word_spans.append(_colored_word(word, _COLOR_INCORRECT))
        elif target == spoken_words[i]:
            word_spans.append(_colored_word(word, _COLOR_CORRECT))
        elif difflib.get_close_matches(target, spoken_words):
            word_spans.append(_colored_word(word, _COLOR_CLOSE))
        else:
            word_spans.append(_colored_word(word, _COLOR_INCORRECT))
            # Pronounce the bare word; skip punctuation-only tokens, which
            # gTTS cannot speak, and avoid repeating a network call for
            # a word that is already queued.
            bare_word = word.strip(string.punctuation)
            if bare_word and bare_word not in words_to_pronounce:
                words_to_pronounce.append(bare_word)

    parts = [
        f"<h2>Fidelity Class: {fidelity}</h2>",
        f"<h2>Quality Score: {similarity_score}</h2>",
        # User-supplied text is escaped before being placed in HTML.
        f"<p>Transcribed Text: {html.escape(transcribed_text)}</p>",
        "<h3>Word Score List:</h3>",
        "<p>" + "".join(word_spans) + "</p>",
    ]

    # Provide audio for incorrect words.
    if words_to_pronounce:
        parts.append("<h3>Pronunciation for Incorrect Words:</h3>")
        for word in words_to_pronounce:
            mp3_bytes = create_pronunciation_audio(word).read()
            encoded = base64.b64encode(mp3_bytes).decode("ascii")
            # Embed the clip as a data URI so the HTML is self-contained.
            parts.append(
                f"<p>{html.escape(word)}: "
                f'<audio controls src="data:audio/mpeg;base64,{encoded}">'
                "</audio></p>"
            )

    return "".join(parts)


# Step 4: Text-to-Speech Function
def text_to_speech(paragraph):
    """Synthesize *paragraph* to an MP3 file and return its path.

    A file path is returned because it is a value the ``gr.Audio`` output
    component accepts.  Returns ``None`` (no audio) when the paragraph is
    empty or whitespace-only, since gTTS cannot speak empty text.
    """
    if not paragraph or not paragraph.strip():
        return None
    # delete=False: the file must outlive this function so Gradio can serve
    # it; it is left in the system temp directory.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as mp3_file:
        gTTS(paragraph).write_to_fp(mp3_file)
    return mp3_file.name


# Gradio Interface Function
def gradio_function(paragraph, audio):
    """Transcribe *audio* and return the HTML comparison with *paragraph*.

    Missing input and recognition failures are reported as a short HTML
    message instead of being scored as if they were a transcription.
    """
    if not paragraph or not paragraph.strip():
        return "<p>Please enter a paragraph to compare against.</p>"
    if not audio:
        return "<p>Please record audio before submitting.</p>"

    transcribed_text, error = _recognize_speech(audio)
    if error is not None:
        return f"<p>{html.escape(error)}</p>"

    return compare_texts(paragraph, transcribed_text)


# Gradio Interface using the updated API
interface = gr.Interface(
    fn=gradio_function,
    inputs=[
        gr.Textbox(lines=5, label="Input Paragraph"),
        gr.Audio(type="filepath", label="Record Audio"),
    ],
    outputs="html",
    title="Speech Recognition Comparison",
    description="Input a paragraph, record your audio, and compare the transcription to the original text.",
)

# Gradio Interface for Text-to-Speech
tts_interface = gr.Interface(
    fn=text_to_speech,
    inputs=gr.Textbox(lines=5, label="Input Paragraph to Read Aloud"),
    outputs=gr.Audio(label="Text-to-Speech Output"),
    title="Text-to-Speech",
    description="This tool will read your input paragraph aloud.",
)

# Combine both interfaces into one
demo = gr.TabbedInterface(
    [interface, tts_interface],
    ["Speech Recognition", "Text-to-Speech"],
)

# Launch the Gradio app only when run as a script, so importing this module
# (e.g. for reuse of the helpers) does not start a server.
if __name__ == "__main__":
    demo.launch()