"""Gradio app that scores a spoken recording against a reference paragraph.

The app has two tabs: one transcribes an uploaded/recorded audio file and
compares the transcription with the text the user typed, the other reads a
paragraph aloud with gTTS.
"""

import difflib
import html
import io
import os
import time

import gradio as gr
import speech_recognition as sr
from gtts import gTTS
from pydub import AudioSegment

# Directory (relative to the working directory) where generated MP3s are saved.
AUDIO_DIR = 'audio'

# Prefix of the URL returned for generated pronunciation clips.
# NOTE(review): this is a hard-coded host plus a specific /tmp/gradio/<hash>/
# directory, while the clips themselves are written under AUDIO_DIR in the
# working directory -- confirm these links actually resolve on the deployment.
_PRONUNCIATION_URL_PREFIX = (
    "https://mr2along-speech-recognize.hf.space/gradio_api/file=/tmp/gradio/"
    "fdecb56cb7306418314e505dff8775673ca46761fc0edf3a891ab99293292055/"
)

# Create audio directory if it doesn't exist (exist_ok avoids a check/create race).
os.makedirs(AUDIO_DIR, exist_ok=True)


# Step 1: Transcribe the audio file
def transcribe_audio(audio):
    """Transcribe an audio file with the Google Web Speech API.

    Args:
        audio: Path of the audio file, or None when nothing was uploaded.

    Returns:
        The transcription on success; otherwise a human-readable error
        message (failures are reported as strings, not raised).
    """
    if audio is None:
        return "No audio file provided."  # Handle the case when no audio is uploaded

    recognizer = sr.Recognizer()

    # Convert to WAV if the audio is not already a WAV file. splitext only
    # touches the final extension, so directory names that happen to contain
    # the extension text are left alone.
    root, ext = os.path.splitext(audio)
    if ext.lower() != '.wav':
        try:
            audio_segment = AudioSegment.from_file(audio)
            wav_path = root + '.wav'
            audio_segment.export(wav_path, format='wav')
            audio = wav_path  # Continue with the converted file
        except Exception as e:
            return f"Error converting audio: {e}"

    # Load the audio into the format the Recognizer expects.
    try:
        with sr.AudioFile(audio) as source:
            audio_data = recognizer.record(source)
    except (ValueError, OSError) as e:
        # Unreadable or missing file: report it like the other failure modes.
        return f"Error reading audio: {e}"

    try:
        # Recognize the audio using Google Web Speech API
        return recognizer.recognize_google(audio_data)
    except sr.UnknownValueError:
        return "Google Speech Recognition could not understand the audio"
    except sr.RequestError as e:
        return f"Error with Google Speech Recognition service: {e}"


def _safe_filename(word):
    """Return *word* reduced to characters that are safe in a file name."""
    cleaned = "".join(
        ch if ch.isalnum() or ch in "-_'" else "_" for ch in word
    )
    return cleaned or "_"


# Step 2: Create pronunciation audio for incorrect words
def create_pronunciation_audio(word):
    """Generate an MP3 of *word* with gTTS and return a URL for it.

    The clip is saved as ``audio/<word>.mp3``; characters other than
    letters, digits, ``-``, ``_`` and ``'`` are replaced with ``_`` so the
    word cannot escape the audio directory or form an invalid path.

    Args:
        word: The word to pronounce.

    Returns:
        The URL string built from the fixed prefix and the saved file path.
    """
    # Wait 5 seconds before each request.
    # NOTE(review): presumably throttles calls to the TTS service -- confirm.
    time.sleep(5)
    tts = gTTS(word)
    audio_file_path = f"{AUDIO_DIR}/{_safe_filename(word)}.mp3"
    tts.save(audio_file_path)  # Save the audio to a file
    print(f"audio/{word}: {audio_file_path}")
    return f"{_PRONUNCIATION_URL_PREFIX}{audio_file_path}"


def _colored(escaped_word, color):
    """Wrap an already-escaped word in a colored span followed by a space."""
    return f'<span style="color: {color};">{escaped_word}</span> '


# Step 3: Compare the transcribed text with the input paragraph
def compare_texts(reference_text, transcribed_text):
    """Build an HTML report comparing the reference text with a transcription.

    The report contains an overall verdict and similarity score (character
    level ``difflib.SequenceMatcher`` ratio, as a percentage), the
    transcription, a color-coded list of the reference words, and a
    pronunciation clip for every word judged incorrect.

    Args:
        reference_text: The paragraph the speaker was supposed to read.
        transcribed_text: The text produced by speech recognition.

    Returns:
        An HTML string. All user-supplied text is HTML-escaped.
    """
    reference_text = reference_text or ""
    transcribed_text = transcribed_text or ""
    reference_words = reference_text.split()
    # Lower-cased once so exact and close matching use the same case rules.
    transcribed_lower = [w.lower() for w in transcribed_text.split()]

    sm = difflib.SequenceMatcher(None, reference_text, transcribed_text)
    similarity_score = round(sm.ratio() * 100, 2)

    # Construct HTML output
    verdict = 'CORRECT' if similarity_score > 50 else 'INCORRECT'
    parts = [
        f"Fidelity Class: {verdict}<br>",
        f"Quality Score: {similarity_score}<br>",
        f"Transcribed Text: {html.escape(transcribed_text)}<br>",
        "Word Score List:<br>",
    ]

    incorrect_words_audios = []  # (word, audio URL) for each incorrect word
    audio_by_word = {}  # Avoid regenerating the clip for a repeated word

    # Generate colored word score list
    for i, word in enumerate(reference_words):
        shown = html.escape(word)
        if i >= len(transcribed_lower):
            # Words in reference that were not transcribed
            parts.append(_colored(shown, 'red'))
        elif word.lower() == transcribed_lower[i]:
            parts.append(_colored(shown, 'green'))  # Correct words in green
        elif difflib.get_close_matches(word.lower(), transcribed_lower):
            parts.append(_colored(shown, 'yellow'))  # Close matches in yellow
        else:
            parts.append(_colored(shown, 'red'))  # Incorrect words in red
            # Create pronunciation audio for the incorrect word
            if word not in audio_by_word:
                audio_by_word[word] = create_pronunciation_audio(word)
            incorrect_words_audios.append((word, audio_by_word[word]))

    # Provide audio for incorrect words
    if incorrect_words_audios:
        parts.append("<br>Pronunciation for Incorrect Words:<br>")
        for word, audio in incorrect_words_audios:
            # NOTE(review): the word is itself a member of reference_words,
            # so the best match is always the word itself -- confirm whether
            # the suggestion was meant to come from a different word list.
            suggestion = difflib.get_close_matches(word, reference_words, n=1)
            suggestion_text = (
                f" (Did you mean: {suggestion[0]}?)" if suggestion else ""
            )
            parts.append(f'{html.escape(word)}: ')
            parts.append(
                f'<audio controls src="{html.escape(audio, quote=True)}"></audio>'
            )
            parts.append(f'{html.escape(suggestion_text)}<br>')

    return "".join(parts)


# Step 4: Text-to-Speech Function
def text_to_speech(paragraph):
    """Read *paragraph* aloud with gTTS.

    Args:
        paragraph: Text to synthesize.

    Returns:
        Path of the saved MP3, or None when no (non-blank) text is provided.
    """
    if not paragraph or not paragraph.strip():
        return None  # Handle the case when no text is provided

    tts = gTTS(paragraph)
    audio_file_path = "audio/paragraph.mp3"
    tts.save(audio_file_path)  # Save the audio to a file
    return audio_file_path


# Gradio Interface Function
def gradio_function(paragraph, audio):
    """Transcribe *audio* and return the HTML comparison with *paragraph*."""
    # NOTE(review): transcribe_audio reports failures as plain strings, so an
    # error message is compared against the paragraph like a transcription.
    transcribed_text = transcribe_audio(audio)

    # Compare the original paragraph with the transcribed text
    return compare_texts(paragraph, transcribed_text)


# Gradio Interface using the updated API
interface = gr.Interface(
    fn=gradio_function,
    inputs=[
        gr.Textbox(lines=5, label="Input Paragraph"),
        gr.Audio(type="filepath", label="Record Audio"),
    ],
    outputs="html",
    title="Speech Recognition Comparison",
    description="Input a paragraph, record your audio, and compare the transcription to the original text.",
)

# Gradio Interface for Text-to-Speech
tts_interface = gr.Interface(
    fn=text_to_speech,
    inputs=gr.Textbox(lines=5, label="Input Paragraph to Read Aloud"),
    outputs=gr.Audio(label="Text-to-Speech Output"),
    title="Text-to-Speech",
    description="This tool will read your input paragraph aloud.",
)

# Combine both interfaces into one
demo = gr.TabbedInterface(
    [interface, tts_interface],
    ["Speech Recognition", "Text-to-Speech"],
)

# Launch Gradio app
demo.launch()