"""Gradio app that scores a spoken recording against a reference paragraph.

The "Speech Recognition" tab transcribes a recording with Google Speech
Recognition, compares it word by word with the input paragraph and renders an
HTML report (fidelity class, score, respelling, IPA, colored word list and
pronunciation audio for the words that were missed).  The "Text-to-Speech" tab
reads a paragraph aloud with gTTS.
"""
import base64
import contextlib
import difflib
import html
import io
import os
import re
import tempfile
import time

import epitran
import gradio as gr
import pronouncing
import requests
import speech_recognition as sr
from gtts import gTTS
from pydub import AudioSegment

# Create audio directory if it doesn't exist
os.makedirs('audio', exist_ok=True)

# Initialize the epitran object for English.  Keep the name bound even when
# initialization fails so ipa_transcription() can degrade gracefully.
try:
    epi = epitran.Epitran('eng-Latn')
except Exception as e:
    print(f"Error initializing Epitran: {e}")
    epi = None

# Leading/trailing non-word characters (quotes, commas, periods, ...).
_EDGE_PUNCTUATION = re.compile(r"^\W+|\W+$")
# Anything that is not safe to use inside a file name.
_UNSAFE_FILENAME_CHARS = re.compile(r"[^\w\-]+")

# ARPAbet phoneme (stress digit removed) -> plain-English respelling.
_ARPABET_RESPELLING = {
    "AA": "ah", "AE": "a", "AH": "uh", "AO": "aw", "AW": "ow",
    "AY": "eye", "EH": "eh", "ER": "ur", "EY": "ay", "IH": "ih",
    "IY": "ee", "OW": "oh", "OY": "oy", "UH": "uu", "UW": "oo",
    "B": "b", "CH": "ch", "D": "d", "DH": "dh", "F": "f", "G": "g",
    "HH": "h", "JH": "j", "K": "k", "L": "l", "M": "m", "N": "n",
    "NG": "ng", "P": "p", "R": "r", "S": "s", "SH": "sh", "T": "t",
    "TH": "th", "V": "v", "W": "w", "Y": "y", "Z": "z", "ZH": "zh",
}


def _strip_punctuation(word):
    """Return *word* without leading/trailing punctuation (case preserved)."""
    return _EDGE_PUNCTUATION.sub("", word)


def _normalize_word(word):
    """Return the case- and punctuation-insensitive form used for comparison."""
    return _strip_punctuation(word).lower()


def _transcribe(audio):
    """Transcribe the audio file at path *audio*.

    Returns a ``(text, error)`` pair in which exactly one element is ``None``,
    so callers can tell a real transcription from a failure message.
    """
    if audio is None:
        return None, "No audio file provided."
    # Check if the file exists
    if not os.path.isfile(audio):
        return None, "Audio file not found."

    recognizer = sr.Recognizer()
    converted_path = None
    try:
        # Only the extension decides whether a conversion is needed; the rest
        # of the path is never rewritten.
        if os.path.splitext(audio)[1].lower() != '.wav':
            fd, converted_path = tempfile.mkstemp(suffix='.wav')
            os.close(fd)
            try:
                AudioSegment.from_file(audio).export(converted_path, format='wav')
            except Exception as e:
                return None, f"Error converting audio: {e}"
            audio = converted_path

        with sr.AudioFile(audio) as source:
            audio_data = recognizer.record(source)

        try:
            return recognizer.recognize_google(audio_data), None
        except sr.UnknownValueError:
            return None, "Google Speech Recognition could not understand the audio"
        except sr.RequestError as e:
            return None, f"Error with Google Speech Recognition service: {e}"
    finally:
        # Do not leave one temporary WAV behind per request.
        if converted_path is not None:
            with contextlib.suppress(OSError):
                os.remove(converted_path)


# Step 1: Transcribe the audio file
def transcribe_audio(audio):
    """Return the transcription of *audio*, or a human-readable error message.

    *audio* is a file path (or ``None``).  Non-WAV input is converted to WAV
    before it is sent to Google Speech Recognition.
    """
    text, error = _transcribe(audio)
    return text if error is None else error


# Step 2: Create pronunciation audio for incorrect words (locally)
def create_pronunciation_audio(word):
    """Synthesize *word* with gTTS and return the path of the saved MP3.

    On failure a message starting with "Failed to create pronunciation audio"
    is returned instead of a path.
    """
    # Keep characters such as "/" or "?" out of the file name.
    safe_name = _UNSAFE_FILENAME_CHARS.sub("_", word).strip("_") or "word"
    audio_file_path = f"audio/{safe_name}.mp3"
    try:
        gTTS(word).save(audio_file_path)
    except Exception as e:
        return f"Failed to create pronunciation audio: {e}"
    return audio_file_path  # Return the local path instead of uploading


def _respell_phones(phones):
    """Convert one ARPAbet pronunciation (e.g. ``"HH AH0 L OW1"``) to a respelling.

    The vowel carrying primary stress is written in upper case.
    """
    pieces = []
    for phone in phones.split():
        sound = _ARPABET_RESPELLING.get(phone.rstrip("012"), phone.lower())
        pieces.append(sound.upper() if phone.endswith("1") else sound)
    return "".join(pieces)


# Function for phonetic respelling
def phonetic_respelling(sentence):
    """Return *sentence* with each known word replaced by a phonetic respelling.

    Words that are missing from the CMU pronouncing dictionary are kept as-is.
    """
    respelled = []
    for word in sentence.split():
        # phones_for_word() looks a *word* up; pronouncing.search() expects a
        # regular expression over phonemes and is not suitable here.
        pronunciations = pronouncing.phones_for_word(_strip_punctuation(word))
        if pronunciations:
            respelled.append(_respell_phones(pronunciations[0]))
        else:
            respelled.append(word)
    return ' '.join(respelled)


# Function for IPA transcription
def ipa_transcription(sentence):
    """Return the IPA transliteration of *sentence*, or a failure message."""
    if epi is None:
        return "IPA transcription failed."
    try:
        return epi.transliterate(sentence)
    except Exception as e:
        print(f"Error during IPA transcription: {e}")
        return "IPA transcription failed."


def _audio_tag(audio_path):
    """Return an ``<audio>`` element that embeds the MP3 at *audio_path*.

    The file is inlined as a base64 data URI so playback does not depend on the
    host name or file-serving configuration of the deployment.  An empty string
    is returned when *audio_path* is not a readable file (for example when it
    is the error message returned by create_pronunciation_audio()).
    """
    if not os.path.isfile(audio_path):
        return ""
    try:
        with open(audio_path, "rb") as audio_file:
            encoded = base64.b64encode(audio_file.read()).decode("ascii")
    except OSError:
        return ""
    return (
        f'<audio controls src="data:audio/mpeg;base64,{encoded}">'
        'Your browser does not support the audio tag.</audio>'
    )


# Step 3: Compare the transcribed text with the input paragraph
def compare_texts(reference_text, transcribed_text):
    """Build the HTML report comparing *transcribed_text* with *reference_text*.

    The comparison ignores case and surrounding punctuation.  Words are aligned
    with difflib so that one skipped or extra word does not shift every
    following word out of place.

    Returns a one-element list containing the HTML string.
    """
    reference_text = reference_text or ""
    transcribed_text = transcribed_text or ""

    # Tokens consisting only of punctuation carry no pronunciation; drop them.
    reference_words = [w for w in reference_text.split() if _normalize_word(w)]
    transcribed_words = [w for w in transcribed_text.split() if _normalize_word(w)]
    reference_norm = [_normalize_word(w) for w in reference_words]
    transcribed_norm = [_normalize_word(w) for w in transcribed_words]

    if reference_norm:
        # autojunk=False: the default heuristic discards frequent characters in
        # sequences of 200+ items, which distorts the ratio for paragraphs.
        sm = difflib.SequenceMatcher(
            None, " ".join(reference_norm), " ".join(transcribed_norm), autojunk=False
        )
        similarity_score = round(sm.ratio() * 100, 2)
    else:
        similarity_score = 0.0

    # Construct HTML output with detailed fidelity class
    if similarity_score >= 85:
        fidelity = "GOOD (&gt;=85%)"
    elif similarity_score >= 70:
        fidelity = "ACCEPTABLE (70% - 85%)"
    elif similarity_score >= 50:
        fidelity = "NEEDS IMPROVEMENT (50% - 70%)"
    else:
        fidelity = "POOR (&lt;50%)"

    parts = [
        f"<strong>Fidelity Class:</strong> <strong>{fidelity}</strong><br>",
        f"<strong>Quality Score:</strong> {similarity_score}%<br>",
        f"<strong>Transcribed Text:</strong> {html.escape(transcribed_text)}<br>",
        f"<strong>Input Sentence:</strong> {html.escape(reference_text)}<br>",
        "<strong>Phonetic Respelling:</strong> "
        f"{html.escape(phonetic_respelling(reference_text))}<br>",
        "<strong>IPA Transcription:</strong> "
        f"{html.escape(ipa_transcription(reference_text))}<br>",
        "<strong>Word Score List:</strong><br>",
    ]

    # normalized word -> (word to pronounce, what the recognizer heard);
    # a dict so a word that is missed several times is listed only once.
    incorrect_words = {}

    # Generate colored word score list
    word_matcher = difflib.SequenceMatcher(
        None, reference_norm, transcribed_norm, autojunk=False
    )
    for tag, i1, i2, j1, j2 in word_matcher.get_opcodes():
        if tag == "insert":
            continue  # Extra spoken words have no reference word to color
        for offset, i in enumerate(range(i1, i2)):
            j = j1 + offset
            if tag == "equal":
                color = "green"  # Correct words in green
            elif tag == "replace" and j < j2:
                if difflib.get_close_matches(reference_norm[i], [transcribed_norm[j]]):
                    color = "yellow"  # Close matches in yellow
                else:
                    color = "red"  # Incorrect words in red
                    incorrect_words.setdefault(
                        reference_norm[i],
                        (_strip_punctuation(reference_words[i]), transcribed_words[j]),
                    )
            else:
                color = "red"  # Word in reference that was not transcribed
            parts.append(
                f'<span style="color: {color};">{html.escape(reference_words[i])}</span> '
            )

    # Provide audio for incorrect words
    if incorrect_words:
        parts.append("<br><strong>Pronunciation for Incorrect Words:</strong><br>")
        for word, heard in incorrect_words.values():
            audio_path = create_pronunciation_audio(word)
            parts.append(
                f"{html.escape(word)}: {_audio_tag(audio_path)}"
                f" (Recognized as: {html.escape(heard)})<br>"
            )

    return ["".join(parts)]


# Step 4: Text-to-Speech Function
def text_to_speech(paragraph):
    """Synthesize *paragraph* with gTTS and return the path of the saved MP3.

    Returns ``None`` when no text (or only whitespace) is provided.
    """
    if not paragraph or not paragraph.strip():
        return None  # Handle the case when no text is provided
    tts = gTTS(paragraph)
    audio_file_path = "audio/paragraph.mp3"  # Save the audio to a file
    tts.save(audio_file_path)
    return audio_file_path  # Return the file path


# Gradio Interface Function
def gradio_function(paragraph, audio):
    """Transcribe *audio* and return the comparison report for *paragraph*.

    When transcription fails the failure is reported on its own instead of
    being scored as if it were the user's speech.
    """
    transcribed_text, error = _transcribe(audio)
    if error is not None:
        return [f"<strong>Transcription failed:</strong> {html.escape(error)}"]
    # Compare the original paragraph with the transcribed text
    return compare_texts(paragraph, transcribed_text)


# Gradio Interface using the updated API
interface = gr.Interface(
    fn=gradio_function,
    inputs=[
        gr.Textbox(lines=5, label="Input Paragraph"),
        gr.Audio(type="filepath", label="Record Audio"),
    ],
    outputs=["html"],
    title="Speech Recognition Comparison",
    description="Input a paragraph, record your audio, and compare the transcription to the original text.",
)

# Gradio Interface for Text-to-Speech
tts_interface = gr.Interface(
    fn=text_to_speech,
    inputs=gr.Textbox(lines=5, label="Input Paragraph to Read Aloud"),
    outputs=gr.Audio(label="Text-to-Speech Output"),
    title="Text-to-Speech",
    description="This tool will read your input paragraph aloud.",
)

# Combine both interfaces into one
demo = gr.TabbedInterface([interface, tts_interface], ["Speech Recognition", "Text-to-Speech"])

# Launch Gradio app
demo.launch()