import os
import io
import time
import difflib

import requests
import speech_recognition as sr
import gradio as gr
from gtts import gTTS
from pydub import AudioSegment
from underthesea import phonetic

# Directory for generated pronunciation / TTS clips.
os.makedirs('audio', exist_ok=True)


# Step 1: Transcribe the audio file
def transcribe_audio(audio):
    """Transcribe a recorded audio file to Vietnamese text.

    Parameters:
        audio: path to the recorded file (any format pydub can decode),
            or None when nothing was recorded.

    Returns:
        The transcription string on success, otherwise a human-readable
        error message (callers display whatever comes back).
    """
    if audio is None:
        return "No audio file provided."
    if not os.path.isfile(audio):
        return "Audio file not found."

    recognizer = sr.Recognizer()

    # speech_recognition only reads WAV/AIFF/FLAC; convert anything else.
    audio_format = audio.split('.')[-1].lower()
    if audio_format != 'wav':
        try:
            audio_segment = AudioSegment.from_file(audio)
            # Swap only the extension. The original used
            # `audio.replace(audio_format, 'wav')`, which rewrites the first
            # occurrence of the extension string anywhere in the path
            # (e.g. a folder literally named "mp3").
            wav_path = os.path.splitext(audio)[0] + '.wav'
            audio_segment.export(wav_path, format='wav')
            audio = wav_path
        except Exception as e:
            return f"Error converting audio: {e}"

    with sr.AudioFile(audio) as source:
        audio_data = recognizer.record(source)

    try:
        # Vietnamese recognition model.
        return recognizer.recognize_google(audio_data, language='vi-VN')
    except sr.UnknownValueError:
        return "Google Speech Recognition could not understand the audio"
    except sr.RequestError as e:
        return f"Error with Google Speech Recognition service: {e}"


# Step 2: Create pronunciation audio for incorrect words (locally)
def create_pronunciation_audio(word):
    """Generate an MP3 pronouncing *word* and return its local path.

    Returns an error-message string on failure (the caller embeds whatever
    comes back, so no exception escapes).
    """
    try:
        # lang='vi': the app teaches Vietnamese; gTTS's default is an
        # English voice, which mispronounced the reference words.
        tts = gTTS(word, lang='vi')
        audio_file_path = f"audio/{word}.mp3"
        tts.save(audio_file_path)
        return audio_file_path  # Local path; uploaded later on demand.
    except Exception as e:
        return f"Failed to create pronunciation audio: {e}"


# Upload function to Hugging Face Space
def upfilepath(local_filename):
    """Upload a local file to the HF Space and return its server-side path.

    Returns:
        The uploaded path on HTTP 200, None on any other status code, or an
        error-message string on timeout / unexpected failure.
    """
    ts = time.time()
    upload_url = f"https://mr2along-speech-recognize.hf.space/gradio_api/upload?upload_id={ts}"
    try:
        # `with` closes the handle; the original leaked an open file
        # descriptor on every upload.
        with open(local_filename, 'rb') as fh:
            response = requests.post(upload_url, files={'files': fh}, timeout=30)
        if response.status_code == 200:
            result = response.json()
            return result[0]
        return None
    except requests.exceptions.Timeout:
        return "Request timed out. Please try again."
    except Exception as e:
        return f"An error occurred: {e}"


# Step 3: Compare the transcribed text with the input paragraph
def compare_texts(reference_text, transcribed_text):
    """Build an HTML report comparing the reference paragraph with the transcription.

    The report contains a fidelity class, a similarity score, a per-word
    colour-coded list (green = correct, yellow = close, red = wrong/missing),
    pronunciation audio for each incorrect word, and a Vietnamese phonetic
    transcription of the reference text.

    Returns a one-element list so it maps onto the Gradio `outputs=["html"]`.
    """
    reference_words = reference_text.split()
    transcribed_words = transcribed_text.split()
    incorrect_words_audios = []  # (word, local audio path) for wrong words

    # Character-level similarity of the two full strings, as a percentage.
    sm = difflib.SequenceMatcher(None, reference_text, transcribed_text)
    similarity_score = round(sm.ratio() * 100, 2)

    html_output = "Fidelity Class: "
    if similarity_score >= 85:
        html_output += "GOOD (&gt;=85%)<br>"
    elif similarity_score >= 70:
        html_output += "ACCEPTABLE (70% - 85%)<br>"
    elif similarity_score >= 50:
        html_output += "NEEDS IMPROVEMENT (50% - 70%)<br>"
    else:
        html_output += "POOR (&lt;50%)<br>"
    html_output += f"Quality Score: {similarity_score}%<br>"
    html_output += f"Transcribed Text: {transcribed_text}<br>"
    html_output += "Word Score List:<br>"

    # Colour each reference word by how well position i was transcribed.
    for i, word in enumerate(reference_words):
        try:
            if word.lower() == transcribed_words[i].lower():
                # Correct words in green
                html_output += f"<span style='color:green'>{word}</span> "
            elif difflib.get_close_matches(word, [transcribed_words[i]]):
                # Close matches in yellow
                html_output += f"<span style='color:yellow'>{word}</span> "
            else:
                # Incorrect words in red
                html_output += f"<span style='color:red'>{word}</span> "
                # Create pronunciation audio for the incorrect word.
                audio_file_path = create_pronunciation_audio(word)
                incorrect_words_audios.append((word, audio_file_path))
        except IndexError:
            # Word in reference that was not transcribed at all.
            html_output += f"<span style='color:red'>{word}</span> "

    # Provide audio for incorrect words.
    if incorrect_words_audios:
        html_output += "<br>Pronunciation for Incorrect Words:<br>"
        for word, audio in incorrect_words_audios:
            suggestion = difflib.get_close_matches(word, reference_words, n=1)
            suggestion_text = f" (Did you mean: {suggestion[0]}?)" if suggestion else ""
            up_audio = upfilepath(audio)
            if up_audio:
                audio_src = f"https://mr2along-speech-recognize.hf.space/gradio_api/file={up_audio}"
                html_output += f"{word}: <audio controls src='{audio_src}'></audio>"
            else:
                # Upload failed; still list the word and suggestion.
                html_output += f"{word}: (audio unavailable)"
            html_output += f"{suggestion_text}<br>"

    # Step 4: Vietnamese Phonetic Transcription
    phonetic_transcription = phonetic(reference_text)
    html_output += f"<br>Phonetic Transcription (Vietnamese): {phonetic_transcription}<br>"

    return [html_output]


# Step 4: Text-to-Speech Function
def text_to_speech(paragraph):
    """Synthesize *paragraph* to speech; return the MP3 path, or None for empty input."""
    if not paragraph:
        return None  # Nothing to read aloud.
    # lang='vi' to match the app's Vietnamese focus (gTTS defaults to English).
    tts = gTTS(paragraph, lang='vi')
    audio_file_path = "audio/paragraph.mp3"
    tts.save(audio_file_path)
    return audio_file_path  # Gradio plays the file at this path.


# Gradio Interface Function
def gradio_function(paragraph, audio):
    """Glue for the main tab: transcribe the recording, then compare to the paragraph."""
    transcribed_text = transcribe_audio(audio)
    # NOTE(review): transcription errors come back as plain strings and are
    # compared like real transcriptions; the report then simply scores low.
    comparison_result = compare_texts(paragraph, transcribed_text)
    return comparison_result


# Gradio Interface using the updated API
interface = gr.Interface(
    fn=gradio_function,
    inputs=[
        gr.Textbox(lines=5, label="Input Paragraph"),
        gr.Audio(type="filepath", label="Record Audio"),
    ],
    outputs=["html"],
    title="Speech Recognition Comparison with Phonetic Transcription",
    description="Input a paragraph, record your audio, and compare the transcription to the original text. Also, see phonetic transcription for Vietnamese.",
)

# Gradio Interface for Text-to-Speech
tts_interface = gr.Interface(
    fn=text_to_speech,
    inputs=gr.Textbox(lines=5, label="Input Paragraph to Read Aloud"),
    outputs=gr.Audio(label="Text-to-Speech Output"),
    title="Text-to-Speech",
    description="This tool will read your input paragraph aloud.",
)

# Combine both interfaces into one tabbed app.
demo = gr.TabbedInterface(
    [interface, tts_interface],
    ["Speech Recognition", "Text-to-Speech"],
)

# Launch Gradio app
demo.launch()