import os
import random
import string

import ffmpeg
import speech_recognition as sr
from deep_translator import GoogleTranslator

# Constants
AUDIO_FILE_PATH = "audio/test-ph-3.m4a"
SOURCE_LANG = "fil-PH"
TARGET_LANG = "en"

# Whisper uses its own language codes rather than BCP-47 tags; Filipino is
# handled under the Tagalog code "tl".
_WHISPER_LANGUAGE_ALIASES = {"fil": "tl"}


def convert_audio_to_wav(input_audio_path, output_wav_path):
    """
    Convert any audio format (like M4A) to 16-bit PCM WAV at 44.1 kHz using
    ffmpeg.

    Returns ``output_wav_path`` on success, or ``None`` (after printing the
    error) if the conversion fails for any reason.
    """
    try:
        (
            ffmpeg.input(input_audio_path)
            .output(output_wav_path, acodec="pcm_s16le", ar=44100)
            .run()
        )
    except Exception as e:  # ffmpeg.Error is covered here as well
        print(f"Error converting {input_audio_path} to WAV: {e}")
        return None

    print(f"Audio successfully converted to WAV: {output_wav_path}")
    return output_wav_path


def _whisper_language(lang_tag):
    """
    Reduce a locale tag such as "fil-PH" or "en_US" to the language
    identifier Whisper expects ("tl", "en").

    Plain language names ("english") pass through unchanged apart from
    lower-casing. A falsy tag returns ``None``, which lets Whisper detect
    the language itself.
    """
    if not lang_tag:
        return None
    primary = lang_tag.replace("_", "-").split("-", 1)[0].lower()
    return _WHISPER_LANGUAGE_ALIASES.get(primary, primary)


def recognize_speech_from_wav(model, wav_file_path, source_lang):
    """
    Recognize speech from a WAV file.

    ``model`` selects the backend (case-insensitive): "whisper" or "google".
    ``source_lang`` is the spoken language; for Whisper it is normalised
    from a locale tag to a Whisper language code.

    Returns the recognized text, or ``None`` (after printing the reason) if
    the model name is invalid, the file cannot be read, or recognition fails.
    """
    # Validate the backend first so a bad name never costs a file read.
    engine = model.lower() if isinstance(model, str) else ""
    if engine not in ("whisper", "google"):
        print(f"Invalid model name: {model}")
        return None

    recognizer = sr.Recognizer()
    try:
        # Opening the file is inside the try so an unreadable file is
        # reported like every other failure instead of raising.
        with sr.AudioFile(wav_file_path) as source:
            # adjust_for_ambient_noise() is deliberately not called: it
            # consumes the first second of the file stream (dropping that
            # speech from the transcript) and only tunes the threshold used
            # by listen(), which record() does not use.
            audio_data = recognizer.record(source)

        if engine == "whisper":
            return recognizer.recognize_whisper(
                audio_data, language=_whisper_language(source_lang)
            )
        return recognizer.recognize_google(audio_data, language=source_lang)
    except sr.UnknownValueError:
        print("Could not understand the audio.")
        return None
    except sr.RequestError as e:
        print(f"Could not request results from the service; {e}")
        return None
    except Exception as e:
        print(f"Speech recognition failed: {e}")
        return None


def translate_text(text, target_lang):
    """
    Translate the recognized text into the target language using Google
    Translator (source language is auto-detected).

    Returns the translated text, or ``None`` (after printing the error) if
    translation fails.
    """
    try:
        return GoogleTranslator(source="auto", target=target_lang).translate(text)
    except Exception as e:
        print(f"Error translating text: {e}")
        return None


def process_audio_recognition(model="whisper", audio_path=None, source_lang="en", target_lang="en", translate=False):
    """
    Main function to handle audio recognition and optional translation.

    Converts the audio to a temporary WAV file when ``audio_path`` is not
    already a ``.wav`` file, recognizes speech with the chosen ``model``,
    and, when ``translate`` is true, translates the result into
    ``target_lang``. If translation fails, the untranslated text is returned.

    Returns the resulting text, or ``None`` if the audio could not be
    converted or recognized.
    """
    if not audio_path:
        print(f"Failed to process the audio file: {audio_path}")
        return None

    temp_wav = None  # set only when this function creates the WAV file
    wav_file = audio_path
    if not str(audio_path).lower().endswith(".wav"):
        # Random prefix keeps concurrent runs from clobbering each other.
        prefix = "".join(random.choices(string.ascii_letters, k=5))
        temp_wav = prefix + "converted_audio.wav"
        wav_file = convert_audio_to_wav(audio_path, temp_wav)

    try:
        if not wav_file:
            print(f"Failed to process the audio file: {audio_path}")
            return None

        text = recognize_speech_from_wav(model, wav_file, source_lang)
        if text and translate:
            translated_text = translate_text(text, target_lang)
            if translated_text:
                text = translated_text
        return text
    finally:
        # Remove only the file we created, never the caller's own WAV.
        # A failed conversion may still leave a partial file behind.
        if temp_wav and os.path.exists(temp_wav):
            try:
                os.remove(temp_wav)
            except OSError as e:
                print(f"Error removing temporary WAV file {temp_wav}: {e}")


if __name__ == '__main__':
    # Keyword arguments: the first positional parameter is `model`, not the
    # audio path.
    result = process_audio_recognition(
        audio_path=AUDIO_FILE_PATH,
        source_lang=SOURCE_LANG,
        target_lang=TARGET_LANG,
        translate=True,
    )
    if result is not None:
        print(result)