Spaces:
Sleeping
Sleeping
import os | |
import speech_recognition as sr | |
from deep_translator import GoogleTranslator | |
import ffmpeg | |
import random | |
import string | |
# Constants
# Values the ``__main__`` block passes to process_audio_recognition().
# Path of the sample recording (M4A, so it is not already a ".wav" file).
AUDIO_FILE_PATH = "audio/test-ph-3.m4a"
# Language tag for the speech in the recording.
SOURCE_LANG = "fil-PH"
# Language code for the translation target.
TARGET_LANG = "en"
def convert_audio_to_wav(input_audio_path, output_wav_path):
    """
    Convert an audio file (like M4A) to 16-bit PCM WAV at 44.1 kHz using ffmpeg.

    Args:
        input_audio_path: Path of the audio file to convert.
        output_wav_path: Path the WAV file is written to. An existing file at
            this path is overwritten.

    Returns:
        ``output_wav_path`` on success, or ``None`` if the conversion failed
        (the error is printed).
    """
    try:
        stream = ffmpeg.input(input_audio_path).output(
            output_wav_path, acodec='pcm_s16le', ar=44100
        )
        # overwrite_output=True makes ffmpeg replace an existing output file
        # instead of asking for confirmation, which would stall or abort a
        # non-interactive run.
        stream.run(overwrite_output=True)
    except Exception as e:
        # ffmpeg.Error (ffmpeg exited with a failure) and any other failure
        # are reported identically, so a single handler covers both.
        print(f"Error converting {input_audio_path} to WAV: {e}")
        return None
    print(f"Audio successfully converted to WAV: {output_wav_path}")
    return output_wav_path
def recognize_speech_from_wav(model, wav_file_path, source_lang):
    """
    Recognize speech from a WAV file using the selected recognition backend.

    Args:
        model: Backend name, "whisper" or "google" (compared case-insensitively).
        wav_file_path: Path of the audio file to transcribe.
        source_lang: Language identifier, passed unchanged to the backend as
            its ``language`` argument.

    Returns:
        The recognized text, or ``None`` when the model name is invalid or
        recognition fails (a message is printed in both cases).

    Errors raised while opening ``wav_file_path`` are not caught here and
    propagate to the caller.
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(wav_file_path) as source:
        try:
            # Deliberately no recognizer.adjust_for_ambient_noise(source):
            # on a file source it reads (and discards) the start of the
            # recording, so the beginning of the speech would be missing
            # from the transcription.
            audio_data = recognizer.record(source)
            backend = model.lower()
            if backend == "whisper":
                # NOTE(review): Whisper may expect its own language
                # names/codes rather than tags like "fil-PH" - confirm.
                return recognizer.recognize_whisper(audio_data, language=source_lang)
            if backend == "google":
                return recognizer.recognize_google(audio_data, language=source_lang)
            print(f"Invalid model name: {model}")
            return None
        except sr.UnknownValueError:
            print("Could not understand the audio.")
            return None
        except Exception as e:
            # sr.RequestError and any other failure share the same report.
            print(f"Could not request results from the service; {e}")
            return None
def translate_text(text, target_lang):
    """
    Translate ``text`` into ``target_lang`` with Google Translator.

    The source language is auto-detected. Returns the translated text, or
    ``None`` if the translation raised an error (the error is printed).
    """
    try:
        translator = GoogleTranslator(source='auto', target=target_lang)
        translated = translator.translate(text)
    except Exception as err:
        print(f"Error translating text: {err}")
        return None
    else:
        return translated
def process_audio_recognition(model="whisper", audio_path=None, source_lang="en", target_lang="en", translate=False):
    """
    Main function to handle audio recognition and optional translation.

    Converts the audio to WAV when the path does not end in ".wav",
    recognizes the speech, and optionally translates the result.

    Args:
        model: Recognition backend name, forwarded to recognize_speech_from_wav().
        audio_path: Path of the audio file to process.
        source_lang: Language identifier forwarded to the recognizer.
        target_lang: Language the text is translated into when ``translate`` is true.
        translate: Whether to translate the recognized text.

    Returns:
        The translated text when translation was requested and succeeded,
        otherwise whatever the recognizer returned. ``None`` if no usable
        audio file was available.
    """
    wav_file = audio_path
    temp_wav = None  # set only when this call creates a converted file
    if wav_file and not wav_file.endswith(".wav"):
        random_prefix = ''.join(random.choices(string.ascii_uppercase + string.ascii_lowercase, k=5))
        temp_wav = random_prefix + "converted_audio.wav"
        wav_file = convert_audio_to_wav(audio_path, temp_wav)

    try:
        if not wav_file:
            print(f"Failed to process the audio file: {audio_path}")
            return None
        text = recognize_speech_from_wav(model, wav_file, source_lang)
    finally:
        # Remove only the file created above, never a caller-supplied WAV.
        # Runs on every exit path so converted files do not pile up.
        if temp_wav is not None:
            try:
                os.remove(temp_wav)
            except FileNotFoundError:
                pass  # conversion failed before any output was written
            except OSError as e:
                print(f"Error removing temporary WAV file {temp_wav}: {e}")

    if text and translate:
        translated_text = translate_text(text, target_lang)
        if translated_text:
            # Keep the untranslated text when translation fails.
            text = translated_text
    return text
if __name__ == '__main__':
    # Pass everything by keyword: the first positional parameter of
    # process_audio_recognition() is ``model``, so positional arguments
    # would put the audio path in ``model`` and shift the rest by one.
    result = process_audio_recognition(
        audio_path=AUDIO_FILE_PATH,
        source_lang=SOURCE_LANG,
        target_lang=TARGET_LANG,
        translate=True,
    )
    if result is not None:
        print(result)