Spaces:

mboushaba
/

audio-transcriber

Sleeping

App Files Files Community

audio-transcriber / voice_recognition.py

mboushaba

Create voice_recognition.py

4c18362 verified 9 months ago

raw

history blame contribute delete

3.95 kB

	import os
	import speech_recognition as sr
	from deep_translator import GoogleTranslator
	import ffmpeg
	import random
	import string

	# Constants
	# These three values are only used by the `__main__` block at the bottom of
	# this file, which hands them to process_audio_recognition().
	AUDIO_FILE_PATH = "audio/test-ph-3.m4a"  # presumably a sample recording shipped next to the script — confirm
	SOURCE_LANG = "fil-PH"  # presumably the language spoken in the recording (language-region tag) — confirm
	TARGET_LANG = "en"  # presumably the language the transcript is translated into — confirm


	def convert_audio_to_wav(input_audio_path, output_wav_path):
	    """
	    Transcode an audio file (for example M4A) to WAV with ffmpeg.

	    The result is written to ``output_wav_path`` using the ``pcm_s16le``
	    audio codec at a 44100 Hz sample rate.

	    Returns ``output_wav_path`` when the conversion succeeds. If anything
	    raises, the error is printed and ``None`` is returned instead.
	    """
	    try:
	        source_stream = ffmpeg.input(input_audio_path)
	        wav_stream = source_stream.output(output_wav_path, acodec='pcm_s16le', ar=44100)
	        wav_stream.run()
	        print(f"Audio successfully converted to WAV: {output_wav_path}")
	        return output_wav_path
	    except Exception as exc:
	        # ffmpeg.Error is an Exception subclass and was reported with the
	        # same message, so a single handler covers both cases.
	        print(f"Error converting {input_audio_path} to WAV: {exc}")
	        return None


	def recognize_speech_from_wav(model, wav_file_path, source_lang):
	    """
	    Transcribe a WAV file with the selected speech recognition backend.

	    Args:
	        model: Backend name, matched case-insensitively: "whisper" or "google".
	        wav_file_path: Path of the audio file opened with ``sr.AudioFile``.
	        source_lang: Language identifier forwarded unchanged as the backend's
	            ``language`` argument.

	    Returns:
	        The recognized text, or ``None`` when the model name is invalid, the
	        file cannot be opened, or recognition fails. Failures are printed,
	        never raised.
	    """
	    # Validate the backend first so a bad name fails before any file I/O.
	    engine = model.lower() if isinstance(model, str) else None
	    if engine not in ("whisper", "google"):
	        print(f"Invalid model name: {model}")
	        return None

	    recognizer = sr.Recognizer()
	    try:
	        # Opening the file sits inside the try so a missing or unreadable
	        # file yields None like every other failure instead of raising.
	        with sr.AudioFile(wav_file_path) as source:
	            # Deliberately no adjust_for_ambient_noise() here: on a file
	            # source it consumes the start of the stream for calibration,
	            # so record() would miss the opening of the recording.
	            audio_data = recognizer.record(source)

	        # NOTE(review): the two backends may expect different language
	        # identifier formats (e.g. "en" vs "en-US") — confirm that callers
	        # pass a value the chosen backend accepts.
	        if engine == "whisper":
	            return recognizer.recognize_whisper(audio_data, language=source_lang)
	        return recognizer.recognize_google(audio_data, language=source_lang)
	    except sr.UnknownValueError:
	        print("Could not understand the audio.")
	    except sr.RequestError as e:
	        print(f"Could not request results from the service; {e}")
	    except Exception as e:
	        # Anything else (bad file, unsupported language, ...) is not a
	        # service request failure, so report it as a generic error.
	        print(f"Error recognizing speech from {wav_file_path}: {e}")
	    return None


	def translate_text(text, target_lang):
	    """
	    Translate ``text`` into ``target_lang`` with Google Translator.

	    The source language is auto-detected. Returns the translated text, or
	    ``None`` (after printing the error) if the translation raises.
	    """
	    try:
	        translator = GoogleTranslator(source='auto', target=target_lang)
	        translated = translator.translate(text)
	    except Exception as exc:
	        print(f"Error translating text: {exc}")
	        translated = None
	    return translated


	def process_audio_recognition(model="whisper", audio_path=None, source_lang="en", target_lang="en", translate=False):
	    """
	    Recognize speech in an audio file and optionally translate the result.

	    Input whose path does not end in ".wav" is first converted to a
	    temporary WAV file in the current working directory; that file is
	    removed again before this function returns.

	    Args:
	        model: Recognition backend name passed to recognize_speech_from_wav().
	        audio_path: Path of the audio file to process.
	        source_lang: Language identifier passed to the recognizer.
	        target_lang: Language to translate into when ``translate`` is true.
	        translate: Whether to translate the recognized text.

	    Returns:
	        The translated text when translation was requested and succeeded,
	        otherwise the recognized text. ``None`` when no audio path was
	        given, conversion failed, or recognition produced nothing.
	    """
	    temp_wav = None  # set only when this call creates a converted file
	    wav_file = audio_path
	    if wav_file and not wav_file.endswith(".wav"):
	        # Random prefix keeps concurrent calls from writing the same file.
	        temp_wav = ''.join(random.choices(string.ascii_letters, k=5)) + "converted_audio.wav"
	        wav_file = convert_audio_to_wav(audio_path, temp_wav)

	    try:
	        if not wav_file:
	            print(f"Failed to process the audio file: {audio_path}")
	            return None

	        text = recognize_speech_from_wav(model, wav_file, source_lang)
	        if text and translate:
	            # Keep the untranslated transcript when translation fails.
	            text = translate_text(text, target_lang) or text
	        return text
	    finally:
	        # Remove only the file created above, never the caller's own input.
	        # A failed conversion may leave nothing, or a partial file, behind.
	        if temp_wav is not None:
	            try:
	                os.remove(temp_wav)
	            except FileNotFoundError:
	                pass  # conversion never produced the file; nothing to clean
	            except OSError as e:
	                print(f"Error removing temporary WAV file {temp_wav}: {e}")


	if __name__ == '__main__':
	    # Pass everything by keyword: the first positional parameter of
	    # process_audio_recognition() is `model`, so positional arguments would
	    # put the audio path in the model slot and shift the language arguments.
	    # NOTE(review): the default model is used here — confirm that it accepts
	    # SOURCE_LANG as a language value.
	    process_audio_recognition(
	        audio_path=AUDIO_FILE_PATH,
	        source_lang=SOURCE_LANG,
	        target_lang=TARGET_LANG,
	        translate=True,
	    )