import os

import numpy as np
import whisper
from deep_translator import GoogleTranslator, detection
from pydub import AudioSegment

# Language tables from deep_translator, e.g. {"english": "en"}.
# formatted_languages: Title-cased name -> ISO code ("English" -> "en")
# formatted_codes:     ISO code -> Title-cased name ("en" -> "English")
available_languages = GoogleTranslator().get_supported_languages(as_dict=True)
formatted_languages = {key.title(): value for key, value in available_languages.items()}
formatted_codes = {value: key.title() for key, value in available_languages.items()}

# API key for the external language-detection service used by
# deep_translator's `detection` helper. May be None when the environment
# variable is not set; detection is then skipped (see below).
lang_detect_key = os.getenv("detect_language_api_key")

# Cache of loaded Whisper models keyed by model size, so repeated calls to
# src_audio_to_eng_translator do not reload the (large) weights every time.
_whisper_models = {}


def _get_whisper_model(model_size):
    """Return a cached Whisper model for *model_size*, loading it on first use."""
    if model_size not in _whisper_models:
        _whisper_models[model_size] = whisper.load_model(model_size)
    return _whisper_models[model_size]


def audio_to_numpy(audio_file_input):
    """Decode an audio file into a mono 16 kHz float32 array in [-1.0, 1.0].

    Whisper expects mono 16 kHz float PCM, so the audio is down-mixed to one
    channel, resampled, and normalized by the sample type's integer maximum.

    Args:
        audio_file_input: Path (or file-like object) accepted by
            ``pydub.AudioSegment.from_file``.

    Returns:
        numpy.ndarray: 1-D float32 samples scaled to [-1.0, 1.0].
    """
    audio = AudioSegment.from_file(audio_file_input)
    audio = audio.set_channels(1).set_frame_rate(16000)
    samples = np.array(audio.get_array_of_samples(), dtype=np.float32)
    # audio.array_type is a stdlib array typecode (e.g. 'h' for int16), which
    # numpy.iinfo accepts directly to get the integer range for normalization.
    return samples / np.iinfo(audio.array_type).max


def src_audio_to_eng_translator(audio_file_input, model_size="turbo", target_lang="English"):
    """Transcribe an audio file with Whisper and translate the transcript.

    Args:
        audio_file_input: Audio file path/handle accepted by pydub.
        model_size: Whisper model size/name passed to ``whisper.load_model``
            (cached across calls).
        target_lang: Title-cased target language name (e.g. "English");
            unknown names fall back to English.

    Returns:
        tuple[str, str, str]: (transcribed text, translated text,
        detected source-language name or ``'Source language not detected'``).
    """
    audio_data = audio_to_numpy(audio_file_input)
    model = _get_whisper_model(model_size)
    result = model.transcribe(audio_data)
    input_text = result["text"]

    # Source-language detection is best-effort: it requires an external API
    # key and a network call, and a failure there must not prevent the
    # transcription/translation result from being returned.
    src_lang = 'Source language not detected'
    if lang_detect_key:
        try:
            src_lang_code = detection.single_detection(input_text, api_key=lang_detect_key)
            src_lang = formatted_codes.get(src_lang_code, 'Source language not detected')
        except Exception:
            # Keep the fallback label; detection is optional metadata.
            pass

    target_lang_code = formatted_languages.get(target_lang, 'en')
    # Let the translator auto-detect the source; target uses the resolved code.
    translated_text = GoogleTranslator(source='auto', target=target_lang_code).translate(input_text)
    return input_text, translated_text, src_lang