Spaces:
Sleeping
Sleeping
Create voice_recognition.py
Browse files- voice_recognition.py +108 -0
voice_recognition.py
ADDED
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import speech_recognition as sr
|
3 |
+
from deep_translator import GoogleTranslator
|
4 |
+
import ffmpeg
|
5 |
+
import random
|
6 |
+
import string
|
7 |
+
|
8 |
+
# Constants
# Values handed to process_audio_recognition() by the script entry point
# at the bottom of this file.

# Audio recording to transcribe.
AUDIO_FILE_PATH = "audio/test-ph-3.m4a"

# Language code for the speech in the recording.
SOURCE_LANG = "fil-PH"

# Language code for the translated output.
TARGET_LANG = "en"
|
12 |
+
|
13 |
+
|
14 |
+
def convert_audio_to_wav(input_audio_path, output_wav_path):
    """
    Convert an audio file (such as M4A) to 16-bit PCM WAV at 44.1 kHz using ffmpeg.

    Args:
        input_audio_path: Path of the audio file to convert.
        output_wav_path: Path the WAV file is written to. An existing file at
            this path is overwritten.

    Returns:
        ``output_wav_path`` on success, or ``None`` if the conversion failed.
        Failures are printed, never raised.
    """
    try:
        stream = ffmpeg.input(input_audio_path)
        stream = stream.output(output_wav_path, acodec='pcm_s16le', ar=44100)
        # Capture ffmpeg's output so a failure can be reported with the real
        # diagnostic. overwrite_output makes ffmpeg run with -y, so it can
        # never wait on an "overwrite?" prompt that would be invisible now
        # that stderr is captured.
        stream.run(capture_stdout=True, capture_stderr=True, overwrite_output=True)
    except ffmpeg.Error as e:
        # str(e) is only a generic "see stderr" notice; the actual reason is
        # in the captured stderr bytes.
        detail = e.stderr.decode(errors="replace").strip() if e.stderr else e
        print(f"Error converting {input_audio_path} to WAV: {detail}")
        return None
    except Exception as e:
        # Anything that is not an ffmpeg failure, e.g. the ffmpeg executable
        # could not be started at all.
        print(f"Error converting {input_audio_path} to WAV: {e}")
        return None

    print(f"Audio successfully converted to WAV: {output_wav_path}")
    return output_wav_path
|
28 |
+
|
29 |
+
|
30 |
+
def recognize_speech_from_wav(model, wav_file_path, source_lang):
    """
    Transcribe a WAV file with the selected speech recognition backend.

    Args:
        model: Backend name, matched case-insensitively: "whisper" or "google".
        wav_file_path: Path of the WAV file to transcribe.
        source_lang: Language identifier, passed unchanged to the backend as
            its ``language`` argument.
            NOTE(review): the two backends may expect different identifier
            formats (e.g. "en" vs. "en-US") - confirm against callers.

    Returns:
        The recognized text, or ``None`` if the model name is unknown or
        recognition failed. The reason is printed, not raised.
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(wav_file_path) as source:
        try:
            # Deliberately no adjust_for_ambient_noise() here: on a file
            # source it reads (and discards) the beginning of the stream, so
            # the start of the recording would be missing from the
            # transcription. The threshold it tunes is only used by listen(),
            # not by record().
            audio_data = recognizer.record(source)

            backend = model.lower()
            if backend == "whisper":
                return recognizer.recognize_whisper(audio_data, language=source_lang)
            if backend == "google":
                return recognizer.recognize_google(audio_data, language=source_lang)

            print(f"Invalid model name: {model}")
            return None
        except sr.UnknownValueError:
            print("Could not understand the audio.")
            return None
        except sr.RequestError as e:
            print(f"Could not request results from the service; {e}")
            return None
        except Exception as e:
            print(f"Could not request results from the service; {e}")
            return None
|
57 |
+
|
58 |
+
|
59 |
+
def translate_text(text, target_lang):
    """
    Translate ``text`` into ``target_lang`` using GoogleTranslator.

    The translator is created with ``source='auto'``.

    Returns:
        The translated text, or ``None`` if the translation raised an
        exception. The error is printed, not propagated.
    """
    try:
        translator = GoogleTranslator(source='auto', target=target_lang)
        translation = translator.translate(text)
    except Exception as e:
        print(f"Error translating text: {e}")
        return None
    return translation
|
68 |
+
|
69 |
+
|
70 |
+
def process_audio_recognition(model="whisper", audio_path=None, source_lang="en", target_lang="en", translate=False):
    """
    Transcribe an audio file and optionally translate the transcription.

    Files whose path does not end in ".wav" are first converted to a
    temporary WAV file in the current working directory; that temporary file
    is removed again before returning.

    Args:
        model: Recognition backend name, forwarded to recognize_speech_from_wav.
        audio_path: Path of the audio file to process.
        source_lang: Language identifier of the speech, forwarded to the
            recognition backend.
        target_lang: Language to translate into when ``translate`` is true.
        translate: If true, translate the recognized text into ``target_lang``.

    Returns:
        The translated text when translation was requested and succeeded,
        otherwise the recognized text; ``None`` if no audio path was given,
        the conversion failed, or nothing was recognized.
    """
    temp_wav = None
    wav_file = audio_path
    if wav_file and not wav_file.endswith(".wav"):
        # A random prefix makes name clashes between conversions unlikely.
        prefix = ''.join(random.choices(string.ascii_uppercase + string.ascii_lowercase, k=5))
        temp_wav = prefix + "converted_audio.wav"
        wav_file = convert_audio_to_wav(audio_path, temp_wav)

    try:
        if not wav_file:
            print(f"Failed to process the audio file: {audio_path}")
            return None

        text = recognize_speech_from_wav(model, wav_file, source_lang)
        if text and translate:
            translated_text = translate_text(text, target_lang)
            # Keep the untranslated transcription if translation failed.
            if translated_text:
                text = translated_text
        return text
    finally:
        # Remove only the file created here, never the caller's own audio.
        # The existence check also covers a failed conversion that left a
        # partial file behind.
        if temp_wav and os.path.exists(temp_wav):
            try:
                os.remove(temp_wav)
            except OSError as e:
                print(f"Error removing temporary WAV file {temp_wav}: {e}")
|
105 |
+
|
106 |
+
|
107 |
+
if __name__ == '__main__':
    # Arguments are passed by keyword: the first positional parameter of
    # process_audio_recognition is `model`, so positional passing would bind
    # the audio path to `model` and the language codes to the wrong
    # parameters.
    # SOURCE_LANG is a locale-style code, so the "google" backend is selected
    # explicitly. NOTE(review): confirm the intended backend for this demo.
    result = process_audio_recognition(
        model="google",
        audio_path=AUDIO_FILE_PATH,
        source_lang=SOURCE_LANG,
        target_lang=TARGET_LANG,
        translate=True,
    )
    # The function only returns the text, so show it when run as a script.
    if result is not None:
        print(result)
|