Spaces:
Sleeping
Sleeping
"""Docstring""" | |
import os | |
import speech_recognition as sr | |
from pydub import AudioSegment | |
from pydub.silence import split_on_silence | |
from modules.console_colors import ULTRASINGER_HEAD | |
# TODO: code adapted from https://www.thepythoncode.com/article/using-speech-recognition-to-convert-speech-to-text-python
def print_text(wav_file):
    """Transcribe a WAV file in one request and print the recognized text.

    The file is read fully into memory and passed to
    ``Recognizer.recognize_google``; nothing is returned. Exceptions raised
    by the recognizer (for example when the audio is unintelligible or the
    service cannot be reached) are not handled here and propagate to the
    caller.

    Args:
        wav_file: Audio file handed to ``sr.AudioFile``.
    """
    # No language is passed, so the recognizer's default is used
    # (the original author notes this is meant for English speech).
    engine = sr.Recognizer()

    with sr.AudioFile(wav_file) as audio_source:
        # Load the complete recording, then convert speech to text.
        recording = engine.record(audio_source)
        print(engine.recognize_google(recording))
def get_large_audio_transcription(wav_file):
    """Transcribe a long WAV file by splitting it on silence.

    The audio is cut into chunks wherever a long enough pause is found, each
    chunk is exported to ``audio-chunks/chunk<N>.wav`` (relative to the
    current working directory) and recognized on its own. Recognized chunks
    are capitalized, terminated with ``". "`` and concatenated in order.

    Chunks the recognizer cannot understand are reported on stdout and
    skipped. The exported chunk files are left on disk.

    Args:
        wav_file: WAV file to load with ``AudioSegment.from_wav``.

    Returns:
        The concatenated text of all recognized chunks; an empty string when
        no chunk was recognized.

    Raises:
        Only ``sr.UnknownValueError`` is handled; any other exception from
        the recognizer (e.g. ``sr.RequestError``) propagates.
    """
    sound = AudioSegment.from_wav(wav_file)

    chunks = split_on_silence(
        sound,
        # A pause must last at least 500 ms to split the audio;
        # experiment with this value for the target audio file.
        min_silence_len=500,
        # Silence threshold is set 14 dB below the clip's own dBFS level.
        silence_thresh=sound.dBFS - 14,
        # Keep up to 500 ms of silence with each chunk.
        keep_silence=500,
    )

    folder_name = "audio-chunks"
    # exist_ok avoids the check-then-create race of isdir() + mkdir().
    os.makedirs(folder_name, exist_ok=True)

    recognizer = sr.Recognizer()
    recognized_parts = []

    for i, audio_chunk in enumerate(chunks, start=1):
        # The recognizer reads from a file, so each chunk is written out first.
        chunk_filename = os.path.join(folder_name, f"chunk{i}.wav")
        audio_chunk.export(chunk_filename, format="wav")

        with sr.AudioFile(chunk_filename) as source:
            audio_listened = recognizer.record(source)

        try:
            text = recognizer.recognize_google(audio_listened)
        except sr.UnknownValueError as error:
            # Skip this chunk entirely so no stale or undefined text is added.
            print("Error:", str(error))
            continue

        text = f"{text.capitalize()}. "
        print(chunk_filename, ":", text)
        recognized_parts.append(text)

    return "".join(recognized_parts)
def transcribe_audio(audio_file):
    """Transcribe an audio file and report where the recognized speech spans.

    The file is loaded completely and sent to ``recognize_google`` with
    ``show_all=True`` so that the raw response can be inspected for the
    per-word timing fields.

    Args:
        audio_file: Audio file handed to ``sr.AudioFile``.

    Returns:
        A tuple ``(transcript, start_time, end_time)`` built from the first
        alternative of the first result: its ``"transcript"``, the
        ``"startTime"`` of its first word and the ``"endTime"`` of its last
        word. ``None`` when the audio could not be understood, the service
        request failed, or the response lacks the expected fields.
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_file) as source:
        audio = recognizer.record(source)

    # Only the service call can raise the recognizer errors; keep it isolated.
    try:
        response = recognizer.recognize_google(audio, show_all=True)
    except sr.UnknownValueError:
        print(f"{ULTRASINGER_HEAD} Could not understand audio")
        return None
    except sr.RequestError as error:
        print(f"{ULTRASINGER_HEAD} Error with recognizing service; {error}")
        return None

    # With show_all=True the raw response is returned instead of raising for
    # unintelligible audio, so its shape has to be validated here.
    # NOTE(review): the library appears to return an empty list when nothing
    # was recognized, and some versions return the first result entry itself
    # (a dict with "alternative") rather than a dict wrapping "result" -
    # confirm against the installed SpeechRecognition version.
    try:
        result = response["result"][0] if "result" in response else response
        best = result["alternative"][0]
        words = best["words"]
        return best["transcript"], words[0]["startTime"], words[-1]["endTime"]
    except (KeyError, IndexError, TypeError):
        # Empty response or missing transcript/word timings.
        print(f"{ULTRASINGER_HEAD} Could not understand audio")
        return None
class SpeechToText: | |
"""Docstring""" | |