TIMBOVILL's picture
Upload 3 files
80d8416 verified
"""Docstring"""
import os
import speech_recognition as sr
from pydub import AudioSegment
from pydub.silence import split_on_silence
from modules.console_colors import ULTRASINGER_HEAD
# todo: Code from here: https://www.thepythoncode.com/article/using-speech-recognition-to-convert-speech-to-text-python
def print_text(wav_file):
    """Transcribe a WAV file with ``recognize_google`` and print the text.

    The complete file is read into memory and recognized in one call; no
    recognition errors are handled here, so they propagate to the caller.
    """
    speech_recognizer = sr.Recognizer()
    with sr.AudioFile(wav_file) as audio_source:
        # Load the whole file into memory as one recording.
        recording = speech_recognizer.record(audio_source)
        # Speech -> text; no language is passed, so the recognizer's default
        # applies (the original author notes this is meant for English).
        print(speech_recognizer.recognize_google(recording))
def get_large_audio_transcription(wav_file):
    """Transcribe a large WAV file by splitting it on silence.

    The audio is cut into chunks at silent passages. Each chunk is exported
    to ``audio-chunks/chunk<N>.wav`` (relative to the current working
    directory) and passed to ``recognize_google`` on its own. The exported
    chunk files are left on disk.

    Args:
        wav_file: Path to the WAV file to transcribe.

    Returns:
        The recognized text of all chunks in order, each chunk capitalized
        and terminated with ". ". Chunks that could not be understood are
        reported on stdout and skipped; the result is an empty string when
        no chunk was recognized.
    """
    sound = AudioSegment.from_wav(wav_file)

    # Tune these values for the target audio.
    chunks = split_on_silence(
        sound,
        # a pause has to last at least 500 ms to count as a split point
        min_silence_len=500,
        # silence threshold is set 14 dB below the clip's own dBFS level
        silence_thresh=sound.dBFS - 14,
        # keep 500 ms of silence around each chunk
        keep_silence=500,
    )

    folder_name = "audio-chunks"
    # exist_ok avoids the check-then-create race of isdir() + mkdir().
    os.makedirs(folder_name, exist_ok=True)

    recognizer = sr.Recognizer()
    recognized_parts = []
    for i, audio_chunk in enumerate(chunks, start=1):
        # Export the chunk so it can be re-read through sr.AudioFile.
        chunk_filename = os.path.join(folder_name, f"chunk{i}.wav")
        # export() hands back the open output file; close it explicitly
        # instead of leaking the handle until garbage collection.
        audio_chunk.export(chunk_filename, format="wav").close()

        with sr.AudioFile(chunk_filename) as source:
            audio_listened = recognizer.record(source)

        try:
            text = recognizer.recognize_google(audio_listened)
        except sr.UnknownValueError as error:
            # Unintelligible chunk: report it and carry on with the rest.
            print("Error:", str(error))
        else:
            text = f"{text.capitalize()}. "
            print(chunk_filename, ":", text)
            recognized_parts.append(text)

    # Join once at the end instead of growing a string inside the loop.
    return "".join(recognized_parts)
def transcribe_audio(audio_file):
    """Transcribe an audio file and report where the speech starts and ends.

    Args:
        audio_file: Path to an audio file readable by ``sr.AudioFile``.

    Returns:
        A ``(transcript, start_time, end_time)`` tuple taken from the first
        alternative of the first result: its ``transcript``, the
        ``startTime`` of its first word and the ``endTime`` of its last
        word. ``None`` when the audio could not be understood, the response
        does not contain the expected fields, or the recognition service
        failed; a message is printed in each of those cases.
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_file) as source:
        audio = recognizer.record(source)

    try:
        response = recognizer.recognize_google(audio, show_all=True)
    except sr.UnknownValueError:
        print(f"{ULTRASINGER_HEAD} Could not understand audio")
        return None
    except sr.RequestError as error:
        print(f"{ULTRASINGER_HEAD} Error with recognizing service; {error}")
        return None

    # With show_all=True the recognizer hands back the raw response rather
    # than raising UnknownValueError, so an empty or differently shaped
    # response has to be detected here instead of crashing on the lookups.
    # NOTE(review): the "result"/"alternative"/"words" layout is what this
    # code has always assumed - confirm it against the installed
    # SpeechRecognition version.
    try:
        best = response["result"][0]["alternative"][0]
        words = best["words"]
        return best["transcript"], words[0]["startTime"], words[-1]["endTime"]
    except (KeyError, IndexError, TypeError):
        print(f"{ULTRASINGER_HEAD} Could not understand audio")
        return None
class SpeechToText:
    """Speech-to-text placeholder; no attributes or methods are visible here."""