# shortGPT/audio/audio_utils.py
import os
import subprocess
import yt_dlp
from shortGPT.audio.audio_duration import get_asset_duration

CONST_CHARS_PER_SEC = 20.5  # Arrived at this value by running Whisper on a large number of shorts and averaging the characters per second of speech.

WHISPER_MODEL = None
def downloadYoutubeAudio(url, outputFile):
    ydl_opts = {
        "quiet": True,
        "no_warnings": True,
        "no_color": True,
        "no_call_home": True,
        "no_check_certificate": True,
        "format": "bestaudio/best",
        "outtmpl": outputFile,
    }
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            dictMeta = ydl.extract_info(url, download=True)
            if not os.path.exists(outputFile):
                raise Exception("Audio download failed")
            return outputFile, dictMeta['duration']
    except Exception as e:
        # Print the exception itself rather than e.args[0], which raises IndexError for arg-less exceptions.
        print(f"Failed downloading audio from {url}:", e)
        return None
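
# A minimal usage sketch (the URL and output path below are hypothetical).
# On success the function returns an (audio_path, duration) tuple; on failure, None:
#
#   result = downloadYoutubeAudio("https://www.youtube.com/watch?v=VIDEO_ID", "/tmp/narration.m4a")
#   if result is not None:
#       audio_path, duration_secs = result
#       print(f"Saved {duration_secs:.1f}s of audio to {audio_path}")
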
def speedUpAudio(tempAudioPath, outputFile, expected_duration=None):  # Speeding up the audio to keep it under 60 seconds, otherwise the output video is not considered a short.
    tempAudioPath, duration = get_asset_duration(tempAudioPath, False)
    # The -y flag overwrites any existing output file instead of blocking on an interactive prompt.
    if not expected_duration:
        if duration > 57:
            # Compress to 57 seconds to leave headroom under the 60-second limit.
            subprocess.run(['ffmpeg', '-y', '-i', tempAudioPath, '-af', f'atempo={(duration / 57):.5f}', outputFile])
        else:
            subprocess.run(['ffmpeg', '-y', '-i', tempAudioPath, outputFile])
    else:
        subprocess.run(['ffmpeg', '-y', '-i', tempAudioPath, '-af', f'atempo={(duration / expected_duration):.5f}', outputFile])
    if os.path.exists(outputFile):
        return outputFile
    return None
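
# A minimal usage sketch (file paths are hypothetical). Note that older ffmpeg
# builds reject atempo factors above 2.0, so inputs much longer than ~114s may
# need the filter chained (e.g. 'atempo=2.0,atempo=1.5') rather than a single factor:
#
#   sped_up = speedUpAudio("/tmp/narration.m4a", "/tmp/narration_57s.wav")
#   if sped_up:
#       print("Short-ready audio written to", sped_up)
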
def ChunkForAudio(alltext, chunk_size=2500):
    # Split the text into sentence-aligned chunks of at most chunk_size characters.
    alltext_list = alltext.split('.')
    chunks = []
    curr_chunk = ''
    for text in alltext_list:
        if len(curr_chunk) + len(text) <= chunk_size:
            curr_chunk += text + '.'
        else:
            # Guard against appending an empty first chunk when a single sentence exceeds chunk_size.
            if curr_chunk:
                chunks.append(curr_chunk)
            curr_chunk = text + '.'
    if curr_chunk:
        chunks.append(curr_chunk)
    return chunks
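
# A minimal usage sketch (the text is hypothetical). Each chunk ends on a
# sentence boundary and stays at or under chunk_size characters:
#
#   script = "First sentence. Second sentence. Third sentence."
#   for i, chunk in enumerate(ChunkForAudio(script, chunk_size=30)):
#       print(i, repr(chunk))
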
def audioToText(filename, model_size="base"):
    from whisper_timestamped import load_model, transcribe_timestamped
    global WHISPER_MODEL
    if WHISPER_MODEL is None:
        # Load the model once and cache it; reloading on every call is expensive.
        WHISPER_MODEL = load_model(model_size)
    gen = transcribe_timestamped(WHISPER_MODEL, filename, verbose=False, fp16=False)
    return gen
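
# A minimal usage sketch (path hypothetical). transcribe_timestamped returns a
# Whisper-style dict with the full 'text' plus per-segment timestamps:
#
#   transcription = audioToText("/tmp/narration_57s.wav", model_size="base")
#   print(transcription['text'])
#   first_segment = transcription['segments'][0]
#   print(first_segment['start'], first_segment['end'])
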
def getWordsPerSec(filename):
    a = audioToText(filename)
    # Word count divided by the end timestamp of the last transcribed segment.
    return len(a['text'].split()) / a['segments'][-1]['end']

def getCharactersPerSec(filename):
    a = audioToText(filename)
    # Character count divided by the end timestamp of the last transcribed segment.
    return len(a['text']) / a['segments'][-1]['end']
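
# A minimal sketch of how these helpers relate to CONST_CHARS_PER_SEC (the path
# and script text are hypothetical): the constant lets you estimate speech
# duration from script length before any audio exists, while the helpers measure
# the actual rate of a finished recording.
#
#   script_text = "Some narration script..."
#   estimated_duration = len(script_text) / CONST_CHARS_PER_SEC
#   measured_rate = getCharactersPerSec("/tmp/narration_57s.wav")
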
def run_background_audio_split(sound_file_path):
    try:
        # Separate the track into vocals + accompaniment with spleeter's 2stems model.
        # Passing the arguments as a list avoids shell-quoting issues with the file paths.
        output_dir = os.path.dirname(sound_file_path)
        command = ["spleeter", "separate", "-p", "spleeter:2stems", "-o", output_dir, sound_file_path]
        subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # check=True raises on a non-zero exit code, so reaching this point means spleeter succeeded.
        file_stem = os.path.splitext(os.path.basename(sound_file_path))[0]
        return os.path.join(output_dir, file_stem, "accompaniment.wav")
    except Exception:
        # If spleeter is missing or crashes, return None
        return None
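
# A minimal usage sketch (path hypothetical; requires the spleeter CLI on PATH).
# For "/tmp/music.mp3", the 2stems model writes "/tmp/music/vocals.wav" and
# "/tmp/music/accompaniment.wav", and the function returns the latter:
#
#   background = run_background_audio_split("/tmp/music.mp3")
#   if background:
#       print("Instrumental track at", background)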