# FOR EACH VOICE -> create .wav file per chapter & full audiobook.wav from assets/INCLUSION_IN_MUSEUMS_audiobook.docx
#
# Chapters
#
# ROOT_DIR/voice/voxstr_CHAPTER_0.wav
# ..
# ROOT_DIR/voice/voxstr_CHAPTER_10.wav
# ROOT_DIR/voice/voxstr_full_book.wav
#
# Full AudioBook
#
# ROOT_DIR/full_audiobook_all_voices.wav
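#
# Assumed prerequisites (inferred from the calls below): numpy, soundfile, opencv-python,
# moviepy and python-docx installed, the ffmpeg CLI on PATH, a tts.py script plus assets/
# and out/ in the working directory, and the .docx under ../shift/assets/.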
import cv2
import subprocess
import numpy as np
import soundfile
import docx # pip install python-docx
from pathlib import Path
from moviepy.editor import ImageClip  # only ImageClip is used from moviepy
FS = 24000
ROOT_DIR = './tts_audiobooks/voices/'
Path(ROOT_DIR).mkdir(parents=True,
exist_ok=True)
voices = [
    # 'en_US/hifi-tts_low#9017',
    'en_US/m-ailabs_low#mary_ann',
    'en_US/cmu-arctic_low#jmk',
    # 'en_US/cmu-arctic_low#eey',
    'en_UK/apope_low',
]  # select any voice from https://audeering.github.io/shift/
d = docx.Document('../shift/assets/INCLUSION_IN_MUSEUMS_audiobook.docx')  # slightly edited vs. the original .docx so it reads audibly, e.g. extra filler words ('by', 'them', 'from', 'this', 'of', ...) were added
last_paragraph_was_silence = False  # add silence only once, at the first empty paragraph of a run of empty paragraphs
chapter_counter = 0  # chapters are paragraphs starting with 'CHAPTER:', e.g. 'CHAPTER: ONCE UPON A TIME'
youtube_video_parts = []  # per-voice audiobook .mp4 files, concatenated into the YouTube video at the end
for vox in voices:
    # filename-safe voice string (used for the per-voice output dirs and file names)
    vox_str = vox.replace(
        '/', '_').replace(
        '#', '_').replace(
        'cmu-arctic', 'cmu_arctic').replace(
        '_low', '').replace('-', '')
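    # e.g. 'en_US/m-ailabs_low#mary_ann' -> 'en_US_mailabs_mary_ann'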
    # create dir for chapter_x.wav & audiobook.wav - for this voice vox
    Path(ROOT_DIR + vox_str + '/').mkdir(parents=True,
                                         exist_ok=True)
    print(vox)
    # for a new voice: restart chapter numbering and the lists of audio segments for the book
    total = []
    chapter = []
    chapter_counter = 0  # per-voice chapter numbering, so each voice gets chapter_0.wav, chapter_1.wav, ...
    for para in d.paragraphs[:41]:
        t = para.text
        # start new chapter
        if t.startswith('CHAPTER:'):
            # silence for end chapter
            chapter.append(np.zeros(int(.1 * FS),
                                    dtype=np.float32))
            # chapter.wav
            audio = np.concatenate(chapter)
            soundfile.write(
                ROOT_DIR + vox_str + f'/{vox_str}_chapter_{chapter_counter}.wav',
                audio,
                FS)  # 27400?
            # fill AUDIO of this chapter into total (for complete audiobook)
            total.append(audio)
            # new chapter
            chapter = []
            chapter_counter += 1
        # If paragraph is non-empty -> TTS
        if len(t) > 2 and t[0] != '{' and t[-1] != '}' and 'Figure' not in t:
            # place paragraph text into _tmp.txt for tts.py
            with open('_tmp.txt', 'w') as f:
                f.write(t.lower())  # WARNING! cast to lower, otherwise 'accessibiliTy' is pronounced 'accessibili..tay'
            print(t, '\n_____________________________\n')
            # TTS
            subprocess.run(
                [
                    "python",
                    "tts.py",
                    "--text",
                    "_tmp.txt",  # paragraph text (passed via file rather than as t) to synthesise and append to this voice's chapter
                    # "--affect",
                    # '--image', '_tmp_banner.png',
                    # '--scene', 'calm sounds of castle',
                    '--voice', vox,
                    '--out_file', '_tmp'  # tts.py saves out/_tmp.wav; load it and concat into the chapter
                ])
            audio, _fs = soundfile.read('out/_tmp.wav')
            print('CHAPTER\n\n\n\n____', audio.shape, '____\n')
            chapter.append(audio)
            # flag
            last_paragraph_was_silence = False
        # append silence if empty paragraph (e.g. end of Section)
        else:
            if not last_paragraph_was_silence:  # skip multiple empty paragraphs - silence is added only once
                chapter.append(np.zeros(int(.1 * FS),
                                        dtype=np.float32))
                last_paragraph_was_silence = True
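    # Flush the final chapter as well (assumption: paragraphs after the last 'CHAPTER:'
    # marker should also be written to a chapter .wav and included in the full audiobook).
    if chapter:
        audio = np.concatenate(chapter)
        soundfile.write(
            ROOT_DIR + vox_str + f'/{vox_str}_chapter_{chapter_counter}.wav',
            audio,
            FS)
        total.append(audio)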
    # save full .wav audiobook - for this voice
    soundfile.write(
        ROOT_DIR + vox_str + '/' + f'{vox_str}_full_audiobook.wav',
        np.concatenate(total),
        FS)  # 27400?
    # pic TTS voice
    voice_pic = np.zeros((768, 1024, 3), dtype=np.uint8)
    shift_logo = cv2.imread('assets/shift_banner.png')
    voice_pic[:100, :400, :] = shift_logo[:100, :400, :]
    # voice name
    # frame_tts = np.zeros((104, 1920, 3), dtype=np.uint8)
    font = cv2.FONT_HERSHEY_SIMPLEX
    bottomLeftCornerOfText = (0, 640)  # w, h
    fontScale = 2
    fontColor = (69, 74, 74)
    thickness = 4
    lineType = 2
    # voice
    cv2.putText(voice_pic, vox,  # 'en_US/m-ailabs_low#mary_ann',
                bottomLeftCornerOfText,
                font,
                fontScale,
                fontColor,
                thickness,
                lineType)
    # =
    cv2.putText(voice_pic, 'TTS voice =',
                (0, 500),
                font,
                fontScale,
                fontColor,
                thickness,
                lineType)
    STATIC_FRAME = '_tmp.png'
    cv2.imwrite(STATIC_FRAME, voice_pic)
    # MoviePy silent video
    SILENT_VIDEO = '_tmp.mp4'
    # SILENT CLIP
    clip_silent = ImageClip(STATIC_FRAME).set_duration(5)  # 5 s still frame; the TTS audio is muxed in by ffmpeg below
    clip_silent.write_videofile(SILENT_VIDEO, fps=24)
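    # NOTE: ffmpeg stream-copies this short video track next to the full-length audio,
    # so the per-voice .mp4 carries the whole audiobook but only ~5 s of video frames.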
    # fuse vox_full_audiobook.wav & SILENT_VIDEO into the per-voice .mp4,
    # to finally concatenate all voices into the YouTube video
    subprocess.call(
        ["ffmpeg",
         "-y",
         "-i",
         SILENT_VIDEO,
         "-i",
         ROOT_DIR + vox_str + '/' + f'{vox_str}_full_audiobook.wav',
         "-c:v",
         "copy",
         "-map",
         "0:v:0",
         "-map",
         "1:a:0",
         ROOT_DIR + vox_str + '/' + f'{vox_str}_full_audiobook.mp4',  # OUT_FILE
         ])
    youtube_video_parts.append(ROOT_DIR + vox_str + '/' + f'{vox_str}_full_audiobook.mp4')
# Final vid for YouTube
with open('_youtube_video_parts.txt', 'w') as f:
    # one "file 'path'" entry per line, as expected by the ffmpeg concat demuxer (see example below)
    f.write('\n'.join(f"file '{p}'" for p in youtube_video_parts))
# # list of audiobooks of single vox
# # --
# # $ cat mylist.txt
# # file '/path/to/file1'
# # file '/path/to/file2'
# # file '/path/to/file3'
youtube_video_file = 'audiobook_shift_youtube.mp4'
# ffmpeg -f concat -i video_parts.txt -c copy output.mp4
subprocess.call(
    ["ffmpeg",
     "-y",  # https://stackoverflow.com/questions/39788972/ffmpeg-overwrite-output-file-if-exists
     "-safe",
     "0",  # https://stackoverflow.com/questions/38996925/ffmpeg-concat-unsafe-file-name
     "-f",
     "concat",  # https://stackoverflow.com/questions/7333232/how-to-concatenate-two-mp4-files-using-ffmpeg
     "-i",
     '_youtube_video_parts.txt',
     "-c",
     "copy",
     youtube_video_file]
)
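
# Typical usage (assumption): run from the directory that holds tts.py, e.g.
#     python audiobook.py
# Outputs: per-chapter .wav files and a full audiobook .wav/.mp4 per voice under
# ./tts_audiobooks/voices/, plus the concatenated audiobook_shift_youtube.mp4.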