# FOR EACH VOICE -> create .wav file per chapter & full audiobook.wav from assets/INCLUSION_IN_MUSEUMS_audiobook.docx
#
# Chapters
#
# ROOT_DIR/voice/voxstr_CHAPTER_0.wav
# ..
# ROOT_DIR/voice/voxstr_CHAPTER_10.wav
# ROOT_DIR/voice/voxstr_full_book.wav
#
# Full AudioBook
#
# ROOT_DIR/full_audiobook_all_voices.wav
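#
# Assumed prerequisites (inferred from the calls below): numpy, soundfile, opencv-python,
# moviepy and python-docx installed, the ffmpeg CLI on PATH, a tts.py script plus assets/
# and out/ in the working directory, and the .docx under ../shift/assets/.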
import cv2
import subprocess
import numpy as np
import soundfile
import docx # pip install python-docx
from pathlib import Path
from moviepy.editor import ImageClip  # only ImageClip is used from moviepy
FS = 24000
ROOT_DIR = './tts_audiobooks/voices/'
Path(ROOT_DIR).mkdir(parents=True,
exist_ok=True)
voices = [
    # 'en_US/hifi-tts_low#9017',
    'en_US/m-ailabs_low#mary_ann',
    'en_US/cmu-arctic_low#jmk',
    # 'en_US/cmu-arctic_low#eey',
    'en_UK/apope_low',
]  # select any voice from https://audeering.github.io/shift/
d = docx.Document('../shift/assets/INCLUSION_IN_MUSEUMS_audiobook.docx')  # slightly edited vs. the original .docx so it reads audibly, e.g. extra filler words ('by', 'them', 'from', 'this', 'of', ...) were added
last_paragraph_was_silence = False  # add silence only once, at the first empty paragraph of a run of empty paragraphs
chapter_counter = 0  # chapters are paragraphs starting with 'CHAPTER:', e.g. 'CHAPTER: ONCE UPON A TIME'
youtube_video_parts = []  # per-voice audiobook .mp4 files, concatenated into the YouTube video at the end
for vox in voices:
    # filename-safe voice string (used for the per-voice output dirs and file names)
    vox_str = vox.replace(
        '/', '_').replace(
        '#', '_').replace(
        'cmu-arctic', 'cmu_arctic').replace(
        '_low', '').replace('-', '')
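    # e.g. 'en_US/m-ailabs_low#mary_ann' -> 'en_US_mailabs_mary_ann'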
    # create dir for chapter_x.wav & audiobook.wav - for this voice vox
    Path(ROOT_DIR + vox_str + '/').mkdir(parents=True,
                                         exist_ok=True)
    print(vox)
    # for a new voice: restart chapter numbering and the lists of audio segments for the book
    total = []
    chapter = []
    chapter_counter = 0  # per-voice chapter numbering, so each voice gets chapter_0.wav, chapter_1.wav, ...
    for para in d.paragraphs[:41]:
        t = para.text
        # start new chapter
        if t.startswith('CHAPTER:'):
            # silence for end chapter
            chapter.append(np.zeros(int(.1 * FS),
                                    dtype=np.float32))
            # chapter.wav
            audio = np.concatenate(chapter)
            soundfile.write(
                ROOT_DIR + vox_str + f'/{vox_str}_chapter_{chapter_counter}.wav',
                audio,
                FS)  # 27400?
            # fill AUDIO of this chapter into total (for complete audiobook)
            total.append(audio)
            # new chapter
            chapter = []
            chapter_counter += 1
        # If paragraph is non-empty -> TTS
        if len(t) > 2 and t[0] != '{' and t[-1] != '}' and 'Figure' not in t:
            # place paragraph text into _tmp.txt for tts.py
            with open('_tmp.txt', 'w') as f:
                f.write(t.lower())  # WARNING! cast to lower, otherwise 'accessibiliTy' is pronounced 'accessibili..tay'
            print(t, '\n_____________________________\n')
            # TTS
            subprocess.run(
                [
                    "python",
                    "tts.py",
                    "--text",
                    "_tmp.txt",  # paragraph text (passed via file rather than as t) to synthesise and append to this voice's chapter
                    # "--affect",
                    # '--image', '_tmp_banner.png',
                    # '--scene', 'calm sounds of castle',
                    '--voice', vox,
                    '--out_file', '_tmp'  # tts.py saves out/_tmp.wav; load it and concat into the chapter
                ])
            audio, _fs = soundfile.read('out/_tmp.wav')
            print('CHAPTER\n\n\n\n____', audio.shape, '____\n')
            chapter.append(audio)
            # flag
            last_paragraph_was_silence = False
        # append silence if empty paragraph (e.g. end of Section)
        else:
            if not last_paragraph_was_silence:  # skip multiple empty paragraphs - silence is added only once
                chapter.append(np.zeros(int(.1 * FS),
                                        dtype=np.float32))
                last_paragraph_was_silence = True
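    # Flush the final chapter as well (assumption: paragraphs after the last 'CHAPTER:'
    # marker should also be written to a chapter .wav and included in the full audiobook).
    if chapter:
        audio = np.concatenate(chapter)
        soundfile.write(
            ROOT_DIR + vox_str + f'/{vox_str}_chapter_{chapter_counter}.wav',
            audio,
            FS)
        total.append(audio)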
    # save full .wav audiobook - for this voice
    soundfile.write(
        ROOT_DIR + vox_str + '/' + f'{vox_str}_full_audiobook.wav',
        np.concatenate(total),
        FS)  # 27400?
    # pic TTS voice
    voice_pic = np.zeros((768, 1024, 3), dtype=np.uint8)
    shift_logo = cv2.imread('assets/shift_banner.png')
    voice_pic[:100, :400, :] = shift_logo[:100, :400, :]
    # voice name
    # frame_tts = np.zeros((104, 1920, 3), dtype=np.uint8)
    font = cv2.FONT_HERSHEY_SIMPLEX
    bottomLeftCornerOfText = (0, 640)  # w, h
    fontScale = 2
    fontColor = (69, 74, 74)
    thickness = 4
    lineType = 2
    # voice
    cv2.putText(voice_pic, vox,  # 'en_US/m-ailabs_low#mary_ann',
                bottomLeftCornerOfText,
                font,
                fontScale,
                fontColor,
                thickness,
                lineType)
    # =
    cv2.putText(voice_pic, 'TTS voice =',
                (0, 500),
                font,
                fontScale,
                fontColor,
                thickness,
                lineType)
    STATIC_FRAME = '_tmp.png'
    cv2.imwrite(STATIC_FRAME, voice_pic)
    # MoviePy silent video
    SILENT_VIDEO = '_tmp.mp4'
    # SILENT CLIP
    clip_silent = ImageClip(STATIC_FRAME).set_duration(5)  # 5 s still frame; the TTS audio is muxed in by ffmpeg below
    clip_silent.write_videofile(SILENT_VIDEO, fps=24)
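    # NOTE: ffmpeg stream-copies this short video track next to the full-length audio,
    # so the per-voice .mp4 carries the whole audiobook but only ~5 s of video frames.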
    # fuse vox_full_audiobook.wav & SILENT_VIDEO into the per-voice .mp4,
    # to finally concatenate all voices into the YouTube video
    subprocess.call(
        ["ffmpeg",
         "-y",
         "-i",
         SILENT_VIDEO,
         "-i",
         ROOT_DIR + vox_str + '/' + f'{vox_str}_full_audiobook.wav',
         "-c:v",
         "copy",
         "-map",
         "0:v:0",
         "-map",
         "1:a:0",
         ROOT_DIR + vox_str + '/' + f'{vox_str}_full_audiobook.mp4',  # OUT_FILE
         ])
    youtube_video_parts.append(ROOT_DIR + vox_str + '/' + f'{vox_str}_full_audiobook.mp4')
# Final vid for YouTube
with open('_youtube_video_parts.txt', 'w') as f:
    # one "file 'path'" entry per line, as expected by the ffmpeg concat demuxer (see example below)
    f.write('\n'.join(f"file '{p}'" for p in youtube_video_parts))
# # list of audiobooks of single vox
# # --
# # $ cat mylist.txt
# # file '/path/to/file1'
# # file '/path/to/file2'
# # file '/path/to/file3'
youtube_video_file = 'audiobook_shift_youtube.mp4'
# ffmpeg -f concat -i video_parts.txt -c copy output.mp4
subprocess.call(
    ["ffmpeg",
     "-y",  # https://stackoverflow.com/questions/39788972/ffmpeg-overwrite-output-file-if-exists
     "-safe",
     "0",  # https://stackoverflow.com/questions/38996925/ffmpeg-concat-unsafe-file-name
     "-f",
     "concat",  # https://stackoverflow.com/questions/7333232/how-to-concatenate-two-mp4-files-using-ffmpeg
     "-i",
     '_youtube_video_parts.txt',
     "-c",
     "copy",
     youtube_video_file]
)
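
# Typical usage (assumption): run from the directory that holds tts.py, e.g.
#     python audiobook.py
# Outputs: per-chapter .wav files and a full audiobook .wav/.mp4 per voice under
# ./tts_audiobooks/voices/, plus the concatenated audiobook_shift_youtube.mp4.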