File size: 6,913 Bytes
5b7599e
 
 
 
25b87f7
 
 
 
 
5b7599e
 
 
25b87f7
 
 
 
 
 
 
 
229707e
9d6172b
 
 
5b7599e
 
229707e
25b87f7
9d6172b
25b87f7
 
 
 
 
 
 
5b7599e
 
25b87f7
 
 
 
 
5b7599e
25b87f7
5b7599e
25b87f7
 
5b7599e
 
25b87f7
5b7599e
25b87f7
5b7599e
25b87f7
 
6d04b6d
 
f1368b1
5b7599e
6d04b6d
25b87f7
5b7599e
 
 
 
25b87f7
5b7599e
25b87f7
5b7599e
 
 
25b87f7
5b7599e
 
 
 
25b87f7
5b7599e
25b87f7
5b7599e
25b87f7
5b7599e
 
 
 
25b87f7
5b7599e
25b87f7
5b7599e
25b87f7
5b7599e
25b87f7
5b7599e
25b87f7
5b7599e
 
 
 
 
25b87f7
5b7599e
25b87f7
5b7599e
25b87f7
5b7599e
25b87f7
 
5b7599e
 
 
 
25b87f7
5b7599e
25b87f7
 
 
 
 
 
 
 
 
 
 
5b7599e
 
25b87f7
5b7599e
 
25b87f7
5b7599e
25b87f7
5b7599e
25b87f7
5b7599e
 
 
25b87f7
5b7599e
25b87f7
5b7599e
25b87f7
5b7599e
 
25b87f7
5b7599e
25b87f7
5b7599e
25b87f7
5b7599e
 
 
 
25b87f7
 
 
 
5b7599e
 
25b87f7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5b7599e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25b87f7
5b7599e
 
 
 
 
 
25b87f7
 
5b7599e
 
25b87f7
5b7599e
 
25b87f7
 
 
 
 
 
 
 
 
5b7599e
 
25b87f7
 
 
 
 
5b7599e
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
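# Creates one .wav file per chapter, the full audiobook .wav and a still-image .mp4, for every voice in `voices`.
# Input: assets/audiobook_TTS.docx (adapted from assets/INCLUSION_IN_MUSEUMS_audiobook.docx).
# __________________________________________________________________________________________________
#   ROOT_DIR/<voice>/<voice>_chapter_0.wav, .., ROOT_DIR/<voice>/<voice>_chapter_N.wav
#   ROOT_DIR/<voice>/<voice>_full_audiobook.wav
#   ROOT_DIR/<voice>/<voice>_full_audiobook.mp4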

import cv2
import subprocess
import numpy as np
import soundfile
import docx  # package = python-docx
import audresample
import urllib
from pathlib import Path
from moviepy.editor import *

# Sample rate in Hz; further down it is multiplied by a duration in seconds
# to size the silence buffers inserted between paragraphs and chapters.
FS = 24000

# Output root - every voice gets its own sub-directory beneath it.
ROOT_DIR = './tts_audiobooks/voices/'

# Voices to render - select any voice from https://audeering.github.io/shift/
# Currently disabled alternatives:
#   'en_US/vctk_low#p228'     https://huggingface.co/dkounadis/artificial-styletts2/discussions/1#67854dcbd3e6beb1a78f7f20
#   'af_ZA_google-nwu_0184'   https://huggingface.co/dkounadis/artificial-styletts2/discussions/1#6783e3b00e7d90facec060c6
#   'jv_ID_google-gmu_06207'
voices = ['en_US/vctk_low#p326']  # native voice

Path(ROOT_DIR).mkdir(parents=True, exist_ok=True)

# The original document can be fetched with:
#   urllib.request.urlretrieve("https://github.com/audeering/shift/raw/refs/heads/main/assets/INCLUSION_IN_MUSEUMS_audiobook.docx", "audiobook_TTS.docx")
# The copy loaded here was slightly edited to be audible, by adding extra
# words such as 'by them from this of' etc.
d = docx.Document('assets/audiobook_TTS.docx')

# Initial state for the paragraph loop below.
chapter_counter = 0                 # index of the chapter currently being assembled
last_paragraph_was_silence = False  # silence is added only once per run of empty paragraphs

# Sample rate (Hz) of every .wav written below; TTS output is resampled to it
# and silence buffers are sized with it so that the stated durations hold.
OUT_FS = 16000

# Sentinel text appended after the document's paragraphs. It is never spoken:
# its only purpose is to enter the 'CHAPTER:' branch once more so the last
# chapter is flushed to disk. Using a plain string (instead of overwriting
# d.paragraphs[-1].text) leaves the real last paragraph of the document intact.
END_OF_BOOK = 'CHAPTER: END OF AUDIOBOOK'

for vox in voices:

    # Voice id -> filesystem-friendly name,
    # e.g. 'en_US/vctk_low#p326' -> 'en_US_vctk_p326'.
    vox_str = vox.replace(
                '/', '_').replace(
                '#', '_').replace(
                'cmu-arctic', 'cmu_arctic').replace(
                '_low', '').replace('-', '')

    # Directory holding <vox_str>_chapter_N.wav, the full .wav and the .mp4.
    voice_dir = ROOT_DIR + vox_str + '/'
    Path(voice_dir).mkdir(parents=True, exist_ok=True)

    print(vox)

    # Per-voice state. Reset here so that a second voice starts again at
    # chapter 0 instead of continuing the numbering of the previous voice.
    total = []            # finished chapters, in order (for the full audiobook)
    chapter = []          # audio tiles of the chapter currently being built
    total_samples = 0     # running length of `total`, avoids re-concatenating
    chapter_counter = 0
    last_paragraph_was_silence = False

    paragraph_texts = [para.text for para in d.paragraphs]
    paragraph_texts.append(END_OF_BOOK)
    last_index = len(paragraph_texts) - 1

    for index, t in enumerate(paragraph_texts):

        # A 'CHAPTER:' paragraph closes the chapter collected so far.
        # Whatever precedes the first 'CHAPTER:' paragraph becomes chapter 0.
        if t.startswith('CHAPTER:'):

            # 0.24 s of silence at the end of the chapter
            chapter.append(np.zeros(int(.24 * OUT_FS), dtype=np.float32))

            audio = np.concatenate(chapter)

            soundfile.write(
                voice_dir + f'{vox_str}_chapter_{chapter_counter}.wav',
                audio,
                OUT_FS)

            total.append(audio)
            total_samples += audio.shape[0]
            chapter = []

            if index == last_index:
                # Sentinel reached: last chapter is saved, nothing left to speak.
                break

            chapter_counter += 1

            elapsed = total_samples // OUT_FS  # whole seconds of audio so far
            # Seconds are zero-padded so that 3 min 5 s reads 3:05, not 3:5.
            print(f'Start Chapter {chapter_counter}, timestamp:{elapsed // 60}:{elapsed % 60:02d}')

        # Speak the paragraph unless it is (nearly) empty, wrapped in {...}
        # or mentions 'Figure'. Chapter headings are spoken as well.
        if len(t) > 2 and t[0] != '{' and t[-1] != '}' and 'Figure' not in t:

            # tts.py takes the text from a file
            with open('_tmp.txt', 'w') as f:
                f.write(t.lower())  # WARNING! cast to lower otherwise accesibiliTy is pronounced accessibili..tay

            # check=True: if tts.py fails, stop here instead of silently
            # re-reading the out/_tmp.wav left over from the previous paragraph.
            subprocess.run(
                [
                    "python",
                    "tts.py",
                    "--text", "_tmp.txt",
                    # "--affect",
                    # '--image', '_tmp_banner.png',
                    # '--scene', 'calm sounds of castle',
                    '--voice', vox,
                    '--out_file', '_tmp',  # tts.py is expected to write out/_tmp.wav
                ],
                check=True)

            audio, tts_fs = soundfile.read('out/_tmp.wav')
            # Resample from the rate stored in the file (instead of assuming
            # 24000); [0, :] picks the first channel of the 2-D result.
            audio = audresample.resample(audio.astype(np.float32), tts_fs, OUT_FS)[0, :]
            chapter.append(audio)

            last_paragraph_was_silence = False

        elif not last_paragraph_was_silence:

            # Skipped paragraph (e.g. end of Section): add 0.1 s of silence,
            # but only once for a run of consecutive skipped paragraphs.
            chapter.append(np.zeros(int(.1 * OUT_FS), dtype=np.float32))
            last_paragraph_was_silence = True

    # Full audiobook for this voice. `total` is never empty because the
    # sentinel always flushes at least one chapter.
    full_wav = voice_dir + f'{vox_str}_full_audiobook.wav'
    soundfile.write(full_wav, np.concatenate(total), OUT_FS)

    elapsed = total_samples // OUT_FS
    print(f'End of audiobook, duration:{elapsed // 60}:{elapsed % 60:02d}')

    # ---- still image shown in the video: logo, title and voice name ----

    voice_pic = np.zeros((574, 1024, 3), dtype=np.uint8)

    shift_logo = cv2.imread('assets/shift_banner.png')
    if shift_logo is None:
        # cv2.imread returns None rather than raising when it cannot read the file
        raise FileNotFoundError("cannot read 'assets/shift_banner.png'")

    # Copy the top-left corner of the logo (at most 100 x 400 pixels).
    logo_h = min(100, shift_logo.shape[0])
    logo_w = min(400, shift_logo.shape[1])
    voice_pic[:logo_h, :logo_w, :] = shift_logo[:logo_h, :logo_w, :]

    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 2
    font_color = (69, 74, 74)
    thickness = 4
    line_type = 2

    # Voice name on the bottom line of the canvas. The previous origin
    # (0, 640) lies below the 574-pixel-high canvas, so the name was never
    # visible; anchor the text baseline just above the bottom edge instead.
    _, baseline = cv2.getTextSize(vox, font, font_scale, thickness)
    cv2.putText(voice_pic, vox,
                (0, voice_pic.shape[0] - baseline),  # (x, y) of bottom-left text corner
                font,
                font_scale,
                font_color,
                thickness,
                line_type)
    cv2.putText(voice_pic, 'AUDIOBOOK',
                (170, 170),
                font,
                4,
                font_color,
                thickness,
                line_type)
    cv2.putText(voice_pic, 'TTS voice =',
                (0, 500),
                font,
                font_scale,
                font_color,
                thickness,
                line_type)

    STATIC_FRAME = '_tmp.png'
    cv2.imwrite(STATIC_FRAME, voice_pic)

    # ---- silent video of the still image (MoviePy) ----

    SILENT_VIDEO = '_tmp.mp4'

    # NOTE(review): the clip lasts 5 s regardless of the audiobook length, and
    # the ffmpeg call below copies the video stream without extending it, so
    # the video track of the .mp4 is presumably only 5 s long. If the picture
    # should last as long as the audio, use total_samples / OUT_FS here - confirm.
    clip_silent = ImageClip(STATIC_FRAME).set_duration(5)
    clip_silent.write_videofile(SILENT_VIDEO, fps=24)

    # ---- mux: video stream of SILENT_VIDEO + audio stream of the full .wav ----

    ffmpeg_status = subprocess.call(
        ["ffmpeg",
         "-y",                      # overwrite an existing output file
         "-i", SILENT_VIDEO,
         "-i", full_wav,
         "-c:v", "copy",            # keep the video stream as encoded by MoviePy
         "-map", "0:v:0",
         "-map", "1:a:0",
         voice_dir + f'{vox_str}_full_audiobook.mp4',  # OUT_FILE
         ])
    if ffmpeg_status != 0:
        # Report instead of aborting, so remaining voices are still rendered.
        print(f'WARNING: ffmpeg exited with status {ffmpeg_status} for voice {vox}')