less periodicity
Browse files

- Modules/vits/attentions.py +0 -48
- Modules/vits/commons.py +2 -89
- Modules/vits/utils.py +0 -55
- api.py +20 -22
- demo.py +5 -5
- live_demo.py +2 -3
Modules/vits/attentions.py
CHANGED
@@ -47,55 +47,7 @@ class Encoder(nn.Module):
     return x


-class Decoder(nn.Module):
-  def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs):
-    super().__init__()
-    self.hidden_channels = hidden_channels
-    self.filter_channels = filter_channels
-    self.n_heads = n_heads
-    self.n_layers = n_layers
-    self.kernel_size = kernel_size
-    self.p_dropout = p_dropout
-    self.proximal_bias = proximal_bias
-    self.proximal_init = proximal_init
-
-    self.drop = nn.Dropout(p_dropout)
-    self.self_attn_layers = nn.ModuleList()
-    self.norm_layers_0 = nn.ModuleList()
-    self.encdec_attn_layers = nn.ModuleList()
-    self.norm_layers_1 = nn.ModuleList()
-    self.ffn_layers = nn.ModuleList()
-    self.norm_layers_2 = nn.ModuleList()
-    for i in range(self.n_layers):
-      self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init))
-      self.norm_layers_0.append(LayerNorm(hidden_channels))
-      self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout))
-      self.norm_layers_1.append(LayerNorm(hidden_channels))
-      self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True))
-      self.norm_layers_2.append(LayerNorm(hidden_channels))
-
-  def forward(self, x, x_mask, h, h_mask):
-    """
-    x: decoder input
-    h: encoder output
-    """
-    self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
-    encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
-    x = x * x_mask
-    for i in range(self.n_layers):
-      y = self.self_attn_layers[i](x, x, self_attn_mask)
-      y = self.drop(y)
-      x = self.norm_layers_0[i](x + y)
-
-      y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
-      y = self.drop(y)
-      x = self.norm_layers_1[i](x + y)
-
-      y = self.ffn_layers[i](x, x_mask)
-      y = self.drop(y)
-      x = self.norm_layers_2[i](x + y)
-    x = x * x_mask
-    return x


 class MultiHeadAttention(nn.Module):
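The deleted Decoder is the text-decoder half of the original VITS attention stack, apparently unused at inference here; it was also the only consumer of commons.subsequent_mask, which this commit deletes as well (see commons.py below). For reference, a minimal sketch of the causal mask the removed forward() relied on, reconstructed from the deleted lines:

import torch

def subsequent_mask(length):
    # lower-triangular mask: position t may only attend to positions <= t,
    # exactly what the deleted Decoder passed as self_attn_mask
    return torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)

print(subsequent_mask(3)[0, 0])
# tensor([[1., 0., 0.],
#         [1., 1., 0.],
#         [1., 1., 1.]])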
Modules/vits/commons.py
CHANGED
@@ -1,10 +1,8 @@
 import math
 import numpy as np
 import torch
-from torch import nn
 from torch.nn import functional as F

-
 def init_weights(m, mean=0.0, std=0.01):
   classname = m.__class__.__name__
   if classname.find("Conv") != -1:
@@ -15,13 +13,8 @@ def get_padding(kernel_size, dilation=1):
   return int((kernel_size*dilation - dilation)/2)


-def convert_pad_shape(pad_shape):
-  l = pad_shape[::-1]
-  pad_shape = [item for sublist in l for item in sublist]
-  return pad_shape
-
-
 def intersperse(lst, item):
+  # needed for inference
   result = [item] * (len(lst) * 2 + 1)
   result[1::2] = lst
   return result
@@ -40,63 +33,6 @@ def rand_gumbel(shape):
   return -torch.log(-torch.log(uniform_samples))


-def rand_gumbel_like(x):
-  g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
-  return g
-
-
-def slice_segments(x, ids_str, segment_size=4):
-  ret = torch.zeros_like(x[:, :, :segment_size])
-  for i in range(x.size(0)):
-    idx_str = ids_str[i]
-    idx_end = idx_str + segment_size
-    ret[i] = x[i, :, idx_str:idx_end]
-  return ret
-
-
-def rand_slice_segments(x, x_lengths=None, segment_size=4):
-  b, d, t = x.size()
-  if x_lengths is None:
-    x_lengths = t
-  ids_str_max = x_lengths - segment_size + 1
-  ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
-  ret = slice_segments(x, ids_str, segment_size)
-  return ret, ids_str
-
-
-def get_timing_signal_1d(
-    length, channels, min_timescale=1.0, max_timescale=1.0e4):
-  position = torch.arange(length, dtype=torch.float)
-  num_timescales = channels // 2
-  log_timescale_increment = (
-      math.log(float(max_timescale) / float(min_timescale)) /
-      (num_timescales - 1))
-  inv_timescales = min_timescale * torch.exp(
-      torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
-  scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
-  signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
-  signal = F.pad(signal, [0, 0, 0, channels % 2])
-  signal = signal.view(1, channels, length)
-  return signal
-
-
-def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
-  b, channels, length = x.size()
-  signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
-  return x + signal.to(dtype=x.dtype, device=x.device)
-
-
-def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
-  b, channels, length = x.size()
-  signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
-  return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
-
-
-def subsequent_mask(length):
-  mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
-  return mask
-
-
 @torch.jit.script
 def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
   n_channels_int = n_channels[0]
@@ -113,11 +49,6 @@ def convert_pad_shape(pad_shape):
   return pad_shape


-def shift_1d(x):
-  x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
-  return x
-
-
 def sequence_mask(length, max_length=None):
   if max_length is None:
     max_length = length.max()
@@ -140,22 +71,4 @@ def generate_path(duration, mask):
   path = path.view(b, t_x, t_y)
   path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
   path = path.unsqueeze(1).transpose(2,3) * mask
-  return path
-
-
-def clip_grad_value_(parameters, clip_value, norm_type=2):
-  if isinstance(parameters, torch.Tensor):
-    parameters = [parameters]
-  parameters = list(filter(lambda p: p.grad is not None, parameters))
-  norm_type = float(norm_type)
-  if clip_value is not None:
-    clip_value = float(clip_value)
-
-  total_norm = 0
-  for p in parameters:
-    param_norm = p.grad.data.norm(norm_type)
-    total_norm += param_norm.item() ** norm_type
-    if clip_value is not None:
-      p.grad.data.clamp_(min=-clip_value, max=clip_value)
-  total_norm = total_norm ** (1. / norm_type)
-  return total_norm
+  return path
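The one survivor that gains a comment, intersperse, is the helper VITS-style front ends use to interleave a blank token between phoneme IDs before the text encoder, hence the new "needed for inference" note. A quick usage sketch (blank id 0 is the usual VITS convention, assumed here):

def intersperse(lst, item):
    # [a, b, c] -> [item, a, item, b, item, c, item]
    result = [item] * (len(lst) * 2 + 1)
    result[1::2] = lst
    return result

phoneme_ids = [12, 7, 33]
print(intersperse(phoneme_ids, 0))  # [0, 12, 0, 7, 0, 33, 0]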
Modules/vits/utils.py
CHANGED
@@ -56,25 +56,6 @@ def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path)
     'learning_rate': learning_rate}, checkpoint_path)


-def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050):
-  for k, v in scalars.items():
-    writer.add_scalar(k, v, global_step)
-  for k, v in histograms.items():
-    writer.add_histogram(k, v, global_step)
-  for k, v in images.items():
-    writer.add_image(k, v, global_step, dataformats='HWC')
-  for k, v in audios.items():
-    writer.add_audio(k, v, global_step, audio_sampling_rate)
-
-
-def latest_checkpoint_path(dir_path, regex="G_*.pth"):
-  f_list = glob.glob(os.path.join(dir_path, regex))
-  f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f))))
-  x = f_list[-1]
-  print(x)
-  return x
-
-
 def plot_spectrogram_to_numpy(spectrogram):
   global MATPLOTLIB_FLAG
   if not MATPLOTLIB_FLAG:
@@ -190,42 +171,6 @@ def get_hparams_from_file(config_path):
   hparams = HParams(**config)
   return hparams

-
-def check_git_hash(model_dir):
-  source_dir = os.path.dirname(os.path.realpath(__file__))
-  if not os.path.exists(os.path.join(source_dir, ".git")):
-    logger.warn("{} is not a git repository, therefore hash value comparison will be ignored.".format(
-      source_dir
-    ))
-    return
-
-  cur_hash = subprocess.getoutput("git rev-parse HEAD")
-
-  path = os.path.join(model_dir, "githash")
-  if os.path.exists(path):
-    saved_hash = open(path).read()
-    if saved_hash != cur_hash:
-      logger.warn("git hash values are different. {}(saved) != {}(current)".format(
-        saved_hash[:8], cur_hash[:8]))
-  else:
-    open(path, "w").write(cur_hash)
-
-
-def get_logger(model_dir, filename="train.log"):
-  global logger
-  logger = logging.getLogger(os.path.basename(model_dir))
-  logger.setLevel(logging.DEBUG)
-
-  formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s")
-  if not os.path.exists(model_dir):
-    os.makedirs(model_dir)
-  h = logging.FileHandler(os.path.join(model_dir, filename))
-  h.setLevel(logging.DEBUG)
-  h.setFormatter(formatter)
-  logger.addHandler(h)
-  return logger
-
-
 class HParams():
   def __init__(self, **kwargs):
     for k, v in kwargs.items():
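Everything removed here is training-time tooling (TensorBoard summaries, checkpoint discovery, git-hash bookkeeping, file loggers); the inference path keeps only plotting and hparams loading. Should checkpoint discovery ever be needed again, the deleted logic was a digit-sort over a glob, sketched here with its known quirk that digits in the directory path also enter the sort key:

import glob
import os

def latest_checkpoint_path(dir_path, regex="G_*.pth"):
    # as deleted: pick the file whose name contains the largest number, e.g. G_12000.pth
    f_list = glob.glob(os.path.join(dir_path, regex))
    f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f))))
    return f_list[-1]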
api.py
CHANGED
@@ -17,7 +17,7 @@ from flask_cors import CORS
 from moviepy.editor import *
 from audiocraft.builders import AudioGen
 CACHE_DIR = 'flask_cache/'
-NUM_SOUND_GENERATIONS =
+NUM_SOUND_GENERATIONS = 3  # batch size to generate same text (same soundscape for long video)

 sound_generator = AudioGen(duration=4.74, device='cuda:0').to('cuda:0').eval()

@@ -88,54 +88,51 @@ def overlay(x, soundscape=None):
         background = audresample.resample(
             background,
             original_rate=16000,  # sound_generator.sample_rate,
-            target_rate=24000)[0, :-25000]
-        # TODO discards last samples due to splash sound / polarity change / on long sounds ~ videos / NOT DROP FOR live_demo.py
-
-        # background /= np.abs(background).max() + 1e-7  Apply in sound_generator()
+            target_rate=24000)[0, :-250]  # last samples have splash sounds DISCARD 25000 last samples

         k = background.shape[0]

+        hop = int(.99 * k)  # only overlap 10%
         n_repeat = len(x) // hop
         total = np.zeros( hop * (n_repeat + 2))  # add some extra pad space for last frame to fit

         m = np.ones(k)
         overlap = k - hop
         m[hop:] = np.linspace(1, 0, overlap)  # tril mask for avg sound in the interpolated hop

         for j in range(n_repeat):
-            # total[j * (k+hop):(j+1) * k + j*hop] = background
-            total[j*hop:j*hop + k] += m * background  # the total is already smoothly falling due to the previous mask. It is only the newly added signal that needs to rise smoothly
+            total[j*hop:j*hop + k] += m * background  # the total is already smoothly fading due to the previous mask. Only the new addition of signal needs to rise smoothly
+            print((total < -1).sum(), (total > 1).sum(), 'OUTOF BOUNDS\n\n\n\n')
         # total = total.clip(-1, 1)  # if too many signals were added on top of each other
         # print(total[40000:70000].tolist())
         print(np.logical_and(total > .1, total < .9).sum(), total.shape, 'ev')

         # less periodic
         for _ in range(4):
-            #
+            total = _shift(total)
+
+        # amplify sounds full [-1, 1]
+        total /= np.abs(total).max() + 1e-7
+        x = .4 * x + .6 * total[:len(x)]
     else:
         print('sound_background = None')
     return x

 def tts_multi_sentence(precomputed_style_vector=None,
                        text=None,
                        voice=None,
@@ -258,6 +255,7 @@ def serve_wav():
     with open(args.text, 'r') as f:
         t = ''.join(f)
     t = re.sub(' +', ' ', t)  # delete spaces
+    # -- sub all punctuation with ' '
     text = split_into_sentences(t)  # split to short sentences (~100 phonemes max for OOM)

     # ====STYLE VECTOR====
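The overlay() rewrite is an overlap-add loop: each AudioGen clip of k samples is laid down every hop samples, the linear mask m fades each copy out over its last k - hop samples so consecutive copies crossfade, and four passes of _shift then break the exact repetition period (the "less periodicity" of the commit title). _shift itself is defined elsewhere in api.py and not shown in this diff; the sketch below assumes a random circular shift, one plausible reading:

import numpy as np

def _shift(x):
    # assumption: random circular shift, so repeats of the same clip no longer align
    return np.roll(x, np.random.randint(len(x)))

def overlay_background(x, background):
    k = background.shape[0]
    hop = int(.99 * k)                      # copies overlap by k - hop samples
    n_repeat = len(x) // hop
    total = np.zeros(hop * (n_repeat + 2))  # extra pad so the last copy fits

    m = np.ones(k)
    m[hop:] = np.linspace(1, 0, k - hop)    # linear fade-out over the overlap region
    for j in range(n_repeat):
        total[j * hop:j * hop + k] += m * background  # old tail fades as the next copy enters

    for _ in range(4):
        total = _shift(total)               # de-periodicise the repeated clip

    total /= np.abs(total).max() + 1e-7     # amplify to full [-1, 1]
    return .4 * x + .6 * total[:len(x)]     # mix speech with the soundscape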
demo.py
CHANGED
@@ -4,9 +4,9 @@ import msinference


 def tts_entry(text='A quick brown fox jumps over the lazy dog. Sweet dreams are made of this, I traveled the world and the seven seas.',
-              voice='af_ZA_google-nwu_1919',  # 'serbian',
-              speed=1.4,  # only for
-              affect = True  # False =
+              voice='af_ZA_google-nwu_1919',  # 'serbian', 'en_US/vctk_low#p276', 'isl',
+              speed=1.4,  # only for MMS TTS
+              affect = True  # False = higher clarity sound for partially sighted users
               ):
     '''returns 24kHZ np.array TTS

@@ -51,8 +51,8 @@ def tts_entry(text='A quick brown fox jumps over the lazy dog. Sweet dreams are

     else:

-        #
-        x = msinference.foreign(text=text,
+        # MMS TTS - list of sentences
+        x = msinference.foreign(text=[text],
                                 lang=voice,  # voice = 'romanian', 'serbian' 'hungarian'
                                 speed=speed)  # normalisation externally
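With the new routing, foreign voices send a list of sentences to MMS TTS while the default voice keeps the affective path. A minimal call, assuming soundfile for writing the output (any 24 kHz wav writer works):

import soundfile as sf
from demo import tts_entry

wav = tts_entry(text='Sweet dreams are made of this.',
                voice='af_ZA_google-nwu_1919')  # foreign names, e.g. 'romanian', route to MMS TTS
sf.write('out.wav', wav, 24000)  # tts_entry returns a 24 kHz np.array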
live_demo.py
CHANGED
@@ -31,9 +31,8 @@ while True:
     _str = input("\n\n\n\nDescribe Any Sound: \n\n\n\n")
     args.soundscape = _str

-
-
-    _str += 'Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata few silence for audiogen to impress you.'
+    _str += 'Lorem ipsum dolor sit amet, consetetur elixir sed diam nonumy eirmod tempor invidunt labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Soutet clita kasd gubergren.'
+
     args.text = '_tmp.txt'  # input -> .txt (implementation thought for audiobooks in API)

     with open(args.text, 'w') as f: