Spaces:
Running
Running
Mahiruoshi
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -189,7 +189,7 @@ def infer(
|
|
189 |
torch.cuda.empty_cache()
|
190 |
return (hps.data.sampling_rate,gr.processing_utils.convert_to_16_bit_wav(audio))
|
191 |
|
192 |
-
|
193 |
def generate_audio_and_srt_for_group(group, outputPath, group_index, sampling_rate, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,spealerList,silenceTime):
|
194 |
audio_fin = []
|
195 |
ass_entries = []
|
@@ -244,6 +244,58 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
|
|
244 |
with open(ass_filename, 'w', encoding='utf-8') as f:
|
245 |
f.write(ass_header + '\n'.join(ass_entries))
|
246 |
return (hps.data.sampling_rate, np.concatenate(audio_fin))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
247 |
|
248 |
|
249 |
def infer_simple(
|
@@ -446,7 +498,7 @@ if __name__ == "__main__":
|
|
446 |
value="つくし|なんではるひかげやったの?!!",
|
447 |
)
|
448 |
groupSize = gr.Slider(
|
449 |
-
minimum=10, maximum=
|
450 |
)
|
451 |
silenceTime = gr.Slider(
|
452 |
minimum=0, maximum=1, value=0.5, step=0.01, label="句子的间隔"
|
|
|
189 |
torch.cuda.empty_cache()
|
190 |
return (hps.data.sampling_rate,gr.processing_utils.convert_to_16_bit_wav(audio))
|
191 |
|
192 |
+
'''srt格式
|
193 |
def generate_audio_and_srt_for_group(group, outputPath, group_index, sampling_rate, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,spealerList,silenceTime):
|
194 |
audio_fin = []
|
195 |
ass_entries = []
|
|
|
244 |
with open(ass_filename, 'w', encoding='utf-8') as f:
|
245 |
f.write(ass_header + '\n'.join(ass_entries))
|
246 |
return (hps.data.sampling_rate, np.concatenate(audio_fin))
|
247 |
+
'''
|
248 |
+
|
249 |
+
|
250 |
+
def format_srt_timestamp(seconds):
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded to the nearest millisecond *once* and then split
    with integer arithmetic. Extracting the fractional part with
    ``int((seconds - int(seconds)) * 1000)`` truncates floating-point error
    (``1.001`` would render as ``,000``), so that form is avoided.

    Args:
        seconds: Offset from the start of the audio, in seconds (int or
            float). Negative values are clamped to zero because SRT has no
            representation for them.

    Returns:
        The timestamp string, e.g. ``"01:01:01,500"``.
    """
    # Work in whole milliseconds so rounding carries correctly into the
    # seconds/minutes/hours fields (e.g. 59.9996 s -> 00:01:00,000).
    total_ms = max(0, int(round(seconds * 1000)))
    total_seconds, ms = divmod(total_ms, 1000)
    hours, remainder = divmod(total_seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{hours:02}:{minutes:02}:{secs:02},{ms:03}"
|
257 |
+
|
258 |
+
def clean_sentence(sentence):
    """Return ``sentence`` with every newline, carriage return and ASCII space removed.

    Only the three characters ``'\\n'``, ``'\\r'`` and ``' '`` are dropped;
    other whitespace (tabs, full-width spaces, ...) is left untouched.
    """
    # Mapping a code point to None in str.translate deletes that character.
    dropped = {ord('\n'): None, ord('\r'): None, ord(' '): None}
    return sentence.translate(dropped)
|
260 |
+
|
261 |
+
def generate_audio_and_srt_for_group(group, outputPath, group_index, sampling_rate, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale, spealerList, silenceTime):
    """Synthesize a group of sentences into one WAV file and a matching SRT file.

    Each sentence has the form ``"<speaker>|<text>"``. The speaker prefix is
    resolved either directly against ``hps.data.spk2id`` or through the alias
    table in ``spealerList``; the resolved speaker stays in effect for the
    following sentences until another prefix matches. Every synthesized
    sentence is followed by ``silenceTime`` seconds of silence, and one SRT
    cue covering speech plus silence is emitted for it.

    Args:
        group: Iterable of sentence strings to synthesize.
        outputPath: Directory that receives ``audiobook_part_<n>.wav`` and
            ``audiobook_part_<n>.srt``.
        group_index: Number used in the output file names.
        sampling_rate: Sample rate used for the WAV file and for all timing
            computations.
        speaker: Speaker used until a sentence prefix selects another one.
        sdp_ratio, noise_scale, noise_scale_w, length_scale: Passed through
            unchanged to ``infer_simple``.
        spealerList: Newline-separated alias table, one ``real|alias`` entry
            per line. Lines without a ``|`` are ignored.
        silenceTime: Pause inserted after every sentence, in seconds.

    Returns:
        ``(hps.data.sampling_rate, samples)`` where ``samples`` is the
        concatenation of all synthesized audio and silence.

    Raises:
        ValueError: If no sentence of the group could be synthesized (the
            same exception type ``np.concatenate`` raises on an empty list).
    """
    import logging

    log = logging.getLogger(__name__)

    # Parse the alias table once instead of on every sentence. Lines without
    # a "|" (e.g. a trailing blank line) are skipped: indexing them used to
    # raise IndexError, which silently dropped every sentence of the group.
    alias_to_speaker = {}
    for line in re.split('\n', spealerList or ''):
        fields = line.split("|")
        if len(fields) > 1:
            # Later lines override earlier ones, as in the per-line scan.
            alias_to_speaker[fields[1]] = fields[0]

    audio_fin = []
    srt_entries = []
    start_time = 0

    for sentence in group:
        try:
            fake_speaker = sentence.split("|")[0]
            if fake_speaker in hps.data.spk2id.keys():
                speaker = fake_speaker
            # An alias match takes precedence over a direct speaker match.
            if fake_speaker in alias_to_speaker:
                speaker = alias_to_speaker[fake_speaker]

            # Too short to hold any text; the speaker switch above still applies.
            if len(sentence) <= 2:
                continue

            clean_msg = clean_sentence(sentence.split("|")[-1])
            text = (remove_annotations(clean_msg) + "。").replace(",。", "。").replace("。。", "。")
            audio = infer_simple(text, sdp_ratio, noise_scale, noise_scale_w, length_scale, speaker)

            # Size the pause with the real sample rate so the subtitle
            # timeline matches the written audio (was hard-coded to 44100).
            silence_frames = int(silenceTime * sampling_rate)
            silence_data = np.zeros((silence_frames,), dtype=audio.dtype)
            # Derive the cue length from the frames actually appended.
            end_time = start_time + (len(audio) + silence_frames) / sampling_rate
        except Exception:
            # Best effort: one failing sentence must not abort the whole
            # group, but the failure is recorded instead of being swallowed.
            log.exception("Skipping sentence that could not be synthesized: %r", sentence)
            continue

        # Audio and subtitle are committed together so they cannot drift apart.
        audio_fin.append(audio)
        audio_fin.append(silence_data)
        # SRT cue numbers must be consecutive, so count emitted cues rather
        # than input sentences (skipped sentences would leave gaps).
        cue_number = len(srt_entries) + 1
        srt_entries.append(f"{cue_number}\n{format_srt_timestamp(start_time)} --> {format_srt_timestamp(end_time)}\n{clean_msg}\n\n")
        start_time = end_time

    if not audio_fin:
        raise ValueError(f"no audio could be synthesized for group {group_index}")

    wav_filename = os.path.join(outputPath, f'audiobook_part_{group_index}.wav')
    srt_filename = os.path.join(outputPath, f'audiobook_part_{group_index}.srt')

    # Concatenate once and reuse the result for both the file and the return value.
    combined = np.concatenate(audio_fin)
    write(wav_filename, sampling_rate, combined)

    with open(srt_filename, 'w', encoding='utf-8') as f:
        f.writelines(srt_entries)

    # NOTE(review): the returned rate comes from hps, not the sampling_rate
    # argument, exactly as before — confirm the two always agree.
    return (hps.data.sampling_rate, combined)
|
298 |
+
|
299 |
|
300 |
|
301 |
def infer_simple(
|
|
|
498 |
value="つくし|なんではるひかげやったの?!!",
|
499 |
)
|
500 |
groupSize = gr.Slider(
|
501 |
+
minimum=10, maximum=1000000 if torch.cuda.is_available() else 50,value = 50, step=1, label="单个音频文件包含的最大字数"
|
502 |
)
|
503 |
silenceTime = gr.Slider(
|
504 |
minimum=0, maximum=1, value=0.5, step=0.01, label="句子的间隔"
|