Mahiruoshi committed on
Commit
69fa064
·
verified ·
1 Parent(s): ea40339

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +54 -2
app.py CHANGED
@@ -189,7 +189,7 @@ def infer(
189
  torch.cuda.empty_cache()
190
  return (hps.data.sampling_rate,gr.processing_utils.convert_to_16_bit_wav(audio))
191
 
192
-
193
  def generate_audio_and_srt_for_group(group, outputPath, group_index, sampling_rate, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,spealerList,silenceTime):
194
  audio_fin = []
195
  ass_entries = []
@@ -244,6 +244,58 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
244
  with open(ass_filename, 'w', encoding='utf-8') as f:
245
  f.write(ass_header + '\n'.join(ass_entries))
246
  return (hps.data.sampling_rate, np.concatenate(audio_fin))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
247
 
248
 
249
  def infer_simple(
@@ -446,7 +498,7 @@ if __name__ == "__main__":
446
  value="つくし|なんではるひかげやったの?!!",
447
  )
448
  groupSize = gr.Slider(
449
- minimum=10, maximum=1000 if torch.cuda.is_available() else 50,value = 50, step=1, label="单个音频文件包含的最大字数"
450
  )
451
  silenceTime = gr.Slider(
452
  minimum=0, maximum=1, value=0.5, step=0.01, label="句子的间隔"
 
189
  torch.cuda.empty_cache()
190
  return (hps.data.sampling_rate,gr.processing_utils.convert_to_16_bit_wav(audio))
191
 
192
+ '''srt格式
193
  def generate_audio_and_srt_for_group(group, outputPath, group_index, sampling_rate, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,spealerList,silenceTime):
194
  audio_fin = []
195
  ass_entries = []
 
244
  with open(ass_filename, 'w', encoding='utf-8') as f:
245
  f.write(ass_header + '\n'.join(ass_entries))
246
  return (hps.data.sampling_rate, np.concatenate(audio_fin))
247
+ '''
248
+
249
+
250
def format_srt_timestamp(seconds):
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        seconds: Offset from the start of the audio, in seconds (int or float).
            Negative values are clamped to zero.

    Returns:
        The timestamp string, with the hours field zero-padded to at least two
        digits and the millisecond field to three.
    """
    # Round to whole milliseconds once, up front. Truncating the fractional
    # part separately (``int(frac * 1000)``) loses a millisecond to float
    # error (1.001 s became ",000") and cannot carry 999.6 ms into the
    # seconds field.
    total_ms = max(0, int(round(float(seconds) * 1000)))
    total_seconds, ms = divmod(total_ms, 1000)
    hours, remainder = divmod(total_seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{hours:02}:{minutes:02}:{secs:02},{ms:03}"
257
+
258
def clean_sentence(sentence):
    """Return ``sentence`` without newlines, carriage returns or ASCII spaces.

    Only these three characters are removed; tabs and full-width spaces are
    left untouched.
    """
    unwanted = {"\n", "\r", " "}
    kept_chars = [ch for ch in sentence if ch not in unwanted]
    return "".join(kept_chars)
260
+
261
def generate_audio_and_srt_for_group(group, outputPath, group_index, sampling_rate, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale, spealerList, silenceTime):
    """Synthesise a group of sentences into one WAV file plus a matching SRT file.

    Every sentence longer than two characters is passed to ``infer_simple``;
    the resulting clips are joined with ``silenceTime`` seconds of silence
    after each one, and one subtitle entry is emitted per clip.

    Args:
        group: Iterable of sentences. A sentence may be prefixed with
            ``name|``; the text after the last ``|`` is what gets spoken.
        outputPath: Directory receiving ``audiobook_part_<group_index>.wav``
            and ``audiobook_part_<group_index>.srt``.
        group_index: Number embedded in the two output file names.
        sampling_rate: Sample rate used for the WAV file, the silence padding
            and the subtitle timing.
        speaker: Speaker used until a sentence names another one. The choice
            is sticky: it carries over to the following sentences.
        sdp_ratio, noise_scale, noise_scale_w, length_scale: Forwarded
            unchanged to ``infer_simple``.
        spealerList: Newline-separated ``speaker|alias`` lines. When a
            sentence prefix equals an alias, the paired speaker is used.
            Lines without a ``|`` are ignored.
        silenceTime: Seconds of silence appended after every clip.

    Returns:
        ``(hps.data.sampling_rate, samples)`` where ``samples`` is the
        concatenated audio. Note the rate comes from ``hps``, not from the
        ``sampling_rate`` argument, as in the original code.

    Raises:
        ValueError: If no sentence in ``group`` produced any audio.
    """
    import logging  # local import: the file's import block is outside this view

    log = logging.getLogger(__name__)

    # Parse the alias table once instead of once per sentence. Malformed lines
    # (no "|", e.g. an empty text box) used to raise IndexError inside the
    # swallowed try-block, silently dropping every sentence; they are now
    # skipped. Later duplicates overwrite earlier ones, matching the original
    # "last match wins" loop.
    alias_to_speaker = {}
    for line in (spealerList or "").splitlines():
        fields = line.split("|")
        if len(fields) >= 2:
            alias_to_speaker[fields[1]] = fields[0]

    audio_fin = []
    srt_entries = []
    start_time = 0

    for sentence in group:
        try:
            prefix = sentence.split("|")[0]
            if prefix in hps.data.spk2id.keys():
                speaker = prefix
            # An alias match takes precedence over a direct speaker-id match.
            speaker = alias_to_speaker.get(prefix, speaker)

            # The original blank-sentence test chained "!=" with "or" and was
            # always true; only the length threshold ever had an effect.
            if len(sentence) <= 2:
                continue
            clean_msg = clean_sentence(sentence.split("|")[-1])
            if not clean_msg:
                # Whitespace-only text would yield a clip of a lone "。" and
                # an empty subtitle.
                continue

            text = (remove_annotations(clean_msg) + "。").replace(",。", "。").replace("。。", "。")
            audio = infer_simple(text, sdp_ratio, noise_scale, noise_scale_w, length_scale, speaker)
            # Size the pause with the real sample rate (was hard-coded 44100)
            # so it lasts silenceTime seconds at any rate.
            silence_data = np.zeros((int(silenceTime * sampling_rate),), dtype=audio.dtype)
        except Exception:
            # Best effort: one bad sentence must not abort the whole group,
            # but the failure is logged instead of being silently discarded.
            log.exception("Skipping sentence that could not be synthesised: %r", sentence)
            continue

        # Audio and subtitle bookkeeping happen together, after everything
        # that can fail, so the two outputs cannot drift apart.
        audio_fin.append(audio)
        audio_fin.append(silence_data)

        # Derive timing from the samples actually written, pause included.
        end_time = start_time + (len(audio) + len(silence_data)) / sampling_rate
        # Number by emitted entries so skipped sentences leave no gaps.
        # clean_msg is the text after the last "|", so it needs no further
        # separator handling.
        srt_entries.append(
            f"{len(srt_entries) + 1}\n"
            f"{format_srt_timestamp(start_time)} --> {format_srt_timestamp(end_time)}\n"
            f"{clean_msg}\n\n"
        )
        start_time = end_time

    if not audio_fin:
        # np.concatenate([]) raises ValueError anyway; fail with a clear
        # message before any file is written.
        raise ValueError(f"no audio was synthesised for group {group_index}")

    combined = np.concatenate(audio_fin)  # build once, reuse for file and return

    wav_filename = os.path.join(outputPath, f'audiobook_part_{group_index}.wav')
    srt_filename = os.path.join(outputPath, f'audiobook_part_{group_index}.srt')

    write(wav_filename, sampling_rate, combined)

    with open(srt_filename, 'w', encoding='utf-8') as f:
        f.writelines(srt_entries)
    return (hps.data.sampling_rate, combined)
298
+
299
 
300
 
301
  def infer_simple(
 
498
  value="つくし|なんではるひかげやったの?!!",
499
  )
500
  groupSize = gr.Slider(
501
+ minimum=10, maximum=1000000 if torch.cuda.is_available() else 50,value = 50, step=1, label="单个音频文件包含的最大字数"
502
  )
503
  silenceTime = gr.Slider(
504
  minimum=0, maximum=1, value=0.5, step=0.01, label="句子的间隔"