File size: 4,156 Bytes
1e4a2ab |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
import os
import sys
import pysrt
import codecs
import librosa
import asyncio
import requests
import tempfile
import numpy as np
import soundfile as sf
from edge_tts import Communicate
sys.path.append(os.getcwd())
from main.app.variables import translations
from main.app.core.ui import gr_info, gr_warning, gr_error
def synthesize_tts(prompt, voice, speed, output, pitch, google):
    """Synthesize `prompt` into an audio file at `output`.

    Args:
        prompt: Text to speak.
        voice: edge-tts voice name, or a language code for the Google branch.
        speed: Speaking-speed offset in percent (e.g. 10 -> +10%).
        output: Destination audio file path.
        pitch: Pitch offset; the edge-tts branch formats it as Hz.
        google: When truthy, use the Google Translate TTS endpoint instead
            of edge-tts and apply speed/pitch as librosa post-processing.
    """
    if not google:
        # edge-tts wants explicitly signed rate/pitch strings ("+10%", "-5Hz").
        asyncio.run(Communicate(text=prompt, voice=voice, rate=f"+{speed}%" if speed >= 0 else f"{speed}%", pitch=f"+{pitch}Hz" if pitch >= 0 else f"{pitch}Hz").save(output))
        return
    # The endpoint URL is rot13-obfuscated (decodes to the Google Translate TTS URL).
    response = requests.get(codecs.decode("uggcf://genafyngr.tbbtyr.pbz/genafyngr_ggf", "rot13"), params={"ie": "UTF-8", "q": prompt, "tl": voice, "ttsspeed": speed, "client": "tw-ob"}, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"})
    if response.status_code != 200:
        gr_error(f"{response.status_code}, {response.text}")
        return
    with open(output, "wb") as f:
        f.write(response.content)
    if pitch != 0 or speed != 0:
        y, sr = librosa.load(output, sr=None)
        # NOTE(review): n_steps is measured in semitones, while the edge-tts
        # branch treats `pitch` as Hz -- confirm the intended units.
        if pitch != 0: y = librosa.effects.pitch_shift(y, sr=sr, n_steps=pitch)
        if speed != 0:
            # BUG FIX: `speed` is a percent offset, but librosa's `rate` is a
            # multiplicative factor that must be > 0. The original passed the
            # raw percent (speed=50 -> 50x faster; speed<0 -> crash).
            rate = 1.0 + speed / 100.0
            if rate > 0: y = librosa.effects.time_stretch(y, rate=rate)
        # Derive the soundfile format from the output extension ("wav", "flac", ...).
        sf.write(file=output, data=y, samplerate=sr, format=os.path.splitext(os.path.basename(output))[-1].lower().replace('.', ''))
def time_stretch(y, sr, target_duration):
    """Fit signal `y` (sampled at `sr` Hz) to exactly `target_duration` seconds.

    The audio is first time-stretched toward the target length, then padded
    with trailing silence or truncated so the sample count matches exactly.
    """
    current_duration = len(y) / sr
    stretch_rate = current_duration / target_duration
    if stretch_rate != 1.0:
        y = librosa.effects.time_stretch(y=y.astype(np.float32), rate=stretch_rate)
    wanted = int(round(target_duration * sr))
    if len(y) >= wanted:
        return y[:wanted]
    return np.pad(y, (0, wanted - len(y)))
def pysrttime_to_seconds(t):
    """Convert a pysrt time object (hours/minutes/seconds/milliseconds) to float seconds."""
    total = t.hours * 3600 + t.minutes * 60 + t.seconds
    return total + t.milliseconds / 1000
def srt_tts(srt_file, out_file, voice, rate = 0, sr = 24000, google = False):
    """Render an .srt subtitle file into a single audio track at `out_file`.

    Each subtitle line is synthesized separately, stretched to fit its
    subtitle slot, and mixed into a silent buffer at its start time.

    Args:
        srt_file: Path to the subtitle file.
        out_file: Destination audio file path.
        voice: Voice name / language code forwarded to synthesize_tts.
        rate: Speaking-speed offset in percent.
        sr: Output sample rate in Hz.
        google: Use the Google Translate TTS backend.

    Raises:
        ValueError: If the subtitle file contains no entries.
    """
    subs = pysrt.open(srt_file)
    if not subs: raise ValueError(translations["srt"])
    # Buffer spans from 0 to the end time of the last subtitle.
    total_samples = int(round(pysrttime_to_seconds(subs[-1].end) * sr))
    final_audio = np.zeros(total_samples, dtype=np.float32)
    with tempfile.TemporaryDirectory() as tempdir:
        for idx, seg in enumerate(subs):
            wav_path = os.path.join(tempdir, f"seg_{idx}.wav")
            # BUG FIX: `rate` is a speaking-speed percent (cf. the edge-tts
            # "+X%" usage); the original passed it into the `pitch` slot and
            # hard-coded speed to 0.
            synthesize_tts(" ".join(seg.text.splitlines()), voice, rate, wav_path, 0, google)
            audio, file_sr = sf.read(wav_path, dtype=np.float32)
            # Crude linear-interpolation resample to the target sample rate.
            if file_sr != sr: audio = np.interp(np.linspace(0, len(audio) - 1, int(len(audio) * sr / file_sr)), np.arange(len(audio)), audio)
            adjusted = time_stretch(audio, sr, pysrttime_to_seconds(seg.duration))
            start_sample = int(round(pysrttime_to_seconds(seg.start) * sr))
            # BUG FIX: guard against segments starting at/after the buffer end,
            # which previously produced a negative slice bound.
            if start_sample >= total_samples: continue
            end_sample = min(start_sample + adjusted.shape[0], total_samples)
            # Mix (add) so overlapping subtitles are blended, not overwritten.
            final_audio[start_sample:end_sample] += adjusted[: end_sample - start_sample]
    sf.write(out_file, final_audio, sr)
def TTS(prompt, voice, speed, output, pitch, google, srt_input):
    """UI entry point: synthesize `prompt` (or an .srt file) into `output`.

    Args:
        prompt: Text to speak (ignored when `srt_input` ends with ".srt").
        voice: Voice name / language code.
        speed: Speaking-speed offset in percent.
        output: Output file path, or an existing directory (then "tts.wav" inside it).
        pitch: Pitch offset forwarded to synthesize_tts.
        google: Use the Google Translate TTS backend.
        srt_input: Optional path to a subtitle file.

    Returns:
        The output path on success, or None when validation fails.
    """
    if not srt_input: srt_input = ""
    use_srt = srt_input.endswith(".srt")
    if not prompt and not use_srt:
        gr_warning(translations["enter_the_text"])
        return None
    if not voice:
        gr_warning(translations["choose_voice"])
        return None
    if not output:
        gr_warning(translations["output_not_valid"])
        return None
    if os.path.isdir(output): output = os.path.join(output, "tts.wav")
    gr_info(translations["convert"].format(name=translations["text"]))
    # BUG FIX: the original used `os.path.dirname(output) or output`, which --
    # when the path had no directory part -- created a DIRECTORY named like
    # the output file, making the subsequent write fail.
    output_dir = os.path.dirname(output)
    if output_dir: os.makedirs(output_dir, exist_ok=True)
    if use_srt: srt_tts(srt_input, output, voice, 0, 24000, google)
    else: synthesize_tts(prompt, voice, speed, output, pitch, google)
    gr_info(translations["success"])
    return output