How to Use This Model
Installation
- pip install coqui-tts
- Locate TTS/tts/layers/xtts/tokenizers.py in your site-packages directory.
- Replace that installed tokenizers.py with the tokenizers.py file provided in this repository.
- And you should be good to go!
Note: The model might not perform well on very long inputs. You can write your own text splitter to split longer inputs into shorter sentences based on your needs.
Example
Source Voice
Generated Voice
Inference Code
import torch
import torchaudio
from tqdm import tqdm
from underthesea import sent_tokenize
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
# Run on the first CUDA GPU when one is available; otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Relative paths: these files are looked up in the current working directory.
xtts_checkpoint = "model.pth"
xtts_config = "config.json"
xtts_vocab = "vocab.json"

# Build the model from its JSON configuration, then load the checkpoint and
# vocabulary files into it.
config = XttsConfig()
config.load_json(xtts_config)
XTTS_MODEL = Xtts.init_from_config(config)
XTTS_MODEL.load_checkpoint(
    config,
    checkpoint_path=xtts_checkpoint,
    vocab_path=xtts_vocab,
    use_deepspeed=False,
)
XTTS_MODEL.to(device)
print("Model loaded successfully!")
# In case you are cloning from WhatsApp voice notes:
# re-encode the OGG recording as a WAV file before using it as the speaker audio.
from pydub import AudioSegment

ogg_source = "input-4.ogg"
wav_target = "output.wav"

audio = AudioSegment.from_file(ogg_source, format="ogg")
audio.export(wav_target, format="wav")
print("Conversion complete!")
# Inference
# Text to synthesize, the recording to take the voice from, and the language code.
# A plain string literal is used (no f-prefix) so braces in the text stay literal.
tts_text = """یہ ٹی ٹی ایس کیسا ہے؟ اس کے بارے میں کچھ بتائیں"""
speaker_audio_file = "output.wav"
lang = "ur"

# Compute the conditioning latents and speaker embedding from the speaker
# recording, using the limits stored in the loaded model's own config.
gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(
    audio_path=[speaker_audio_file],
    gpt_cond_len=XTTS_MODEL.config.gpt_cond_len,
    max_ref_length=XTTS_MODEL.config.max_ref_len,
    sound_norm_refs=XTTS_MODEL.config.sound_norm_refs,
)

# One entry per piece of text to synthesize. Long inputs can be split into
# several shorter strings here; each is generated separately and joined below.
tts_texts = [tts_text]
wav_chunks = []
for text in tqdm(tts_texts):
    wav_chunk = XTTS_MODEL.inference(
        text=text,
        language=lang,
        gpt_cond_latent=gpt_cond_latent,
        speaker_embedding=speaker_embedding,
        temperature=0.1,
        length_penalty=0.1,
        repetition_penalty=10.0,
        top_k=10,
        top_p=0.3,
    )
    wav_chunks.append(torch.tensor(wav_chunk["wav"]))

# Concatenate the chunks end to end, add a leading dimension of size 1, and
# make sure the result lives on the CPU.
out_wav = torch.cat(wav_chunks, dim=0).unsqueeze(0).cpu()

from IPython.display import Audio

# NOTE(review): 24000 is presumably the model's output sample rate - confirm
# against the loaded config before changing it.
Audio(out_wav, rate=24000)
- Downloads last month
- 7