How to Use This Model

Installation

pip install coqui-tts
Locate TTS/tts/layers/xtts/tokenizers.py in your site-packages directory.
Replace the tokenizers.py file with the tokenizers.py in this repository.
And you should be good to go!

Note: The model might not perform well on very long inputs. You can write your own text splitter to split longer inputs into shorter sentences based on your needs.

Example

Source Voice

Generated Voice

Inference Code

import torch
import torchaudio
from tqdm import tqdm
from underthesea import sent_tokenize
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

device = "cuda:0" if torch.cuda.is_available() else "cpu"
xtts_checkpoint = "model.pth"
xtts_config = "config.json"
xtts_vocab = "vocab.json"


config = XttsConfig()
config.load_json(xtts_config)
XTTS_MODEL = Xtts.init_from_config(config)
XTTS_MODEL.load_checkpoint(config, checkpoint_path=xtts_checkpoint, vocab_path=xtts_vocab, use_deepspeed=False)
XTTS_MODEL.to(device)

print("Model loaded successfully!")

# In case you are cloning from WhatsApp voice notes:
from pydub import AudioSegment

audio = AudioSegment.from_file("input-4.ogg", format="ogg")
audio.export("output.wav", format="wav")
print("Conversion complete!")

# Inference
tts_text = f"""یہ ٹی ٹی ایس کیسا ہے؟ اس کے بارے میں کچھ بتائیں"""
speaker_audio_file = "output.wav"
lang = "ur"

gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(
    audio_path=["output.wav"],
    gpt_cond_len=XTTS_MODEL.config.gpt_cond_len,
    max_ref_length=XTTS_MODEL.config.max_ref_len,
    sound_norm_refs=XTTS_MODEL.config.sound_norm_refs,
)

tts_texts = [tts_text]
wav_chunks = []
for text in tqdm(tts_texts):
    wav_chunk = XTTS_MODEL.inference(
        text=text,
        language=lang,
        gpt_cond_latent=gpt_cond_latent,
        speaker_embedding=speaker_embedding,
        temperature=0.1,
        length_penalty=0.1,
        repetition_penalty=10.0,
        top_k=10,
        top_p=0.3,
    )
    wav_chunks.append(torch.tensor(wav_chunk["wav"]))

out_wav = torch.cat(wav_chunks, dim=0).unsqueeze(0).cpu()

from IPython.display import Audio
Audio(out_wav, rate=24000)