# Inference with a fine-tuned XTTS v2 model: clone a reference speaker's voice
# and synthesize Hindi text sentence by sentence.
import os

import torch
import torchaudio
from tqdm import tqdm
from underthesea import sent_tokenize

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer

device = "cuda:0" if torch.cuda.is_available() else "cpu"
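
# Paths to the fine-tuned checkpoint, its config, and the original XTTS v2 vocabulary.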
checkpoint_dir = "/export/home/vivian/svarah/XTTSv2-Finetuning-for-New-Languages/checkpoints/GPT_XTTS_FT-March-12-2025_11+28PM-8e59ec3"
xtts_checkpoint = os.path.join(checkpoint_dir, "best_model.pth")
xtts_config = os.path.join(checkpoint_dir, "config.json")
xtts_vocab = "/export/home/vivian/svarah/XTTSv2-Finetuning-for-New-Languages/checkpoints/XTTS_v2.0_original_model_files/vocab.json"
speaker_file_path = os.path.join(checkpoint_dir, "speakers_xtts.pth")
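
# Load the model config, restore the fine-tuned checkpoint, and move the model to the target device.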
config = XttsConfig()
config.load_json(xtts_config)

tokenizer = VoiceBpeTokenizer(xtts_vocab)

XTTS_MODEL = Xtts.init_from_config(config)
XTTS_MODEL.load_checkpoint(
    config,
    checkpoint_path=xtts_checkpoint,
    checkpoint_dir=checkpoint_dir,
    vocab_path=xtts_vocab,
    speaker_file_path=speaker_file_path,
    use_deepspeed=False,
)
XTTS_MODEL.to(device)

print("Model loaded successfully!")
tts_text = "दोस्तों इंडिया ने आईसीसी चैंपियंस ट्रॉफी twentytwentyfive जीत ली है! और उसके बाद एसएस राजामौली, रोहित शर्मा की बायोपिक बनाने वाले हैं, जिसमें उनका रोल उनके ही जैसे नयन-नक्श और कद-काठी रखने वाले जूनियर एनटीआर करेंगे।फिल्म में विराट कोहली का रोल रामचरण निभाएंगे, वहीं एमएस धोनी का रोल प्रभास को मिला है, क्योंकि उनके नाम में भी सात लेटर ही आते हैं। "

speaker_audio_file = "/export/home/vivian/svarah/Karan (happy)-20250311T073926Z-001/Karan (happy)/karan happy emotions new1.90.wav"

lang = "hi"
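
# Compute the GPT conditioning latents and the speaker embedding from the reference audio.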
gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(
    audio_path=speaker_audio_file,
    gpt_cond_len=XTTS_MODEL.config.gpt_cond_len,
    max_ref_length=XTTS_MODEL.config.max_ref_len,
    sound_norm_refs=XTTS_MODEL.config.sound_norm_refs,
)
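
# Split the input into sentences (underthesea's sent_tokenize) and synthesize one chunk per sentence.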
tts_texts = sent_tokenize(tts_text)
num = 0
wav_chunks = []
pause_duration = 0.4  # seconds of silence inserted between sentences
sample_rate = 24000  # XTTS v2 output sample rate
pause_samples = int(pause_duration * sample_rate)

for text in tqdm(tts_texts):
    num += 1
    wav_chunk = XTTS_MODEL.inference(
        text=text,
        language=lang,
        gpt_cond_latent=gpt_cond_latent,
        speaker_embedding=speaker_embedding,
        temperature=0.1,
        length_penalty=1.0,
        repetition_penalty=2.0,
        top_k=10,
        top_p=0.3,
    )
    wav_chunks.append(torch.tensor(wav_chunk["wav"]))
    # Insert a short pause after each sentence.
    wav_chunks.append(torch.zeros(pause_samples))
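
# Concatenate all chunks (speech + pauses) into a single waveform with a channel dimension.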
out_wav = torch.cat(wav_chunks, dim=0).unsqueeze(0).cpu()
print(f"Synthesized {num} sentences")
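
# Write the result to disk as a 24 kHz WAV file.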
output_audio_path = "/export/home/vivian/svarah/XTTSv2-Finetuning-for-New-Languages/checkpoints/output_audio.wav"
torchaudio.save(output_audio_path, out_wav, sample_rate=24000)

print(f"Audio saved to {output_audio_path}")