import torch
import torch.nn.functional as F
import librosa
import soundfile as sf
import utils
from transformers import WavLMModel, T5Tokenizer, T5EncoderModel
from models import SynthesizerTrn
from speaker_encoder.voice_encoder import SpeakerEncoder
from src.plugin_wrapper import DreamVG

# Load configurations and models
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Loading FreeVC...")
hps = utils.get_hparams_from_file("configs/freevc.json")
freevc = SynthesizerTrn(
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model).to(device)
freevc.eval()
utils.load_checkpoint("checkpoints/freevc.pth", freevc, None)

print("Loading Speaker Encoder...")
# Used when conditioning on a reference recording rather than a text prompt
# (see the optional sketch at the end of this script).
smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt')

print("Loading WavLM for content...")
cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device)

# Text encoder (Flan-T5) turns the style prompt into token embeddings
lm_path = 'google/flan-t5-base'
tokenizer = T5Tokenizer.from_pretrained(lm_path)
text_encoder = T5EncoderModel.from_pretrained(lm_path).to(device).eval()

# DreamVG plugin: a diffusion model that maps text embeddings to a speaker embedding
dreamvg = DreamVG(config_path='src/configs/plugin_cross.yaml',
                  ckpt_path='checkpoints/dreamvc_plugin.pt',
                  device=device)

# Encode the style prompt with T5
prompt = "girl's voice, very young and cute"
prompt_guidance_scale = 3.0
text_batch = tokenizer(prompt, max_length=32,
                       padding='max_length', truncation=True,
                       return_tensors="pt")
text_ids = text_batch.input_ids.to(device)
text_mask = text_batch.attention_mask.to(device)
with torch.no_grad():
    text_emb = text_encoder(input_ids=text_ids, attention_mask=text_mask)[0]

# Sample a target speaker embedding from the text prompt
target_embedding = dreamvg.inference([text_emb, text_mask],
                                     guidance_scale=prompt_guidance_scale,
                                     guidance_rescale=0.0,
                                     ddim_steps=100, eta=1,
                                     random_seed=None)

# Load the source utterance, convert to a tensor, and pad
audio, sr = librosa.load('segment_1.mp3', sr=16000)
audio = torch.from_numpy(audio).unsqueeze(0).float().to(device)
audio = F.pad(audio, (40, 40))

with torch.no_grad():
    # Extract content features using WavLM
    c = cmodel(audio).last_hidden_state.transpose(1, 2)
    # Synthesize with FreeVC, conditioned on the generated speaker embedding
    converted = freevc.infer(c, g=target_embedding)

audio_out = converted[0][0].data.cpu().float().numpy()
sf.write('freevc_out.wav', audio_out, sr)
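
# ------------------------------------------------------------------
# Optional: condition on a reference recording instead of a text prompt.
# A minimal, untested sketch assuming the speaker encoder loaded above
# exposes `embed_utterance` (as in FreeVC) and that a 16 kHz reference
# file 'reference.wav' (hypothetical path) exists. Uncomment to try.
# ------------------------------------------------------------------
# ref_wav, _ = librosa.load('reference.wav', sr=16000)
# g_ref = smodel.embed_utterance(ref_wav)                  # numpy speaker embedding
# g_ref = torch.from_numpy(g_ref).unsqueeze(0).to(device)  # shape (1, 256)
# with torch.no_grad():
#     ref_out = freevc.infer(c, g=g_ref)                   # reuse the WavLM content `c`
# sf.write('freevc_ref_out.wav', ref_out[0][0].cpu().float().numpy(), sr)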