# %%
import time
from IPython.display import Audio
import numpy as np
from scipy.io.wavfile import write
from IPython.display import Audio

import torch
# from transformers import pipeline

from transformers import SeamlessM4Tv2Model
from transformers import AutoProcessor

# Hugging Face checkpoint to load; the commented line is an alternative
# checkpoint that can be swapped in.
model_name = "facebook/seamless-m4t-v2-large"
# model_name = "facebook/hf-seamless-m4t-medium"

# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
# torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# The processor and the model are both loaded from the same checkpoint name.
processor = AutoProcessor.from_pretrained(model_name)
model = SeamlessM4Tv2Model.from_pretrained(model_name)

# Move the model onto the selected device before any generation call.
model.to(device)

# Wall-clock reference taken before preprocessing; none of the cells visible
# here read it back.
start_time = time.time()

# Language codes passed to the processor (source) and to generate() (target).
src_lang = "eng"
tgt_lang = "por"

# Sentence to translate. The spelling is corrected ("beautiful") so the model
# is not fed a misspelled word.
text_to_translate = "My life is a beautiful thing"

# Tokenize the sentence into PyTorch tensors and move them to the same device
# as the model.
text_inputs = processor(
    text=text_to_translate,
    src_lang=src_lang,
    return_tensors="pt",
).to(device)

# Text-to-text variant (no speech synthesis), kept for reference:
# output_tokens = model.generate(
#     **text_inputs, tgt_lang=tgt_lang, generate_speech=False)

# translated_text_from_text = processor.decode(
#     output_tokens[0].tolist()[0], skip_special_tokens=True)

# %%
# Inspect the processor output that will be unpacked into generate().
print(text_inputs)

# %%
# Generate translated speech for the prepared inputs in the target language.
generated = model.generate(**text_inputs, tgt_lang=tgt_lang)

# Take the first element of the result (presumably the waveform tensor --
# confirm against the generate() docs), copy it to host memory, and drop
# singleton dimensions to get a plain NumPy array.
waveform = generated[0]
audio_array_from_text = waveform.cpu().numpy().squeeze()

# %%
# Dump the raw array for a quick sanity check.
print(audio_array_from_text)

# %%

# Imported here so this cell works on its own; the top-of-file imports only
# bring in Audio.
from IPython.display import display

# Wrap the generated array in an IPython audio widget, using the sampling
# rate stored in the model config.
a = Audio(audio_array_from_text, rate=model.config.sampling_rate)

# print(a) would only write the object's repr; display() renders the audio
# player in a notebook / interactive window.
display(a)

# %%