Spaces:
Sleeping
Sleeping
File size: 1,342 Bytes
357cae7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
# %%
import time

import numpy as np
import torch
from IPython.display import Audio
from IPython.display import display
from scipy.io.wavfile import write
# from transformers import pipeline
from transformers import AutoProcessor
from transformers import SeamlessM4Tv2Model
# Text-to-speech translation demo: load a SeamlessM4T v2 checkpoint, translate
# one English sentence into Portuguese speech and show the resulting audio.

# Checkpoint shared by the processor and the model.
model_name = "facebook/seamless-m4t-v2-large"
# model_name = "facebook/hf-seamless-m4t-medium"
processor = AutoProcessor.from_pretrained(model_name)
model = SeamlessM4Tv2Model.from_pretrained(model_name)

# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model.to(device)

# Start of the timed span (input preprocessing + generation); the elapsed time
# is printed right after generation. When the cells are run one by one, the
# figure also includes any idle time between them.
start_time = time.time()
src_lang = "eng"
tgt_lang = "por"
# NOTE(review): "beautifull" is misspelled; left untouched because this string
# is the exact text fed to the model.
text_to_translate = "My life is a beautifull thing"
text_inputs = processor(
    text=text_to_translate, src_lang=src_lang, return_tensors="pt"
).to(device)
# Text-only translation variant, kept for reference:
# output_tokens = model.generate(
# **text_inputs, tgt_lang=tgt_lang, generate_speech=False)
# translated_text_from_text = processor.decode(
# output_tokens[0].tolist()[0], skip_special_tokens=True)
# %%
print(text_inputs)
# %%
# Take the first element of the generate() output, copy it to host memory as a
# NumPy array and drop singleton dimensions.
audio_array_from_text = (
    model.generate(**text_inputs, tgt_lang=tgt_lang)[0].cpu().numpy().squeeze()
)
# Report the timing that start_time was recorded for.
elapsed_seconds = time.time() - start_time
print(f"Translation took {elapsed_seconds:.2f} s")
# %%
print(audio_array_from_text)
# %%
a = Audio(audio_array_from_text, rate=model.config.sampling_rate)
# display() renders the audio player in a notebook/interactive cell; print()
# would only show the object's repr. In a plain interpreter display() still
# falls back to printing the repr.
display(a)
# %%
|