Update app.py
Browse files
app.py
CHANGED
@@ -1,10 +1,12 @@
|
|
|
|
1 |
import gradio as gr
|
2 |
import torch
|
3 |
import soundfile as sf
|
4 |
from snac import SNAC
|
5 |
from transformers import AutoTokenizer, AutoModelForCausalLM
|
6 |
|
7 |
-
|
|
|
8 |
|
9 |
def find_last_instance_of_separator(lst, element=50258):
|
10 |
reversed_list = lst[::-1]
|
@@ -55,7 +57,7 @@ def reconstruct_tensors(flattened_output):
|
|
55 |
tensor2.append(flattened_output[i+5])
|
56 |
tensor3.append(flattened_output[i+6])
|
57 |
tensor3.append(flattened_output[i+7])
|
58 |
-
codes = [list_to_torch_tensor(tensor1), list_to_torch_tensor(tensor2), list_to_torch_tensor(tensor3)]
|
59 |
|
60 |
if n_tensors == 15:
|
61 |
for i in range(0, len(flattened_output), 16):
|
@@ -74,20 +76,20 @@ def reconstruct_tensors(flattened_output):
|
|
74 |
tensor3.append(flattened_output[i+13])
|
75 |
tensor4.append(flattened_output[i+14])
|
76 |
tensor4.append(flattened_output[i+15])
|
77 |
-
codes = [list_to_torch_tensor(tensor1), list_to_torch_tensor(tensor2), list_to_torch_tensor(tensor3), list_to_torch_tensor(tensor4)]
|
78 |
|
79 |
return codes
|
80 |
|
81 |
def load_model():
|
82 |
tokenizer = AutoTokenizer.from_pretrained("Lwasinam/voicera-jenny-finetune")
|
83 |
model = AutoModelForCausalLM.from_pretrained("Lwasinam/voicera-jenny-finetune").to(device)
|
84 |
-
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval()
|
85 |
return model, tokenizer, snac_model
|
86 |
|
87 |
def SpeechDecoder(codes, snac_model):
|
88 |
codes = codes.squeeze(0).tolist()
|
89 |
reconstructed_codes = reconstruct_tensors(codes)
|
90 |
-
audio_hat = snac_model.
|
91 |
audio_path = "reconstructed_audio.wav"
|
92 |
sf.write(audio_path, audio_hat.squeeze().cpu().detach().numpy(), 24000)
|
93 |
return audio_path
|
@@ -117,4 +119,4 @@ iface = gr.Interface(
|
|
117 |
)
|
118 |
|
119 |
if __name__ == "__main__":
|
120 |
-
iface.launch(
|
|
|
# NOTE(fix): the original line here was `!pip install nvidia-ml-py3`, which is
# IPython/Jupyter magic syntax — in a plain .py script it is a SyntaxError and
# prevents the whole module from even being parsed. Install the dependency
# out-of-band instead (e.g. `pip install nvidia-ml-py3` or requirements.txt).
import gradio as gr
import torch
import soundfile as sf
from snac import SNAC
from transformers import AutoTokenizer, AutoModelForCausalLM

# Select the compute device once at import time: prefer an NVIDIA GPU when
# available, otherwise fall back to CPU. All models/tensors below use this.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
10 |
|
11 |
def find_last_instance_of_separator(lst, element=50258):
|
12 |
reversed_list = lst[::-1]
|
|
|
57 |
tensor2.append(flattened_output[i+5])
|
58 |
tensor3.append(flattened_output[i+6])
|
59 |
tensor3.append(flattened_output[i+7])
|
60 |
+
codes = [list_to_torch_tensor(tensor1).to(device), list_to_torch_tensor(tensor2).to(device), list_to_torch_tensor(tensor3).to(device)]
|
61 |
|
62 |
if n_tensors == 15:
|
63 |
for i in range(0, len(flattened_output), 16):
|
|
|
76 |
tensor3.append(flattened_output[i+13])
|
77 |
tensor4.append(flattened_output[i+14])
|
78 |
tensor4.append(flattened_output[i+15])
|
79 |
+
codes = [list_to_torch_tensor(tensor1).to(device), list_to_torch_tensor(tensor2).to(device), list_to_torch_tensor(tensor3).to(device), list_to_torch_tensor(tensor4).to(device)]
|
80 |
|
81 |
return codes
|
82 |
|
83 |
def load_model():
    """Load the Voicera TTS stack.

    Returns:
        tuple: (model, tokenizer, snac_model) where `model` is the causal-LM
        speech-token generator, `tokenizer` its text tokenizer, and
        `snac_model` the SNAC 24 kHz codec in eval mode. Model and codec are
        moved to the module-level `device`.
    """
    voicera_id = "Lwasinam/voicera-jenny-finetune"
    tokenizer = AutoTokenizer.from_pretrained(voicera_id)
    model = AutoModelForCausalLM.from_pretrained(voicera_id).to(device)
    snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval().to(device)
    return model, tokenizer, snac_model
|
88 |
|
89 |
def SpeechDecoder(codes, snac_model):
    """Turn a flat tensor of generated SNAC codes into an audio file.

    Args:
        codes: tensor of speech tokens, shape (1, T) — squeezed to a flat list.
        snac_model: SNAC codec used to decode the hierarchical code tensors.

    Returns:
        str: path of the WAV file written at 24 kHz.
    """
    flat_codes = codes.squeeze(0).tolist()
    # Re-group the flat token stream into SNAC's hierarchical code tensors.
    hierarchical_codes = reconstruct_tensors(flat_codes)
    waveform = snac_model.decode(hierarchical_codes)

    out_path = "reconstructed_audio.wav"
    samples = waveform.squeeze().cpu().detach().numpy()
    sf.write(out_path, samples, 24000)
    return out_path
|
|
|
119 |
)
|
120 |
|
121 |
# Script entry point: start the Gradio web UI (iface is defined above).
if __name__ == "__main__":
    iface.launch()
|