import torch
import spaces
from transformers import AutoTokenizer, AutoModelForCausalLM
import gradio as gr
from snac import SNAC
def redistribute_codes(row):
    """
    Convert a flat sequence of generated token ids into an audio waveform
    using SNAC. Each group of 7 consecutive tokens encodes one audio frame.
    """
    row_length = row.size(0)
    # Trim to a whole number of 7-token frames.
    new_length = (row_length // 7) * 7
    trimmed_row = row[:new_length]
    # Shift token ids down into the SNAC code range.
    code_list = [t - 128266 for t in trimmed_row]
    layer_1, layer_2, layer_3 = [], [], []
    for i in range(len(code_list) // 7):
        layer_1.append(code_list[7 * i][None])
        layer_2.append(code_list[7 * i + 1][None] - 4096)
        layer_3.append(code_list[7 * i + 2][None] - (2 * 4096))
        layer_3.append(code_list[7 * i + 3][None] - (3 * 4096))
        layer_2.append(code_list[7 * i + 4][None] - (4 * 4096))
        layer_3.append(code_list[7 * i + 5][None] - (5 * 4096))
        layer_3.append(code_list[7 * i + 6][None] - (6 * 4096))
    with torch.no_grad():
        codes = [
            torch.concat(layer_1),
            torch.concat(layer_2),
            torch.concat(layer_3)
        ]
        # Clamp any stray negative codes to 0 and add a batch dimension.
        for i in range(len(codes)):
            codes[i][codes[i] < 0] = 0
            codes[i] = codes[i][None]
        audio_hat = snac_model.decode(codes)
    return audio_hat.cpu()[0, 0]
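
# A minimal sketch of the 7-token frame layout redistribute_codes assumes,
# inferred from the offsets above (an Orpheus-style interleaving):
#
#   frame = [c0, c1, c2, c3, c4, c5, c6]
#   layer_1 <- c0               # coarsest codebook: 1 code per frame
#   layer_2 <- c1, c4           # middle codebook:   2 codes per frame
#   layer_3 <- c2, c3, c5, c6   # finest codebook:   4 codes per frame
#
# Each position appears to be offset by (position * 4096) during encoding so
# the model can distinguish positions; the subtractions above undo that offset.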
# Load the SNAC model (shared by all)
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to("cuda")
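# SNAC decodes a list of three code tensors (one per temporal resolution) into
# a waveform; the snac_24khz checkpoint produces 24 kHz audio, which is why
# generate_audio below returns a sample rate of 24000.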
# Load all the single-speaker language models
models = {
    "Luna": {
        "tokenizer": AutoTokenizer.from_pretrained('prithivMLmods/Llama-3B-Mono-Luna'),
        "model": AutoModelForCausalLM.from_pretrained('prithivMLmods/Llama-3B-Mono-Luna', torch_dtype=torch.bfloat16).cuda()
    },
    "Ceylia": {
        "tokenizer": AutoTokenizer.from_pretrained('prithivMLmods/Llama-3B-Mono-Ceylia'),
        "model": AutoModelForCausalLM.from_pretrained('prithivMLmods/Llama-3B-Mono-Ceylia', torch_dtype=torch.bfloat16).cuda()
    },
    "Cooper": {
        "tokenizer": AutoTokenizer.from_pretrained('prithivMLmods/Llama-3B-Mono-Cooper'),
        "model": AutoModelForCausalLM.from_pretrained('prithivMLmods/Llama-3B-Mono-Cooper', torch_dtype=torch.bfloat16).cuda()
    },
    "Jim": {
        "tokenizer": AutoTokenizer.from_pretrained('prithivMLmods/Llama-3B-Mono-Jim'),
        "model": AutoModelForCausalLM.from_pretrained('prithivMLmods/Llama-3B-Mono-Jim', torch_dtype=torch.bfloat16).cuda()
    },
}
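
# Rough memory note (an estimate, not measured): four 3B-parameter models in
# bfloat16 hold about 4 * 3e9 params * 2 bytes ≈ 24 GB of weights on the GPU,
# plus activation memory during generation. On smaller cards, loading only the
# selected model on demand would be a reasonable alternative.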
# On ZeroGPU Spaces, @spaces.GPU attaches a GPU only for the duration of each call.
@spaces.GPU
def generate_audio(text, temperature, top_p, max_new_tokens, model_name):
    """
    Generate speech audio for the given text with the chosen model.
    Returns a (sample_rate, waveform) tuple suitable for gr.Audio.
    """
    # Retrieve the chosen tokenizer and model.
    chosen = models[model_name]
    tokenizer = chosen["tokenizer"]
    model = chosen["model"]
    # Wrap the text in the special tokens these checkpoints were trained with.
    prompt = f'<custom_token_3><|begin_of_text|>{text}<|eot_id|><custom_token_4><custom_token_5><custom_token_1>'
    input_ids = tokenizer(prompt, add_special_tokens=False, return_tensors='pt').to('cuda')
    with torch.no_grad():
        generated_ids = model.generate(
            **input_ids,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=1.1,
            num_return_sequences=1,
            eos_token_id=128258,  # end-of-speech token id for these models
        )
    # Keep only the newly generated audio tokens (drop the prompt).
    row = generated_ids[0, input_ids['input_ids'].shape[1]:]
    y_tensor = redistribute_codes(row)
    y_np = y_tensor.detach().cpu().numpy()
    return (24000, y_np)
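
# Hypothetical direct call (outside the Gradio UI), to show the contract:
#
#   sr, audio = generate_audio("Hello there! <laugh>", temperature=0.9,
#                              top_p=0.8, max_new_tokens=1200, model_name="Luna")
#   # sr == 24000; audio is a 1-D float numpy array suitable for gr.Audio.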
# Example texts with emotion tokens
example_texts = [
    ["Hi, my name is Alex. <laugh> It's a wonderful day! <chuckle> I love coding."],
    ["I woke up feeling sleepy. <yawn> I need coffee! <sniffle> But I'm ready to work."],
    ["Oh no, I forgot my keys! <groan> <uhm> Maybe I'll try again later. <sigh>"],
    ["This is amazing! <gasp> Really, it's fantastic. <giggles>"]
]
# Gradio Interface
with gr.Blocks() as demo:
    # Sidebar for model selection
    with gr.Sidebar():
        gr.Markdown("# Choose Model")
        model_choice = gr.Dropdown(choices=list(models.keys()), value="Luna", label="Model")

    gr.Markdown("# Single Speaker Audio Generation")
    gr.Markdown(
        "Generate speech audio using one of the single-speaker models. Use the examples below "
        "to see how emotion tokens like `<laugh>`, `<chuckle>`, `<sigh>`, etc. can be incorporated."
    )
    with gr.Row():
        text_input = gr.Textbox(lines=4, label="Input Text")
    # Examples with emotion tokens
    gr.Examples(
        examples=example_texts,
        inputs=text_input,
        label="Emotion Examples",
        cache_examples=False
    )
    with gr.Row():
        temp_slider = gr.Slider(minimum=0.1, maximum=2.0, step=0.1, value=0.9, label="Temperature")
        top_p_slider = gr.Slider(minimum=0.1, maximum=1.0, step=0.05, value=0.8, label="Top-p")
        tokens_slider = gr.Slider(minimum=100, maximum=3500, step=50, value=1200, label="Max New Tokens")
    output_audio = gr.Audio(type="numpy", label="Generated Audio")
    generate_button = gr.Button("Generate Audio")

    # Pass the selected model name along with the generation parameters.
    generate_button.click(
        fn=generate_audio,
        inputs=[text_input, temp_slider, top_p_slider, tokens_slider, model_choice],
        outputs=output_audio
    )

if __name__ == "__main__":
    demo.launch()