import os

import gradio as gr
import numpy as np
import spaces
import torch
import torchaudio
from generator import Segment, load_csm_1b
from huggingface_hub import hf_hub_download, login
from watermarking import watermark

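# Configuration from environment variables. HF_TOKEN authenticates Hub access,
# GPU_TIMEOUT is the per-call GPU budget in seconds, and WATERMARK_KEY (required)
# is a space-separated list of integers, e.g. "123 456 789" (placeholder values).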
api_key = os.getenv("HF_TOKEN")
gpu_timeout = int(os.getenv("GPU_TIMEOUT", 60))
CSM_1B_HF_WATERMARK = list(map(int, os.getenv("WATERMARK_KEY").split(" ")))

login(token=api_key)

SPACE_INTRO_TEXT = """\
# Sesame CSM 1B

Generate conversational speech with CSM 1B (Conversational Speech Model).
Code is available on GitHub: [SesameAILabs/csm](https://github.com/SesameAILabs/csm).
The checkpoint is [hosted on Hugging Face](https://huggingface.co/sesame/csm-1b).

Try out our interactive demo at [sesame.com/voicedemo](https://www.sesame.com/voicedemo),
which uses a fine-tuned variant of CSM.

The model has some capacity for non-English languages due to data contamination in the
training data, but it is unlikely to perform well in them.

---

"""

CONVO_INTRO_TEXT = """\
## Conversation content

Each line is an utterance in the conversation to generate. Speakers alternate between A and B, starting with speaker A.
"""

DEFAULT_CONVERSATION = """\
Hey how are you doing.
Pretty good, pretty good.
I'm great, so happy to be speaking to you.
Me too, this is some cool stuff huh?
Yeah, I've been reading more about speech generation, and it really seems like context is important.
Definitely.
"""

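# Prompt transcripts are verbatim (disfluencies included) so the text matches the audio.
# To add a preset, append an entry here, e.g. (hypothetical files):
#     "my_speaker": {"text": "transcript of the clip", "audio": "prompts/my_speaker.wav"},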
SPEAKER_PROMPTS = {
    "conversational_a": {
        "text": (
            "like revising for an exam I'd have to try and like keep up the momentum because I'd "
            "start really early I'd be like okay I'm gonna start revising now and then like "
            "you're revising for ages and then I just like start losing steam I didn't do that "
            "for the exam we had recently to be fair that was a more of a last minute scenario "
            "but like yeah I'm trying to like yeah I noticed this yesterday that like Mondays I "
            "sort of start the day with this not like a panic but like a"
        ),
        "audio": "prompts/conversational_a.wav",
    },
    "conversational_b": {
        "text": (
            "like a super Mario level. Like it's very like high detail. And like, once you get "
            "into the park, it just like, everything looks like a computer game and they have all "
            "these, like, you know, if, if there's like a, you know, like in a Mario game, they "
            "will have like a question block. And if you like, you know, punch it, a coin will "
            "come out. So like everyone, when they come into the park, they get like this little "
            "bracelet and then you can go punching question blocks around."
        ),
        "audio": "prompts/conversational_b.wav",
    },
    "read_speech_a": {
        "text": (
            "And Lake turned round upon me, a little abruptly, his odd yellowish eyes, a little "
            "like those of the sea eagle, and the ghost of his smile that flickered on his "
            "singularly pale face, with a stern and insidious look, confronted me."
        ),
        "audio": "prompts/read_speech_a.wav",
    },
    "read_speech_b": {
        "text": (
            "He was such a big boy that he wore high boots and carried a jack knife. He gazed and "
            "gazed at the cap, and could not keep from fingering the blue tassel."
        ),
        "audio": "prompts/read_speech_b.wav",
    },
    "read_speech_c": {
        "text": (
            "All passed so quickly, there was so much going on around him, the Tree quite forgot "
            "to look to himself."
        ),
        "audio": "prompts/read_speech_c.wav",
    },
    "read_speech_d": {
        "text": (
            "Suddenly I was back in the old days Before you felt we ought to drift apart. It was "
            "some trick-the way your eyebrows raise."
        ),
        "audio": "prompts/read_speech_d.wav",
    },
}

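# Download the CSM 1B checkpoint from the Hub and load it on GPU when available.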
device = "cuda" if torch.cuda.is_available() else "cpu"
model_path = hf_hub_download(repo_id="sesame/csm-1b", filename="ckpt.pt")
generator = load_csm_1b(model_path, device)


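# On ZeroGPU Spaces, `spaces.GPU` allocates a GPU for at most `gpu_timeout` seconds per call.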
@spaces.GPU(duration=gpu_timeout)
def infer(
    text_prompt_speaker_a,
    text_prompt_speaker_b,
    audio_prompt_speaker_a,
    audio_prompt_speaker_b,
    gen_conversation_input,
) -> tuple[int, np.ndarray]:
    # Keep total text under the cap so generation fits in the GPU time budget.
    if len(gen_conversation_input.strip() + text_prompt_speaker_a.strip() + text_prompt_speaker_b.strip()) >= 2000:
        raise gr.Error("Prompts and conversation too long.", duration=30)

    try:
        return _infer(
            text_prompt_speaker_a,
            text_prompt_speaker_b,
            audio_prompt_speaker_a,
            audio_prompt_speaker_b,
            gen_conversation_input,
        )
    except ValueError as e:
        raise gr.Error(f"Error generating audio: {e}", duration=120)


def _infer(
    text_prompt_speaker_a,
    text_prompt_speaker_b,
    audio_prompt_speaker_a,
    audio_prompt_speaker_b,
    gen_conversation_input,
) -> tuple[int, np.ndarray]:
    audio_prompt_a = prepare_prompt(text_prompt_speaker_a, 0, audio_prompt_speaker_a)
    audio_prompt_b = prepare_prompt(text_prompt_speaker_b, 1, audio_prompt_speaker_b)

    prompt_segments: list[Segment] = [audio_prompt_a, audio_prompt_b]
    generated_segments: list[Segment] = []

    conversation_lines = [line.strip() for line in gen_conversation_input.strip().split("\n") if line.strip()]
    for i, line in enumerate(conversation_lines):
        # Speakers alternate line by line, starting with speaker A (id 0).
        speaker_id = i % 2

        # Condition each utterance on the voice prompts plus everything generated so far.
        audio_tensor = generator.generate(
            text=line,
            speaker=speaker_id,
            context=prompt_segments + generated_segments,
            max_audio_length_ms=30_000,
        )
        generated_segments.append(Segment(text=line, speaker=speaker_id, audio=audio_tensor))

    # Concatenate all utterances into a single waveform.
    audio_tensors = [segment.audio for segment in generated_segments]
    audio_tensor = torch.cat(audio_tensors, dim=0)

    # Watermark the audio, then resample back to the generator's sample rate.
    audio_tensor, wm_sample_rate = watermark(
        generator._watermarker, audio_tensor, generator.sample_rate, CSM_1B_HF_WATERMARK
    )
    audio_tensor = torchaudio.functional.resample(
        audio_tensor, orig_freq=wm_sample_rate, new_freq=generator.sample_rate
    )

    # Scale the float waveform in [-1, 1] to 16-bit PCM for Gradio.
    audio_array = (audio_tensor * 32768).to(torch.int16).cpu().numpy()

    return generator.sample_rate, audio_array


def prepare_prompt(text: str, speaker: int, audio_path: str) -> Segment:
    audio_tensor, _ = load_prompt_audio(audio_path)
    return Segment(text=text, speaker=speaker, audio=audio_tensor)


def load_prompt_audio(audio_path: str) -> tuple[torch.Tensor, int]:
    audio_tensor, sample_rate = torchaudio.load(audio_path)
    audio_tensor = audio_tensor.squeeze(0)
    # Resample the prompt to the generator's sample rate if needed.
    if sample_rate != generator.sample_rate:
        audio_tensor = torchaudio.functional.resample(
            audio_tensor, orig_freq=sample_rate, new_freq=generator.sample_rate
        )
    return audio_tensor, generator.sample_rate


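# Builds one speaker column: a dropdown of preset voices plus an accordion for a custom text/audio prompt.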
def create_speaker_prompt_ui(speaker_name: str):
    speaker_dropdown = gr.Dropdown(
        choices=list(SPEAKER_PROMPTS.keys()), label="Select a predefined speaker", value=speaker_name
    )
    with gr.Accordion("Or add your own voice prompt", open=False):
        text_prompt_speaker = gr.Textbox(label="Speaker prompt", lines=4, value=SPEAKER_PROMPTS[speaker_name]["text"])
        audio_prompt_speaker = gr.Audio(
            label="Speaker prompt", type="filepath", value=SPEAKER_PROMPTS[speaker_name]["audio"]
        )

    return speaker_dropdown, text_prompt_speaker, audio_prompt_speaker


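# Assemble the Gradio UI.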
with gr.Blocks() as app:
    gr.Markdown(SPACE_INTRO_TEXT)
    gr.Markdown("## Voices")
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Speaker A")
            speaker_a_dropdown, text_prompt_speaker_a, audio_prompt_speaker_a = create_speaker_prompt_ui(
                "conversational_a"
            )

        with gr.Column():
            gr.Markdown("### Speaker B")
            speaker_b_dropdown, text_prompt_speaker_b, audio_prompt_speaker_b = create_speaker_prompt_ui(
                "conversational_b"
            )

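    # Keep each speaker's text and audio prompt fields in sync with its dropdown.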
    def update_audio(speaker):
        if speaker in SPEAKER_PROMPTS:
            return SPEAKER_PROMPTS[speaker]["audio"]
        return None

    def update_text(speaker):
        if speaker in SPEAKER_PROMPTS:
            return SPEAKER_PROMPTS[speaker]["text"]
        return None

    speaker_a_dropdown.change(fn=update_audio, inputs=[speaker_a_dropdown], outputs=[audio_prompt_speaker_a])
    speaker_b_dropdown.change(fn=update_audio, inputs=[speaker_b_dropdown], outputs=[audio_prompt_speaker_b])

    speaker_a_dropdown.change(fn=update_text, inputs=[speaker_a_dropdown], outputs=[text_prompt_speaker_a])
    speaker_b_dropdown.change(fn=update_text, inputs=[speaker_b_dropdown], outputs=[text_prompt_speaker_b])

    gr.Markdown(CONVO_INTRO_TEXT)

    gen_conversation_input = gr.TextArea(label="conversation", lines=20, value=DEFAULT_CONVERSATION)
    generate_btn = gr.Button("Generate conversation", variant="primary")
    gr.Markdown("GPU time is limited to 3 minutes; duplicate this Space for longer usage.")
    audio_output = gr.Audio(label="Synthesized audio")

    generate_btn.click(
        infer,
        inputs=[
            text_prompt_speaker_a,
            text_prompt_speaker_b,
            audio_prompt_speaker_a,
            audio_prompt_speaker_b,
            gen_conversation_input,
        ],
        outputs=[audio_output],
    )

app.launch(ssr_mode=True)