higgs_audio_v2

Running

App Files Files Community

mpasila committed on Jul 23

Commit

d719f2b

verified ·

1 Parent(s): fca1c74

Upload app.py

Browse files

Files changed (1) hide show

app.py +187 -74

app.py CHANGED Viewed

@@ -1,7 +1,5 @@
 """
 Gradio UI for Text-to-Speech using HiggsAudioServeEngine
-Adapted: Now compatible with Jupyter, Colab, Runpod, etc,
-by adding launch_notebook() and flexible path/context handling.
 """
 import argparse
@@ -18,25 +16,12 @@ from functools import lru_cache
 import re
 import torch
-# --- Safe import or stub for 'spaces' (for Huggingface Space only) ---
-try:
-    import spaces
-except ImportError:
-    class DummySpaces:
-        def __getattr__(self, name):  # any decorator
-            return lambda *a, **k: (lambda f: f)
-    spaces = DummySpaces()
 # Import HiggsAudio components
 from higgs_audio.serve.serve_engine import HiggsAudioServeEngine
 from higgs_audio.data_types import ChatMLSample, AudioContent, Message
-# --- Add this for Colab/notebook path safety ---
-BASE_DIR = os.path.dirname(os.path.abspath(__file__)) if "__file__" in globals() else os.getcwd()
-# Global engine/voice instance
 engine = None
-VOICE_PRESETS = {}
 # Default model configuration
 DEFAULT_MODEL_PATH = "bosonai/higgs-audio-v2-generation-3B-base"
@@ -52,17 +37,62 @@ DEFAULT_SYSTEM_PROMPT = (
 DEFAULT_STOP_STRINGS = ["<|end_of_text|>", "<|eot_id|>"]
-# ... PREDEFINED_EXAMPLES as before ...
-# (copy unchanged; omitted for brevity in this answer but use your full PREDEFINED_EXAMPLES dictionary)
 PREDEFINED_EXAMPLES = {
-    # ... Same as your long dict above ...
-    # (copy full version from original)
-    # (you can copy exactly as in your current app.py)
 }
-# -- The rest of your code, but replacing path joins to use BASE_DIR instead of __file__! ---
 @lru_cache(maxsize=20)
 def encode_audio_file(file_path):
@@ -70,15 +100,17 @@ def encode_audio_file(file_path):
     with open(file_path, "rb") as audio_file:
         return base64.b64encode(audio_file.read()).decode("utf-8")
 def get_current_device():
     """Get the current device."""
     return "cuda" if torch.cuda.is_available() else "cpu"
 def load_voice_presets():
     """Load the voice presets from the voice_examples directory."""
     try:
         with open(
-            os.path.join(BASE_DIR, "voice_examples", "config.json"),
             "r",
         ) as f:
             voice_dict = json.load(f)
@@ -93,9 +125,10 @@ def load_voice_presets():
         logger.error(f"Error loading voice presets: {e}")
         return {"EMPTY": "No reference voice"}
 def get_voice_preset(voice_preset):
     """Get the voice path and text for a given voice preset."""
-    voice_path = os.path.join(BASE_DIR, "voice_examples", f"{voice_preset}.wav")
     if not os.path.exists(voice_path):
         logger.warning(f"Voice preset file not found: {voice_path}")
         return None, "Voice preset not found"
@@ -103,24 +136,54 @@ def get_voice_preset(voice_preset):
     text = VOICE_PRESETS.get(voice_preset, "No transcript available")
     return voice_path, text
-# -- rest of your normalization and utility code unchanged --
 def normalize_chinese_punctuation(text):
-    # ... as before ...
     chinese_to_english_punct = {
-        # ... as before ...
     }
     for zh_punct, en_punct in chinese_to_english_punct.items():
         text = text.replace(zh_punct, en_punct)
     return text
 def normalize_text(transcript: str):
-    # ... as before, unchanged ...
     transcript = normalize_chinese_punctuation(transcript)
     transcript = transcript.replace("(", " ")
     transcript = transcript.replace(")", " ")
     transcript = transcript.replace("°F", " degrees Fahrenheit")
     transcript = transcript.replace("°C", " degrees Celsius")
     for tag, replacement in [
         ("[laugh]", "<SE>[Laughter]</SE>"),
         ("[humming start]", "<SE>[Humming]</SE>"),
@@ -135,15 +198,17 @@ def normalize_text(transcript: str):
         ("[cough]", "<SE>[Cough]</SE>"),
     ]:
         transcript = transcript.replace(tag, replacement)
-    # ... rest unchanged ...
     lines = transcript.split("\n")
     transcript = "\n".join([" ".join(line.split()) for line in lines if line.strip()])
     transcript = transcript.strip()
     if not any([transcript.endswith(c) for c in [".", "!", "?", ",", ";", '"', "'", "</SE_e>", "</SE>"]]):
         transcript += "."
     return transcript
-@spaces.GPU
 def initialize_engine(model_path, audio_tokenizer_path) -> bool:
     """Initialize the HiggsAudioServeEngine."""
     global engine
@@ -160,14 +225,19 @@ def initialize_engine(model_path, audio_tokenizer_path) -> bool:
         logger.error(f"Failed to initialize engine: {e}")
         return False
 def check_return_audio(audio_wv: np.ndarray):
     if np.all(audio_wv == 0):
         logger.warning("Audio is silent, returning None")
 def process_text_output(text_output: str):
     text_output = re.sub(r"(<\|AUDIO_OUT\|>)+", r"<|AUDIO_OUT|>", text_output)
     return text_output
 def prepare_chatml_sample(
     voice_preset: str,
     text: str,
@@ -175,29 +245,45 @@ def prepare_chatml_sample(
     reference_text: Optional[str] = None,
     system_prompt: str = DEFAULT_SYSTEM_PROMPT,
 ):
     messages = []
     if len(system_prompt) > 0:
         messages.append(Message(role="system", content=system_prompt))
     audio_base64 = None
     ref_text = ""
     if reference_audio:
         audio_base64 = encode_audio_file(reference_audio)
         ref_text = reference_text or ""
     elif voice_preset != "EMPTY":
         voice_path, ref_text = get_voice_preset(voice_preset)
         if voice_path is None:
             logger.warning(f"Voice preset {voice_preset} not found, skipping reference audio")
         else:
             audio_base64 = encode_audio_file(voice_path)
     if audio_base64 is not None:
         messages.append(Message(role="user", content=ref_text))
         audio_content = AudioContent(raw_audio=audio_base64, audio_url="")
         messages.append(Message(role="assistant", content=[audio_content]))
     text = normalize_text(text)
     messages.append(Message(role="user", content=text))
     return ChatMLSample(messages=messages)
-@spaces.GPU(duration=120)
 def text_to_speech(
     text,
     voice_preset,
@@ -212,15 +298,22 @@ def text_to_speech(
     ras_win_len=7,
     ras_win_max_num_repeat=2,
 ):
     global engine
     if engine is None:
         initialize_engine(DEFAULT_MODEL_PATH, DEFAULT_AUDIO_TOKENIZER_PATH)
     try:
         chatml_sample = prepare_chatml_sample(voice_preset, text, reference_audio, reference_text, system_prompt)
         if stop_strings is None:
             stop_list = DEFAULT_STOP_STRINGS
         else:
             stop_list = [s for s in stop_strings["stops"] if s.strip()]
         request_id = f"tts-playground-{str(uuid.uuid4())}"
         logger.info(
             f"{request_id}: Generating speech for text: {text[:100]}..., \n"
@@ -228,6 +321,8 @@ def text_to_speech(
             f"ras_win_len={ras_win_len}, ras_win_max_num_repeat={ras_win_max_num_repeat}"
         )
         start_time = time.time()
         response = engine.generate(
             chat_ml_sample=chatml_sample,
             max_new_tokens=max_completion_tokens,
@@ -238,25 +333,34 @@ def text_to_speech(
             ras_win_len=ras_win_len if ras_win_len > 0 else None,
             ras_win_max_num_repeat=max(ras_win_len, ras_win_max_num_repeat),
         )
         generation_time = time.time() - start_time
         logger.info(f"{request_id}: Generated audio in {generation_time:.3f} seconds")
         gr.Info(f"Generated audio in {generation_time:.3f} seconds")
         text_output = process_text_output(response.generated_text)
         if response.audio is not None:
             audio_data = (response.audio * 32767).astype(np.int16)
             check_return_audio(audio_data)
             return text_output, (response.sampling_rate, audio_data)
         else:
             logger.warning("No audio generated")
             return text_output, None
     except Exception as e:
         error_msg = f"Error generating speech: {e}"
         logger.error(error_msg)
         gr.Error(error_msg)
         return f"❌ {error_msg}", None
 def create_ui():
-    my_theme = gr.Theme.load(os.path.join(BASE_DIR, "theme.json"))
     custom_css = """
     .gradio-container input:focus,
     .gradio-container textarea:focus,
@@ -272,6 +376,8 @@ def create_ui():
         outline: none !important;
         background-color: var(--input-background-fill) !important;
     }
     .gradio-container input:hover,
     .gradio-container textarea:hover,
     .gradio-container select:hover,
@@ -281,45 +387,59 @@ def create_ui():
         border-color: var(--border-color-primary) !important;
         background-color: var(--input-background-fill) !important;
     }
     .gradio-container input[type="checkbox"]:checked {
         background-color: var(--primary-500) !important;
         border-color: var(--primary-500) !important;
     }
     """
     default_template = "smart-voice"
     with gr.Blocks(theme=my_theme, css=custom_css) as demo:
         gr.Markdown("# Higgs Audio Text-to-Speech Playground")
         with gr.Row():
             with gr.Column(scale=2):
                 template_dropdown = gr.Dropdown(
                     label="TTS Template",
                     choices=list(PREDEFINED_EXAMPLES.keys()),
                     value=default_template,
                     info="Select a predefined example for system and input messages.",
                 )
                 template_description = gr.HTML(
                     value=f'<p style="font-size: 0.85em; color: var(--body-text-color-subdued); margin: 0; padding: 0;"> {PREDEFINED_EXAMPLES[default_template]["description"]}</p>',
                     visible=True,
                 )
                 system_prompt = gr.TextArea(
                     label="System Prompt",
                     placeholder="Enter system prompt to guide the model...",
                     value=PREDEFINED_EXAMPLES[default_template]["system_prompt"],
                     lines=2,
                 )
                 input_text = gr.TextArea(
                     label="Input Text",
                     placeholder="Type the text you want to convert to speech...",
                     value=PREDEFINED_EXAMPLES[default_template]["input_text"],
                     lines=5,
                 )
                 voice_preset = gr.Dropdown(
                     label="Voice Preset",
                     choices=list(VOICE_PRESETS.keys()),
                     value="EMPTY",
-                    interactive=False,
                     visible=False,
                 )
                 with gr.Accordion(
                     "Custom Reference (Optional)", open=False, visible=False
                 ) as custom_reference_accordion:
@@ -329,6 +449,7 @@ def create_ui():
                         placeholder="Enter the transcript of your reference audio...",
                         lines=3,
                     )
                 with gr.Accordion("Advanced Parameters", open=False):
                     max_completion_tokens = gr.Slider(
                         minimum=128,
@@ -362,6 +483,7 @@ def create_ui():
                         label="RAS Max Num Repeat",
                         info="Maximum number of repetitions allowed in the window",
                     )
                     stop_strings = gr.Dataframe(
                         label="Stop Strings",
                         headers=["stops"],
@@ -370,11 +492,18 @@ def create_ui():
                         interactive=True,
                         col_count=(1, "fixed"),
                     )
                 submit_btn = gr.Button("Generate Speech", variant="primary", scale=1)
             with gr.Column(scale=2):
                 output_text = gr.TextArea(label="Model Response", lines=2)
                 output_audio = gr.Audio(label="Generated Audio", interactive=False, autoplay=True)
                 stop_btn = gr.Button("Stop Playback", variant="primary")
         with gr.Row(visible=False) as voice_samples_section:
             voice_samples_table = gr.Dataframe(
                 headers=["Voice Preset", "Sample Text"],
@@ -384,8 +513,10 @@ def create_ui():
             )
             sample_audio = gr.Audio(label="Voice Sample")
         def play_voice_sample(evt: gr.SelectData):
             try:
                 preset_names = [preset for preset in VOICE_PRESETS.keys() if preset != "EMPTY"]
                 if evt.index[0] < len(preset_names):
                     preset = preset_names[evt.index[0]]
@@ -405,11 +536,14 @@ def create_ui():
         voice_samples_table.select(fn=play_voice_sample, outputs=[sample_audio])
         def apply_template(template_name):
             if template_name in PREDEFINED_EXAMPLES:
                 template = PREDEFINED_EXAMPLES[template_name]
                 is_voice_clone = template_name == "voice-clone"
                 voice_preset_value = "belinda" if is_voice_clone else "EMPTY"
                 ras_win_len_value = 0 if template_name == "single-speaker-bgm" else 7
                 description_text = f'<p style="font-size: 0.85em; color: var(--body-text-color-subdued); margin: 0; padding: 0;"> {template["description"]}</p>'
                 return (
@@ -418,10 +552,10 @@ def create_ui():
                     description_text,  # template_description
                     gr.update(
                         value=voice_preset_value, interactive=is_voice_clone, visible=is_voice_clone
-                    ),
-                    gr.update(visible=is_voice_clone),
-                    gr.update(visible=is_voice_clone),
-                    ras_win_len_value,
                 )
             else:
                 return (
@@ -432,8 +566,11 @@ def create_ui():
                     gr.update(),
                     gr.update(),
                     gr.update(),
-                )
         template_dropdown.change(
             fn=apply_template,
             inputs=[template_dropdown],
@@ -448,6 +585,7 @@ def create_ui():
             ],
         )
         submit_btn.click(
             fn=text_to_speech,
             inputs=[
@@ -467,50 +605,20 @@ def create_ui():
             outputs=[output_text, output_audio],
             api_name="generate_speech",
         )
         stop_btn.click(
             fn=lambda: None,
             inputs=[],
             outputs=[output_audio],
             js="() => {const audio = document.querySelector('audio'); if(audio) audio.pause(); return null;}",
         )
-    return demo
-# ------ NEW! Notebook/Colab/Runpod Launch Function ------
-def launch_notebook(
-    model_path=DEFAULT_MODEL_PATH,
-    audio_tokenizer_path=DEFAULT_AUDIO_TOKENIZER_PATH,
-    device=None,
-    host="127.0.0.1",
-    port=7860,
-    inline=True,
-    share=False,
-    **gradio_kwargs
-):
-    """
-    Launch the Gradio UI inside a notebook, Colab or script.
-    - If inline=True (default), embeds in cell (Jupyter/Colab/Runpod, etc).
-    - If share=True, Gradio will provide a public URL for the UI.
-    """
-    global VOICE_PRESETS
-    VOICE_PRESETS = load_voice_presets()
-    # Optionally initialize engine, or let it lazy init on first use
-    # initialize_engine(model_path, audio_tokenizer_path)
-    demo = create_ui()
-    # Note: You can also pass other gradio launch kwargs here if desired.
-    demo.launch(
-        server_name=host,
-        server_port=port,
-        inline=inline,
-        share=share,
-        **gradio_kwargs,
-    )
 def main():
-    """
-    Main function to parse arguments and launch the UI via CLI (notebooks should use launch_notebook()).
-    """
     global DEFAULT_MODEL_PATH, DEFAULT_AUDIO_TOKENIZER_PATH, VOICE_PRESETS
     parser = argparse.ArgumentParser(description="Gradio UI for Text-to-Speech using HiggsAudioServeEngine")
@@ -525,9 +633,14 @@ def main():
     parser.add_argument("--port", type=int, default=7860, help="Port for the Gradio interface.")
     args = parser.parse_args()
     VOICE_PRESETS = load_voice_presets()
     demo = create_ui()
     demo.launch(server_name=args.host, server_port=args.port)
 if __name__ == "__main__":
     main()

 """
 Gradio UI for Text-to-Speech using HiggsAudioServeEngine
 """
 import argparse
 import re
 import torch
 # Import HiggsAudio components
 from higgs_audio.serve.serve_engine import HiggsAudioServeEngine
 from higgs_audio.data_types import ChatMLSample, AudioContent, Message
+# Global engine instance
 engine = None
 # Default model configuration
 DEFAULT_MODEL_PATH = "bosonai/higgs-audio-v2-generation-3B-base"
 DEFAULT_STOP_STRINGS = ["<|end_of_text|>", "<|eot_id|>"]
+# Predefined examples for system and input messages
 PREDEFINED_EXAMPLES = {
+    "voice-clone": {
+        "system_prompt": "",
+        "input_text": "Hey there! I'm your friendly voice twin in the making. Pick a voice preset below or upload your own audio - let's clone some vocals and bring your voice to life! ",
+        "description": "Voice clone to clone the reference audio. Leave the system prompt empty.",
+    },
+    "smart-voice": {
+        "system_prompt": DEFAULT_SYSTEM_PROMPT,
+        "input_text": "The sun rises in the east and sets in the west. This simple fact has been observed by humans for thousands of years.",
+        "description": "Smart voice to generate speech based on the context",
+    },
+    "multispeaker-voice-description": {
+        "system_prompt": "You are an AI assistant designed to convert text into speech.\n"
+        "If the user's message includes a [SPEAKER*] tag, do not read out the tag and generate speech for the following text, using the specified voice.\n"
+        "If no speaker tag is present, select a suitable voice on your own.\n\n"
+        "<|scene_desc_start|>\n"
+        "SPEAKER0: feminine\n"
+        "SPEAKER1: masculine\n"
+        "<|scene_desc_end|>",
+        "input_text": "[SPEAKER0] I can't believe you did that without even asking me first!\n"
+        "[SPEAKER1] Oh, come on! It wasn't a big deal, and I knew you would overreact like this.\n"
+        "[SPEAKER0] Overreact? You made a decision that affects both of us without even considering my opinion!\n"
+        "[SPEAKER1] Because I didn't have time to sit around waiting for you to make up your mind! Someone had to act.",
+        "description": "Multispeaker with different voice descriptions in the system prompt",
+    },
+    "single-speaker-voice-description": {
+        "system_prompt": "Generate audio following instruction.\n\n"
+        "<|scene_desc_start|>\n"
+        "SPEAKER0: He speaks with a clear British accent and a conversational, inquisitive tone. His delivery is articulate and at a moderate pace, and very clear audio.\n"
+        "<|scene_desc_end|>",
+        "input_text": "Hey, everyone! Welcome back to Tech Talk Tuesdays.\n"
+        "It's your host, Alex, and today, we're diving into a topic that's become absolutely crucial in the tech world — deep learning.\n"
+        "And let's be honest, if you've been even remotely connected to tech, AI, or machine learning lately, you know that deep learning is everywhere.\n"
+        "\n"
+        "So here's the big question: Do you want to understand how deep learning works?\n",
+        "description": "Single speaker with voice description in the system prompt",
+    },
+    "single-speaker-zh": {
+        "system_prompt": "Generate audio following instruction.\n\n"
+        "<|scene_desc_start|>\n"
+        "Audio is recorded from a quiet room.\n"
+        "<|scene_desc_end|>",
+        "input_text": "大家好, 欢迎收听本期的跟李沐学AI. 今天沐哥在忙着洗数据, 所以由我, 希格斯主播代替他讲这期视频.\n"
+        "今天我们要聊的是一个你绝对不能忽视的话题: 多模态学习.\n"
+        "那么, 问题来了, 你真的了解多模态吗? 你知道如何自己动手构建多模态大模型吗.\n"
+        "或者说, 你能察觉到我其实是个机器人吗?",
+        "description": "Single speaker speaking Chinese",
+    },
+    "single-speaker-bgm": {
+        "system_prompt": DEFAULT_SYSTEM_PROMPT,
+        "input_text": "[music start] I will remember this, thought Ender, when I am defeated. To keep dignity, and give honor where it's due, so that defeat is not disgrace. And I hope I don't have to do it often. [music end]",
+        "description": "Single speaker with BGM using music tag. This is an experimental feature and you may need to try multiple times to get the best result.",
+    },
 }
 @lru_cache(maxsize=20)
 def encode_audio_file(file_path):
     with open(file_path, "rb") as audio_file:
         return base64.b64encode(audio_file.read()).decode("utf-8")
 def get_current_device():
     """Get the current device."""
     return "cuda" if torch.cuda.is_available() else "cpu"
 def load_voice_presets():
     """Load the voice presets from the voice_examples directory."""
     try:
         with open(
+            os.path.join(os.path.dirname(__file__), "voice_examples", "config.json"),
             "r",
         ) as f:
             voice_dict = json.load(f)
         logger.error(f"Error loading voice presets: {e}")
         return {"EMPTY": "No reference voice"}
 def get_voice_preset(voice_preset):
     """Get the voice path and text for a given voice preset."""
+    voice_path = os.path.join(os.path.dirname(__file__), "voice_examples", f"{voice_preset}.wav")
     if not os.path.exists(voice_path):
         logger.warning(f"Voice preset file not found: {voice_path}")
         return None, "Voice preset not found"
     text = VOICE_PRESETS.get(voice_preset, "No transcript available")
     return voice_path, text
 def normalize_chinese_punctuation(text):
+    """
+    Convert Chinese (full-width) punctuation marks to English (half-width) equivalents.
+    """
+    # Mapping of Chinese punctuation to English punctuation
     chinese_to_english_punct = {
+        "，": ", ",  # comma
+        "。": ".",  # period
+        "：": ":",  # colon
+        "；": ";",  # semicolon
+        "？": "?",  # question mark
+        "！": "!",  # exclamation mark
+        "（": "(",  # left parenthesis
+        "）": ")",  # right parenthesis
+        "【": "[",  # left square bracket
+        "】": "]",  # right square bracket
+        "《": "<",  # left angle quote
+        "》": ">",  # right angle quote
+        "“": '"',  # left double quotation
+        "”": '"',  # right double quotation
+        "‘": "'",  # left single quotation
+        "’": "'",  # right single quotation
+        "、": ",",  # enumeration comma
+        "—": "-",  # em dash
+        "…": "...",  # ellipsis
+        "·": ".",  # middle dot
+        "「": '"',  # left corner bracket
+        "」": '"',  # right corner bracket
+        "『": '"',  # left double corner bracket
+        "』": '"',  # right double corner bracket
     }
+    # Replace each Chinese punctuation with its English counterpart
     for zh_punct, en_punct in chinese_to_english_punct.items():
         text = text.replace(zh_punct, en_punct)
     return text
 def normalize_text(transcript: str):
     transcript = normalize_chinese_punctuation(transcript)
+    # Other normalizations (e.g., parentheses and other symbols); to be improved in the future.
     transcript = transcript.replace("(", " ")
     transcript = transcript.replace(")", " ")
     transcript = transcript.replace("°F", " degrees Fahrenheit")
     transcript = transcript.replace("°C", " degrees Celsius")
     for tag, replacement in [
         ("[laugh]", "<SE>[Laughter]</SE>"),
         ("[humming start]", "<SE>[Humming]</SE>"),
         ("[cough]", "<SE>[Cough]</SE>"),
     ]:
         transcript = transcript.replace(tag, replacement)
     lines = transcript.split("\n")
     transcript = "\n".join([" ".join(line.split()) for line in lines if line.strip()])
     transcript = transcript.strip()
     if not any([transcript.endswith(c) for c in [".", "!", "?", ",", ";", '"', "'", "</SE_e>", "</SE>"]]):
         transcript += "."
     return transcript
 def initialize_engine(model_path, audio_tokenizer_path) -> bool:
     """Initialize the HiggsAudioServeEngine."""
     global engine
         logger.error(f"Failed to initialize engine: {e}")
         return False
 def check_return_audio(audio_wv: np.ndarray):
+    # check if the audio returned is all silent
     if np.all(audio_wv == 0):
         logger.warning("Audio is silent, returning None")
 def process_text_output(text_output: str):
+    # Replace each run of consecutive <|AUDIO_OUT|> tokens with a single <|AUDIO_OUT|>
     text_output = re.sub(r"(<\|AUDIO_OUT\|>)+", r"<|AUDIO_OUT|>", text_output)
     return text_output
 def prepare_chatml_sample(
     voice_preset: str,
     text: str,
     reference_text: Optional[str] = None,
     system_prompt: str = DEFAULT_SYSTEM_PROMPT,
 ):
+    """Prepare a ChatMLSample for the HiggsAudioServeEngine."""
     messages = []
+    # Add system message if provided
     if len(system_prompt) > 0:
         messages.append(Message(role="system", content=system_prompt))
+    # Add reference audio if provided
     audio_base64 = None
     ref_text = ""
     if reference_audio:
+        # Custom reference audio
         audio_base64 = encode_audio_file(reference_audio)
         ref_text = reference_text or ""
     elif voice_preset != "EMPTY":
+        # Voice preset
         voice_path, ref_text = get_voice_preset(voice_preset)
         if voice_path is None:
             logger.warning(f"Voice preset {voice_preset} not found, skipping reference audio")
         else:
             audio_base64 = encode_audio_file(voice_path)
+    # Only add reference audio if we have it
     if audio_base64 is not None:
+        # Add user message with reference text
         messages.append(Message(role="user", content=ref_text))
+        # Add assistant message with audio content
         audio_content = AudioContent(raw_audio=audio_base64, audio_url="")
         messages.append(Message(role="assistant", content=[audio_content]))
+    # Add the main user message
     text = normalize_text(text)
     messages.append(Message(role="user", content=text))
     return ChatMLSample(messages=messages)
 def text_to_speech(
     text,
     voice_preset,
     ras_win_len=7,
     ras_win_max_num_repeat=2,
 ):
+    """Convert text to speech using HiggsAudioServeEngine."""
     global engine
     if engine is None:
         initialize_engine(DEFAULT_MODEL_PATH, DEFAULT_AUDIO_TOKENIZER_PATH)
     try:
+        # Prepare ChatML sample
         chatml_sample = prepare_chatml_sample(voice_preset, text, reference_audio, reference_text, system_prompt)
+        # Convert stop strings format
         if stop_strings is None:
             stop_list = DEFAULT_STOP_STRINGS
         else:
             stop_list = [s for s in stop_strings["stops"] if s.strip()]
         request_id = f"tts-playground-{str(uuid.uuid4())}"
         logger.info(
             f"{request_id}: Generating speech for text: {text[:100]}..., \n"
             f"ras_win_len={ras_win_len}, ras_win_max_num_repeat={ras_win_max_num_repeat}"
         )
         start_time = time.time()
+        # Generate using the engine
         response = engine.generate(
             chat_ml_sample=chatml_sample,
             max_new_tokens=max_completion_tokens,
             ras_win_len=ras_win_len if ras_win_len > 0 else None,
             ras_win_max_num_repeat=max(ras_win_len, ras_win_max_num_repeat),
         )
         generation_time = time.time() - start_time
         logger.info(f"{request_id}: Generated audio in {generation_time:.3f} seconds")
         gr.Info(f"Generated audio in {generation_time:.3f} seconds")
+        # Process the response
         text_output = process_text_output(response.generated_text)
         if response.audio is not None:
+            # Convert to int16 for Gradio
             audio_data = (response.audio * 32767).astype(np.int16)
             check_return_audio(audio_data)
             return text_output, (response.sampling_rate, audio_data)
         else:
             logger.warning("No audio generated")
             return text_output, None
     except Exception as e:
         error_msg = f"Error generating speech: {e}"
         logger.error(error_msg)
         gr.Error(error_msg)
         return f"❌ {error_msg}", None
 def create_ui():
+    my_theme = gr.Theme.load("theme.json")
+    # Add custom CSS to disable focus highlighting on textboxes
     custom_css = """
     .gradio-container input:focus,
     .gradio-container textarea:focus,
         outline: none !important;
         background-color: var(--input-background-fill) !important;
     }
+    /* Override any hover effects as well */
     .gradio-container input:hover,
     .gradio-container textarea:hover,
     .gradio-container select:hover,
         border-color: var(--border-color-primary) !important;
         background-color: var(--input-background-fill) !important;
     }
+    /* Style for checked checkbox */
     .gradio-container input[type="checkbox"]:checked {
         background-color: var(--primary-500) !important;
         border-color: var(--primary-500) !important;
     }
     """
     default_template = "smart-voice"
+    """Create the Gradio UI."""
     with gr.Blocks(theme=my_theme, css=custom_css) as demo:
         gr.Markdown("# Higgs Audio Text-to-Speech Playground")
+        # Main UI section
         with gr.Row():
             with gr.Column(scale=2):
+                # Template selection dropdown
                 template_dropdown = gr.Dropdown(
                     label="TTS Template",
                     choices=list(PREDEFINED_EXAMPLES.keys()),
                     value=default_template,
                     info="Select a predefined example for system and input messages.",
                 )
+                # Template description display
                 template_description = gr.HTML(
                     value=f'<p style="font-size: 0.85em; color: var(--body-text-color-subdued); margin: 0; padding: 0;"> {PREDEFINED_EXAMPLES[default_template]["description"]}</p>',
                     visible=True,
                 )
                 system_prompt = gr.TextArea(
                     label="System Prompt",
                     placeholder="Enter system prompt to guide the model...",
                     value=PREDEFINED_EXAMPLES[default_template]["system_prompt"],
                     lines=2,
                 )
                 input_text = gr.TextArea(
                     label="Input Text",
                     placeholder="Type the text you want to convert to speech...",
                     value=PREDEFINED_EXAMPLES[default_template]["input_text"],
                     lines=5,
                 )
                 voice_preset = gr.Dropdown(
                     label="Voice Preset",
                     choices=list(VOICE_PRESETS.keys()),
                     value="EMPTY",
+                    interactive=False,  # Disabled by default since default template is not voice-clone
                     visible=False,
                 )
                 with gr.Accordion(
                     "Custom Reference (Optional)", open=False, visible=False
                 ) as custom_reference_accordion:
                         placeholder="Enter the transcript of your reference audio...",
                         lines=3,
                     )
                 with gr.Accordion("Advanced Parameters", open=False):
                     max_completion_tokens = gr.Slider(
                         minimum=128,
                         label="RAS Max Num Repeat",
                         info="Maximum number of repetitions allowed in the window",
                     )
+                    # Add stop strings component
                     stop_strings = gr.Dataframe(
                         label="Stop Strings",
                         headers=["stops"],
                         interactive=True,
                         col_count=(1, "fixed"),
                     )
                 submit_btn = gr.Button("Generate Speech", variant="primary", scale=1)
             with gr.Column(scale=2):
                 output_text = gr.TextArea(label="Model Response", lines=2)
+                # Audio output
                 output_audio = gr.Audio(label="Generated Audio", interactive=False, autoplay=True)
                 stop_btn = gr.Button("Stop Playback", variant="primary")
+        # Example voice
         with gr.Row(visible=False) as voice_samples_section:
             voice_samples_table = gr.Dataframe(
                 headers=["Voice Preset", "Sample Text"],
             )
             sample_audio = gr.Audio(label="Voice Sample")
+        # Function to play voice sample when clicking on a row
         def play_voice_sample(evt: gr.SelectData):
             try:
+                # Get the preset name from the clicked row
                 preset_names = [preset for preset in VOICE_PRESETS.keys() if preset != "EMPTY"]
                 if evt.index[0] < len(preset_names):
                     preset = preset_names[evt.index[0]]
         voice_samples_table.select(fn=play_voice_sample, outputs=[sample_audio])
+        # Function to handle template selection
         def apply_template(template_name):
             if template_name in PREDEFINED_EXAMPLES:
                 template = PREDEFINED_EXAMPLES[template_name]
+                # Enable voice preset and custom reference only for voice-clone template
                 is_voice_clone = template_name == "voice-clone"
                 voice_preset_value = "belinda" if is_voice_clone else "EMPTY"
+                # Set ras_win_len to 0 for single-speaker-bgm, 7 for others
                 ras_win_len_value = 0 if template_name == "single-speaker-bgm" else 7
                 description_text = f'<p style="font-size: 0.85em; color: var(--body-text-color-subdued); margin: 0; padding: 0;"> {template["description"]}</p>'
                 return (
                     description_text,  # template_description
                     gr.update(
                         value=voice_preset_value, interactive=is_voice_clone, visible=is_voice_clone
+                    ),  # voice_preset (value and interactivity)
+                    gr.update(visible=is_voice_clone),  # custom reference accordion visibility
+                    gr.update(visible=is_voice_clone),  # voice samples section visibility
+                    ras_win_len_value,  # ras_win_len
                 )
             else:
                 return (
                     gr.update(),
                     gr.update(),
                     gr.update(),
+                )  # No change if template not found
+        # Set up event handlers
+        # Connect template dropdown to handler
         template_dropdown.change(
             fn=apply_template,
             inputs=[template_dropdown],
             ],
         )
+        # Connect submit button to the TTS function
         submit_btn.click(
             fn=text_to_speech,
             inputs=[
             outputs=[output_text, output_audio],
             api_name="generate_speech",
         )
+        # Stop button functionality
         stop_btn.click(
             fn=lambda: None,
             inputs=[],
             outputs=[output_audio],
             js="() => {const audio = document.querySelector('audio'); if(audio) audio.pause(); return null;}",
         )
+    return demo
 def main():
+    """Main function to parse arguments and launch the UI."""
     global DEFAULT_MODEL_PATH, DEFAULT_AUDIO_TOKENIZER_PATH, VOICE_PRESETS
     parser = argparse.ArgumentParser(description="Gradio UI for Text-to-Speech using HiggsAudioServeEngine")
     parser.add_argument("--port", type=int, default=7860, help="Port for the Gradio interface.")
     args = parser.parse_args()
+    # Update default values if provided via command line
     VOICE_PRESETS = load_voice_presets()
+    # Create and launch the UI
     demo = create_ui()
     demo.launch(server_name=args.host, server_port=args.port)
 if __name__ == "__main__":
     main()