Spaces:

MERaLiON
/

MERaLiON-AudioLLM

Running

App Files Files Community

YingxuHe committed on Mar 3

Commit

89ed0ae

1 Parent(s): 8573823

implement multi-round for voice chat

Browse files

Files changed (5) hide show

src/content/agent.py +3 -2
src/content/common.py +37 -9
src/content/playground.py +4 -3
src/content/voice_chat.py +79 -42
src/generation.py +39 -0

src/content/agent.py CHANGED Viewed

@@ -9,10 +9,11 @@ from src.utils import bytes_to_array, array_to_bytes
 from src.content.common import (
     MODEL_NAMES,
     AUDIO_SAMPLES_W_INSTRUCT,
-    DEFAULT_DIALOGUE_STATES,
     init_state_section,
     header_section,
     sidebar_fragment,
     retrive_response_with_ui
 )
@@ -132,7 +133,7 @@ def bottom_input_section():
         st.button(
             'Clear',
             disabled=st.session_state.disprompt,
-            on_click=lambda: st.session_state.update(copy.deepcopy(DEFAULT_DIALOGUE_STATES))
         )
     with bottom_cols[1]:

 from src.content.common import (
     MODEL_NAMES,
     AUDIO_SAMPLES_W_INSTRUCT,
+    AGENT_DIALOGUE_STATES,
     init_state_section,
     header_section,
     sidebar_fragment,
+    reset_states,
     retrive_response_with_ui
 )
         st.button(
             'Clear',
             disabled=st.session_state.disprompt,
+            on_click=lambda: reset_states(AGENT_DIALOGUE_STATES)
         )
     with bottom_cols[1]:

src/content/common.py CHANGED Viewed

@@ -13,20 +13,33 @@ from src.retrieval import load_retriever
 from src.logger import load_logger
-DEFAULT_DIALOGUE_STATES = dict(
     pg_audio_base64='',
     pg_audio_array=np.array([]),
-    pg_messages=[],
     vc_audio_base64='',
     vc_audio_array=np.array([]),
-    vc_messages=[],
     ag_audio_base64='',
     ag_audio_array=np.array([]),
     ag_visited_query_indices=[],
     ag_messages=[],
-    ag_model_messages=[],
-    disprompt = False,
-    new_prompt = "",
     on_select=False,
     on_upload=False,
     on_record=False,
@@ -34,6 +47,14 @@ DEFAULT_DIALOGUE_STATES = dict(
 )
 MODEL_NAMES = OrderedDict({})
@@ -329,9 +350,10 @@ def init_state_section():
         if key not in st.session_state:
             st.session_state[key]=copy.deepcopy(value)
-    for key, value in DEFAULT_DIALOGUE_STATES.items():
-        if key not in st.session_state:
-            st.session_state[key]=copy.deepcopy(value)
 def header_section(component_name, description="", concise_description="", icon="🤖"):
@@ -375,6 +397,12 @@ def sidebar_fragment():
     st.slider(label="Repetition Penalty", min_value=1.0, max_value=1.2, value=1.1, key="repetition_penalty")
 def retrive_response_with_ui(
         model_name: str,
         text_input: str,

 from src.logger import load_logger
# Default session-state values for the playground page ("pg_" keys).
PLAYGROUND_DIALOGUE_STATES = {
    "pg_audio_base64": '',
    "pg_audio_array": np.array([]),
    "pg_messages": [],
}

# Default session-state values for the voice chat page ("vc_" keys).
VOICE_CHAT_DIALOGUE_STATES = {
    "vc_audio_base64": '',
    "vc_audio_array": np.array([]),
    "vc_messages": [],
    "vc_model_messages": [],
}

# Default session-state values for the agent page ("ag_" keys).
AGENT_DIALOGUE_STATES = {
    "ag_audio_base64": '',
    "ag_audio_array": np.array([]),
    "ag_visited_query_indices": [],
    "ag_messages": [],
    "ag_model_messages": [],
}
+COMMON_DIALOGUE_STATES = dict(
+    disprompt=False,
+    new_prompt="",
     on_select=False,
     on_upload=False,
     on_record=False,
 )
# Every group of default states, in the order they are applied.
DEFAULT_DIALOGUE_STATE_DICTS = [
    PLAYGROUND_DIALOGUE_STATES,
    VOICE_CHAT_DIALOGUE_STATES,
    AGENT_DIALOGUE_STATES,
    COMMON_DIALOGUE_STATES,
]
 MODEL_NAMES = OrderedDict({})
         if key not in st.session_state:
             st.session_state[key]=copy.deepcopy(value)
+    for states in DEFAULT_DIALOGUE_STATE_DICTS:
+        for key, value in states.items():
+            if key not in st.session_state:
+                st.session_state[key]=copy.deepcopy(value)
 def header_section(component_name, description="", concise_description="", icon="🤖"):
     st.slider(label="Repetition Penalty", min_value=1.0, max_value=1.2, value=1.1, key="repetition_penalty")
def reset_states(*state_dicts):
    """Write default values back into ``st.session_state``.

    Each mapping in ``state_dicts`` is applied in the order given, and
    ``COMMON_DIALOGUE_STATES`` is always applied last. Every mapping is
    deep-copied first so the module-level defaults are never shared with
    the live session state.
    """
    for defaults in (*state_dicts, COMMON_DIALOGUE_STATES):
        st.session_state.update(copy.deepcopy(defaults))
 def retrive_response_with_ui(
         model_name: str,
         text_input: str,

src/content/playground.py CHANGED Viewed

@@ -8,10 +8,11 @@ from src.utils import bytes_to_array, array_to_bytes
 from src.content.common import (
     MODEL_NAMES,
     AUDIO_SAMPLES_W_INSTRUCT,
-    DEFAULT_DIALOGUE_STATES,
     init_state_section,
     header_section,
     sidebar_fragment,
     retrive_response_with_ui
 )
@@ -126,7 +127,7 @@ def bottom_input_section():
         st.button(
             'Clear',
             disabled=st.session_state.disprompt,
-            on_click=lambda: st.session_state.update(copy.deepcopy(DEFAULT_DIALOGUE_STATES))
         )
     with bottom_cols[1]:
@@ -225,7 +226,7 @@ def playground_page():
         <strong>Paralinguistics</strong> tasks.
         This playground currently only support <strong>single-round</strong> conversation.
         """,
-        concise_description=" It currently only support <strong>single-round</strong> conversation."
         )
     with st.sidebar:

 from src.content.common import (
     MODEL_NAMES,
     AUDIO_SAMPLES_W_INSTRUCT,
+    PLAYGROUND_DIALOGUE_STATES,
     init_state_section,
     header_section,
     sidebar_fragment,
+    reset_states,
     retrive_response_with_ui
 )
         st.button(
             'Clear',
             disabled=st.session_state.disprompt,
+            on_click=lambda: reset_states(PLAYGROUND_DIALOGUE_STATES)
         )
     with bottom_cols[1]:
         <strong>Paralinguistics</strong> tasks.
         This playground currently only support <strong>single-round</strong> conversation.
         """,
+        concise_description=" This playground currently only support <strong>single-round</strong> conversation."
         )
     with st.sidebar:

src/content/voice_chat.py CHANGED Viewed

@@ -4,20 +4,26 @@ import base64
 import numpy as np
 import streamlit as st
-from src.generation import MAX_AUDIO_LENGTH
-from src.utils import bytes_to_array, array_to_bytes
 from src.content.common import (
     MODEL_NAMES,
-    DEFAULT_DIALOGUE_STATES,
     init_state_section,
     header_section,
     sidebar_fragment,
     retrive_response_with_ui
 )
 # TODO: change this.
-DEFAULT_PROMPT = "Based on the information in this user’s voice, please reply the user in a friendly and helpful way."
 def _update_audio(audio_bytes):
@@ -30,13 +36,12 @@ def _update_audio(audio_bytes):
 def bottom_input_section():
-    st.info(":bulb: Ask something with clear intention.")
     bottom_cols = st.columns([0.03, 0.97])
     with bottom_cols[0]:
         st.button(
             'Clear',
             disabled=st.session_state.disprompt,
-            on_click=lambda: st.session_state.update(copy.deepcopy(DEFAULT_DIALOGUE_STATES))
         )
     with bottom_cols[1]:
@@ -45,7 +50,6 @@ def bottom_input_section():
             label_visibility="collapsed",
             on_change=lambda: st.session_state.update(
                 on_record=True,
-                vc_messages=[],
                 disprompt=True
                 ),
             key='record'
@@ -56,13 +60,25 @@ def bottom_input_section():
             _update_audio(audio_bytes)
             st.session_state.update(
                 on_record=False,
-                new_prompt=DEFAULT_PROMPT
             )
 def conversation_section():
     for message in st.session_state.vc_messages:
-        with st.chat_message(message["role"]):
             if message.get("error"):
                 st.error(message["error"])
             for warning_msg in message.get("warnings", []):
@@ -75,53 +91,74 @@ def conversation_section():
     with st._bottom:
         bottom_input_section()
-    if one_time_prompt := st.session_state.new_prompt:
-        one_time_array = st.session_state.vc_audio_array
-        one_time_base64 = st.session_state.vc_audio_base64
-        st.session_state.update(
-            new_prompt="",
-            one_time_array=np.array([]),
-            one_time_base64="",
-            vc_messages=[]
-        )
-        with st.chat_message("user"):
-            st.audio(one_time_array, format="audio/wav", sample_rate=16000)
-        st.session_state.vc_messages.append({"role": "user", "audio": one_time_array})
-        with st.chat_message("assistant"):
-            with st.spinner("Thinking..."):
-                error_msg, warnings, response = retrive_response_with_ui(
-                    model_name=MODEL_NAMES["audiollm-it"]["vllm_name"],
-                    text_input=one_time_prompt,
-                    array_audio_input=one_time_array,
-                    base64_audio_input=one_time_base64,
-                    stream=True
-                )
-        st.session_state.vc_messages.append({
-            "role": "assistant",
-            "error": error_msg,
-            "warnings": warnings,
-            "content": response
-        })
-        st.session_state.disprompt=False
-        st.rerun(scope="app")
 def voice_chat_page():
     init_state_section()
     header_section(
         component_name="Voice Chat",
-        description=""" It currently only support <strong>single-round</strong> conversation.
         Feel free to talk about anything.""",
-        concise_description=" It currently only support <strong>single-round</strong> conversation.",
         icon="🗣️"
         )
     with st.sidebar:
         sidebar_fragment()
     conversation_section()

 import numpy as np
 import streamlit as st
+from src.generation import (
+    MAX_AUDIO_LENGTH,
+    prepare_multimodal_content,
+    change_multimodal_content
+)
 from src.content.common import (
     MODEL_NAMES,
+    VOICE_CHAT_DIALOGUE_STATES,
     init_state_section,
     header_section,
     sidebar_fragment,
+    reset_states,
     retrive_response_with_ui
 )
+from src.utils import bytes_to_array, array_to_bytes
 # TODO: change this.
+DEFAULT_PROMPT = "Based on the information in this user’s voice, please reply to the user in a friendly and helpful way."
+MAX_VC_ROUNDS = 5
 def _update_audio(audio_bytes):
 def bottom_input_section():
     bottom_cols = st.columns([0.03, 0.97])
     with bottom_cols[0]:
         st.button(
             'Clear',
             disabled=st.session_state.disprompt,
+            on_click=lambda: reset_states(VOICE_CHAT_DIALOGUE_STATES)
         )
     with bottom_cols[1]:
             label_visibility="collapsed",
             on_change=lambda: st.session_state.update(
                 on_record=True,
                 disprompt=True
                 ),
             key='record'
             _update_audio(audio_bytes)
             st.session_state.update(
                 on_record=False,
             )
@st.fragment
def system_prompt_fragment():
    """Render a collapsible text area for editing the system prompt.

    The widget is registered under the ``system_prompt`` key and is
    pre-filled with ``DEFAULT_PROMPT``.
    """
    prompt_expander = st.expander("System Prompt")
    with prompt_expander:
        st.text_area(
            label="Insert system instructions or background knowledge here.",
            value=DEFAULT_PROMPT,
            max_chars=5000,
            key="system_prompt",
            label_visibility="collapsed",
        )
 def conversation_section():
+    chat_message_container = st.container(height=480)
     for message in st.session_state.vc_messages:
+        with chat_message_container.chat_message(message["role"]):
             if message.get("error"):
                 st.error(message["error"])
             for warning_msg in message.get("warnings", []):
     with st._bottom:
         bottom_input_section()
+    if not st.session_state.vc_audio_base64:
+        return
+    if len(st.session_state.vc_messages) >= MAX_VC_ROUNDS * 2:
+        st.toast(f":warning: max conversation rounds ({MAX_VC_ROUNDS}) reached!")
+        return
+    one_time_prompt = DEFAULT_PROMPT
+    one_time_array = st.session_state.vc_audio_array
+    one_time_base64 = st.session_state.vc_audio_base64
+    st.session_state.update(
+        vc_audio_array=np.array([]),
+        vc_audio_base64="",
+    )
+    with chat_message_container.chat_message("user"):
+        st.audio(one_time_array, format="audio/wav", sample_rate=16000)
+    st.session_state.vc_messages.append({"role": "user", "audio": one_time_array})
+    if not st.session_state.vc_model_messages:
+        one_time_prompt = st.session_state.system_prompt
+    else:
+        st.session_state.vc_model_messages[0]["content"] = change_multimodal_content(
+            st.session_state.vc_model_messages[0]["content"],
+            text_input=st.session_state.system_prompt
+        )
+    with chat_message_container.chat_message("assistant"):
+        with st.spinner("Thinking..."):
+            error_msg, warnings, response = retrive_response_with_ui(
+                model_name=MODEL_NAMES["audiollm-it"]["vllm_name"],
+                text_input=one_time_prompt,
+                array_audio_input=one_time_array,
+                base64_audio_input=one_time_base64,
+                stream=True,
+                history=st.session_state.vc_model_messages
+            )
+    st.session_state.vc_messages.append({
+        "role": "assistant",
+        "error": error_msg,
+        "warnings": warnings,
+        "content": response
+    })
+    mm_content = prepare_multimodal_content(one_time_prompt, one_time_base64)
+    st.session_state.vc_model_messages.extend([
+        {"role": "user", "content": mm_content},
+        {"role": "assistant", "content": response}
+    ])
+    st.session_state.disprompt=False
+    st.rerun(scope="app")
 def voice_chat_page():
     """Render the Voice Chat page.

     Initialises session state, draws the page header, the sidebar, the
     system prompt editor, and finally the conversation area.
     """
     init_state_section()

     # Derive the advertised limit from MAX_VC_ROUNDS so the header text
     # cannot drift from the limit actually enforced on the conversation.
     round_limit_note = (
         f" It currently only support up to <strong>{MAX_VC_ROUNDS} rounds</strong>"
         " of conversations."
     )

     header_section(
         component_name="Voice Chat",
         description=round_limit_note + "\n        Feel free to talk about anything.",
         concise_description=round_limit_note,
         icon="🗣️"
         )

     with st.sidebar:
         sidebar_fragment()

     system_prompt_fragment()
     conversation_section()

src/generation.py CHANGED Viewed

@@ -40,6 +40,45 @@ def load_model() -> Dict:
     return name_to_client_mapper
 def _retrive_response(
         model: str,
         text_input: str,

     return name_to_client_mapper
def _text_part(text_input):
    """Build the text entry of a multimodal message content list."""
    return {
        "type": "text",
        "text": f"Text instruction: {text_input}"
    }


def _audio_part(base64_audio_input):
    """Build the audio entry of a multimodal message content list."""
    return {
        "type": "audio_url",
        "audio_url": {
            "url": f"data:audio/ogg;base64,{base64_audio_input}"
        },
    }


def prepare_multimodal_content(text_input, base64_audio_input):
    """Return a two-item content list: the text part, then the audio part.

    Args:
        text_input: Instruction text; it is prefixed with
            ``"Text instruction: "``.
        base64_audio_input: Base64 string embedded in a
            ``data:audio/ogg;base64,`` URL.

    Returns:
        ``[text_part, audio_part]`` as a new list of new dicts.
    """
    return [_text_part(text_input), _audio_part(base64_audio_input)]


def change_multimodal_content(
        original_content,
        text_input="",
        base64_audio_input=""):
    """Replace the text and/or audio part of an existing content list.

    The list is modified in place and also returned. Parts are addressed
    by list position, matching the layout produced by
    ``prepare_multimodal_content``: index 0 is the text part and index 1
    is the audio part.

    Args:
        original_content: Content list to update.
        text_input: New instruction text; an empty value leaves the text
            part untouched.
        base64_audio_input: New base64 audio; an empty value leaves the
            audio part untouched.

    Returns:
        The same ``original_content`` list object.

    Raises:
        IndexError: If the list is too short to hold the part being
            replaced.
    """
    if text_input:
        original_content[0] = _text_part(text_input)
    if base64_audio_input:
        original_content[1] = _audio_part(base64_audio_input)
    return original_content
 def _retrive_response(
         model: str,
         text_input: str,