File size: 4,979 Bytes
8573823
 
 
89ed0ae
 
 
 
8573823
 
89ed0ae
e9402b5
 
8573823
 
 
 
 
 
 
 
89ed0ae
 
8573823
 
 
 
 
 
e9402b5
8573823
89ed0ae
8573823
 
 
 
 
 
e9402b5
8573823
 
 
 
 
 
 
 
 
e9402b5
 
8573823
 
 
 
 
89ed0ae
 
 
 
 
 
e9402b5
89ed0ae
 
 
 
 
 
8573823
89ed0ae
8573823
89ed0ae
8573823
 
 
 
 
 
 
 
 
 
 
 
89ed0ae
 
8573823
89ed0ae
 
 
8573823
89ed0ae
 
 
 
 
 
 
 
 
 
8573823
89ed0ae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8573823
 
 
 
 
 
e9402b5
8573823
e9402b5
8573823
 
 
 
 
 
89ed0ae
8573823
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import numpy as np
import streamlit as st

from src.generation import (
    prepare_multimodal_content, 
    change_multimodal_content
)
from src.content.common import (
    MODEL_NAMES,
    VOICE_CHAT_DIALOGUE_STATES,
    reset_states,
    process_audio_bytes,
    init_state_section,
    header_section,
    sidebar_fragment,
    retrive_response_with_ui
)


# TODO: change this.
DEFAULT_PROMPT = "Based on the information in this user’s voice, please reply to the user in a friendly and helpful way."
MAX_VC_ROUNDS = 5


def bottom_input_section():
    """Render the page-bottom controls: a clear-history button and an audio recorder.

    Side effects: writes ``vc_audio_array`` / ``vc_audio_base64`` into
    ``st.session_state`` when a new recording is captured, and toggles the
    ``on_record`` / ``disprompt`` flags that gate the rest of the page.
    """
    clear_col, record_col = st.columns([0.03, 0.97])

    with clear_col:
        # Trash button: wipes every voice-chat dialogue state key.
        st.button(
            ':material/delete:',
            disabled=st.session_state.disprompt,
            on_click=lambda: reset_states(VOICE_CHAT_DIALOGUE_STATES)
        )

    with record_col:
        recording = st.audio_input(
            label="record audio",
            label_visibility="collapsed",
            disabled=st.session_state.disprompt,
            # Disable further input while the recording is being processed.
            on_change=lambda: st.session_state.update(
                on_record=True,
                disprompt=True
            ),
            key='record'
        )

        # A fresh recording exists and has not been consumed yet: decode it
        # into session state, then drop the flag so later reruns skip this.
        if recording and st.session_state.on_record:
            raw_bytes = recording.read()
            audio_array, audio_b64 = process_audio_bytes(raw_bytes)
            st.session_state.vc_audio_array = audio_array
            st.session_state.vc_audio_base64 = audio_b64
            st.session_state.on_record = False


@st.fragment
def system_prompt_fragment():
    """Collapsible editor for the system prompt.

    The text is bound to ``st.session_state.system_prompt`` via the widget
    key; other sections read it from there.
    """
    prompt_expander = st.expander("System Prompt")
    with prompt_expander:
        st.text_area(
            label="Insert system instructions or background knowledge here.",
            label_visibility="collapsed",
            disabled=st.session_state.disprompt,
            max_chars=5000,
            key="system_prompt",
            value=DEFAULT_PROMPT,
        )


def conversation_section():
    """Render the chat history, then — if a fresh recording is pending —
    run one user/assistant round against the audio LLM.

    Streamlit reruns this function on every interaction; the early
    ``return``s below decide whether this pass starts a new model round.
    Statement order matters throughout: widgets render in call order.
    """
    chat_message_container = st.container(height=480)
    # Replay the full visible conversation (UI-facing history).
    for message in st.session_state.vc_messages:
        with chat_message_container.chat_message(message["role"]):
            if message.get("error"):
                st.error(message["error"])
            for warning_msg in message.get("warnings", []):
                st.warning(warning_msg)
            # Only show an audio player for a non-empty waveform.
            if message.get("audio", np.array([])).shape[0]:
                st.audio(message["audio"], format="audio/wav", sample_rate=16000)
            if message.get("content"):
                st.write(message["content"])
    
    # Pin the recorder/clear controls to the bottom of the page.
    # NOTE(review): st._bottom is a private Streamlit API — may break on upgrade.
    with st._bottom:
        bottom_input_section()

    # No freshly recorded audio on this rerun: render-only pass, stop here.
    if not st.session_state.vc_audio_base64:
        return

    # Each round appends two messages (user + assistant), hence `* 2`.
    if len(st.session_state.vc_messages) >= MAX_VC_ROUNDS * 2:
        st.toast(f":warning: max conversation rounds ({MAX_VC_ROUNDS}) reached!")
        return

    # Consume the pending audio and immediately clear it from session state
    # so the next rerun does not re-trigger this round.
    one_time_prompt = DEFAULT_PROMPT
    one_time_array = st.session_state.vc_audio_array
    one_time_base64 = st.session_state.vc_audio_base64
    st.session_state.update(
        vc_audio_array=np.array([]),
        vc_audio_base64="",
    )

    with chat_message_container.chat_message("user"):
        st.audio(one_time_array, format="audio/wav", sample_rate=16000)
    
    st.session_state.vc_messages.append({"role": "user", "audio": one_time_array})

    # First round: send the (possibly user-edited) system prompt as this
    # turn's text input. Later rounds: splice the current system prompt back
    # into the first history entry instead, and keep DEFAULT_PROMPT as the
    # per-turn text — presumably so mid-conversation prompt edits still
    # take effect (TODO confirm intent with src.generation).
    if not st.session_state.vc_model_messages:
        one_time_prompt = st.session_state.system_prompt
    else:
        st.session_state.vc_model_messages[0]["content"] = change_multimodal_content(
            st.session_state.vc_model_messages[0]["content"],
            text_input=st.session_state.system_prompt
        )

    with chat_message_container.chat_message("assistant"):
        with st.spinner("Thinking..."):
            # Streams the model reply into the UI; returns any error text,
            # warning list, and the final response string.
            error_msg, warnings, response = retrive_response_with_ui(
                model_name=MODEL_NAMES["audiollm-it"]["vllm_name"],
                text_input=one_time_prompt, 
                array_audio_input=one_time_array,
                base64_audio_input=one_time_base64, 
                stream=True,
                history=st.session_state.vc_model_messages
            )

    # Record the assistant turn for UI replay on subsequent reruns.
    st.session_state.vc_messages.append({
        "role": "assistant", 
        "error": error_msg,
        "warnings": warnings, 
        "content": response
    })

    # Mirror both turns into the model-facing history (text + base64 audio).
    mm_content = prepare_multimodal_content(one_time_prompt, one_time_base64)
    st.session_state.vc_model_messages.extend([
        {"role": "user", "content": mm_content},
        {"role": "assistant", "content": response}
    ])

    # Re-enable input (disabled when recording started) and rerun the whole
    # app so the refreshed history renders from the top.
    st.session_state.disprompt=False
    st.rerun(scope="app")


def voice_chat_page():
    """Assemble the Voice Chat page: state init, header, sidebar, prompt
    editor, and the conversation area — in render order."""
    init_state_section()

    header_section(
        component_name="Voice Chat",
        description=""" Currently support up to <strong>5 rounds</strong> of conversations.
        Feel free to talk about anything.""",
        concise_description=" Currently support up to <strong>5 rounds</strong> of conversations.",
        icon="🗣️"
    )

    with st.sidebar:
        sidebar_fragment()

    system_prompt_fragment()
    conversation_section()