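"""Agent (chatbot) page of the Streamlit demo.

The page chains two models: MERaLiON-AudioLLM first answers a set of standard
queries about the attached audio clip, and a text-only LLM then uses those
answers as context to reply to the user across multiple conversation rounds.
"""
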
import os

import requests
import numpy as np
import streamlit as st

from src.retrieval import STANDARD_QUERIES
from src.content.common import (
    MODEL_NAMES,
    AUDIO_SAMPLES_W_INSTRUCT,
    AGENT_DIALOGUE_STATES,
    reset_states,
    update_voice_instruction_state,
    init_state_section,
    header_section,
    sidebar_fragment,
    successful_example_section,
    audio_attach_dialogue,
    retrive_response_with_ui
)
API_BASE_URL = os.getenv('API_BASE_URL')

LLM_NO_AUDIO_PROMPT_TEMPLATE = """{user_question}"""

LLM_PROMPT_TEMPLATE = """The user asked a question about the audio clip.
## User Question
{user_question}
{audio_information_prompt}Please reply to the user's question with a friendly, accurate, and helpful answer."""

AUDIO_INFO_TEMPLATE = """Here is some information about this audio clip.
## Audio Information
{audio_information}
However, the audio analysis may or may not contain information relevant to the user's question; please reply to the user with only the relevant information.
"""

AUDIO_ANALYSIS_STATUS = "MERaLiON-AudioLLM Analysis"

# Session-state keys that hold the agent conversation; passed as restore_state
# so they are reset whenever a new audio clip is loaded.
AG_CONVERSATION_STATES = dict(
    ag_messages=[],
    ag_model_messages=[],
    ag_visited_query_indices=[],
)
def bottom_input_section():
    """Render the bottom input bar: clear and attach-audio buttons, text prompt, and voice-instruction recorder."""
    bottom_cols = st.columns([0.03, 0.03, 0.91, 0.03])

    with bottom_cols[0]:
        # Clear the agent dialogue.
        st.button(
            ':material/delete:',
            disabled=st.session_state.disprompt,
            on_click=lambda: reset_states(AGENT_DIALOGUE_STATES)
        )

    with bottom_cols[1]:
        # Attach a new audio clip and reset the conversation state.
        if st.button(":material/add:", disabled=st.session_state.disprompt):
            audio_attach_dialogue(
                audio_array_state="ag_audio_array",
                audio_base64_state="ag_audio_base64",
                restore_state=AG_CONVERSATION_STATES
            )

    with bottom_cols[2]:
        if chat_input := st.chat_input(
            placeholder="Instruction...",
            disabled=st.session_state.disprompt,
            on_submit=lambda: st.session_state.update(disprompt=True)
        ):
            st.session_state.new_prompt = chat_input

    with bottom_cols[3]:
        uploaded_voice = st.audio_input(
            label="voice_instruction",
            label_visibility="collapsed",
            disabled=st.session_state.disprompt,
            on_change=lambda: st.session_state.update(
                disprompt=True,
                on_record_voice_instruction=True
            ),
            key='voice_instruction'
        )

        if uploaded_voice and st.session_state.on_record_voice_instruction:
            voice_bytes = uploaded_voice.read()
            update_voice_instruction_state(voice_bytes)
            st.session_state.on_record_voice_instruction = False
def _prepare_final_prompt_with_ui(one_time_prompt):
    """Compose the prompt for the text LLM, collecting audio analysis from the AudioLLM when a clip is loaded."""
    if st.session_state.ag_audio_array.shape[0] == 0:
        return LLM_NO_AUDIO_PROMPT_TEMPLATE.format(user_question=one_time_prompt)

    # Ask the retrieval backend which standard queries are relevant to this question.
    with st.spinner("Searching appropriate queries..."):
        response = requests.get(
            f"{API_BASE_URL}retrieve_relevant_docs",
            params={"user_question": one_time_prompt}
        )
        relevant_query_indices = response.json()

    # Early in the conversation, always include the first standard query.
    if len(st.session_state.ag_messages) <= 2:
        relevant_query_indices.append(0)

    # Skip queries that have already been answered for this clip.
    relevant_query_indices = list(
        set(relevant_query_indices).difference(st.session_state.ag_visited_query_indices)
    )
    st.session_state.ag_visited_query_indices.extend(relevant_query_indices)

    if not relevant_query_indices:
        return LLM_PROMPT_TEMPLATE.format(
            user_question=one_time_prompt,
            audio_information_prompt=""
        )

    audio_info = []
    with st.status(AUDIO_ANALYSIS_STATUS, expanded=False) as status:
        for i, standard_idx in enumerate(relevant_query_indices):
            new_label = (
                f"{AUDIO_ANALYSIS_STATUS}: "
                f"{STANDARD_QUERIES[standard_idx]['ui_text']} "
                f"({i+1}/{len(relevant_query_indices)})"
            )
            status.update(label=new_label, state="running")

            error_msg, warnings, response = retrive_response_with_ui(
                model_name=MODEL_NAMES["audiollm"]["vllm_name"],
                text_input=STANDARD_QUERIES[standard_idx]["query_text"],
                array_audio_input=st.session_state.ag_audio_array,
                base64_audio_input=st.session_state.ag_audio_base64,
                prefix=f"**{STANDARD_QUERIES[standard_idx]['ui_text']}**: ",
                stream=True,
                show_warning=(i == 0)
            )
            audio_info.append(STANDARD_QUERIES[standard_idx]["response_prefix_text"] + response)

            # Record each analysis step on the pending assistant message.
            st.session_state.ag_messages[-1]["process"].append({
                "error": error_msg,
                "warnings": warnings,
                "content": response
            })

        status.update(label=AUDIO_ANALYSIS_STATUS, state="complete")

    audio_information_prompt = AUDIO_INFO_TEMPLATE.format(
        audio_information="\n".join(audio_info)
    )
    return LLM_PROMPT_TEMPLATE.format(
        user_question=one_time_prompt,
        audio_information_prompt=audio_information_prompt
    )
def conversation_section():
    """Render the conversation history and handle any newly submitted text or voice prompt."""
    chat_message_container = st.container(height=480)
    if st.session_state.ag_audio_array.size:
        with chat_message_container.chat_message("user"):
            st.audio(st.session_state.ag_audio_array, format="audio/wav", sample_rate=16000)

    # Replay the stored conversation, including the per-message analysis steps.
    for message in st.session_state.ag_messages:
        with chat_message_container.chat_message(name=message["role"]):
            if message.get("error"):
                st.error(message["error"])
            for warning_msg in message.get("warnings", []):
                st.warning(warning_msg)
            if process := message.get("process", []):
                with st.status(AUDIO_ANALYSIS_STATUS, expanded=False, state="complete"):
                    for proc in process:
                        if proc.get("error"):
                            st.error(proc["error"])
                        for proc_warning_msg in proc.get("warnings", []):
                            st.warning(proc_warning_msg)
                        if proc.get("content"):
                            st.write(proc["content"])
            if message.get("content"):
                st.write(message["content"])

    # Pin the input controls to the bottom of the page (st._bottom is an internal Streamlit container).
    with st._bottom:
        bottom_input_section()

    if (not st.session_state.new_prompt) and (not st.session_state.new_vi_base64):
        return

    # Consume the pending prompt / voice instruction so a rerun does not process it twice.
    one_time_prompt = st.session_state.new_prompt
    one_time_vi_array = st.session_state.new_vi_array
    one_time_vi_base64 = st.session_state.new_vi_base64
    st.session_state.update(
        new_prompt="",
        new_vi_array=np.array([]),
        new_vi_base64="",
    )

    with chat_message_container.chat_message("user"):
        if one_time_vi_base64:
            # A voice instruction is transcribed first; the transcript becomes the prompt.
            with st.spinner("Transcribing..."):
                error_msg, warnings, one_time_prompt = retrive_response_with_ui(
                    model_name=MODEL_NAMES["audiollm"]["vllm_name"],
                    text_input="Write out the dialogue as text.",
                    array_audio_input=one_time_vi_array,
                    base64_audio_input=one_time_vi_base64,
                    stream=False,
                    normalise_response=True
                )
        else:
            error_msg, warnings = "", []

        st.write(one_time_prompt)
        st.session_state.ag_messages.append({
            "role": "user",
            "error": error_msg,
            "warnings": warnings,
            "content": one_time_prompt
        })

    with chat_message_container.chat_message("assistant"):
        assistant_message = {"role": "assistant", "process": []}
        st.session_state.ag_messages.append(assistant_message)

        final_prompt = _prepare_final_prompt_with_ui(one_time_prompt)

        llm_response_prefix = f"**{MODEL_NAMES['llm']['ui_name']}**: "
        error_msg, warnings, response = retrive_response_with_ui(
            model_name=MODEL_NAMES["llm"]["vllm_name"],
            text_input=final_prompt,
            array_audio_input=st.session_state.ag_audio_array,
            base64_audio_input="",
            prefix=llm_response_prefix,
            stream=True,
            history=st.session_state.ag_model_messages,
            show_warning=False
        )

        assistant_message.update({
            "error": error_msg,
            "warnings": warnings,
            "content": response
        })

        # Keep the model-facing history free of the UI display prefix.
        pure_response = response.replace(llm_response_prefix, "")
        st.session_state.ag_model_messages.extend([
            {"role": "user", "content": final_prompt},
            {"role": "assistant", "content": pure_response}
        ])

    st.session_state.disprompt = False
    st.rerun(scope="app")
def agent_page():
    """Entry point of the agent (chatbot) page."""
    init_state_section()
    header_section(
        component_name="Chatbot",
        description=""" It is implemented by <strong>connecting multiple AI models</strong>,
            offers more flexibility, and supports <strong>multi-round</strong> conversation.""",
        concise_description=""" It is implemented by connecting multiple AI models and
            supports <strong>multi-round</strong> conversation.""",
        icon="👥"
    )

    with st.sidebar:
        sidebar_fragment()

    # Offer only the paralinguistic ("Paral") audio samples as quick-start examples on this page.
    audio_sample_names = [name for name in AUDIO_SAMPLES_W_INSTRUCT.keys() if "Paral" in name]
    successful_example_section(
        audio_sample_names,
        audio_array_state="ag_audio_array",
        audio_base64_state="ag_audio_base64",
        restore_state=AG_CONVERSATION_STATES
    )
    conversation_section()
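
# Note: this module is one page of a larger Streamlit app, which is expected to
# import and call agent_page() from its own navigation. A minimal way to preview
# the page on its own (assuming the `src` package is importable and the
# API_BASE_URL backend is reachable) is a thin wrapper script run with
# `streamlit run`, e.g. a hypothetical preview.py containing:
#
#     from agent import agent_page  # adjust to this module's actual import path
#
#     agent_page()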