diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..0cafc1cde1985c69113a5b2ae7ba42299aa7ebc2 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.venv/ \ No newline at end of file diff --git a/README.md b/README.md index 3234dbf39e30b49a2a472d1d78b9e0a304a70a67..7dc1c6667a8e42872ec41e4f92e99e5dfbf9526d 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,16 @@ --- -title: MERaLiON AudioLLM DEV -emoji: 🌍 -colorFrom: purple -colorTo: gray +title: Interactive-Demo / MERaLiON-AudioLLM +emoji: 🚀 +colorFrom: indigo +colorTo: indigo sdk: streamlit -sdk_version: 1.43.1 +sdk_version: 1.41.1 app_file: app.py -pinned: false -short_description: Development version of web demo +pinned: true +models: +- MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION --- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference + +You need to set API_BASE_URL=http://xxxxx:port/ \ No newline at end of file diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..e497863323cffda8c0dcd856a502419068d5c166 --- /dev/null +++ b/app.py @@ -0,0 +1,3 @@ +from src.content.playground import playground_page + +playground_page() \ No newline at end of file diff --git a/audio_samples/10_ASR_IMDA_PART4_30_ASR_v2_1527.wav b/audio_samples/10_ASR_IMDA_PART4_30_ASR_v2_1527.wav new file mode 100644 index 0000000000000000000000000000000000000000..d1fa45ef18df244757240868794d54063f723d56 Binary files /dev/null and b/audio_samples/10_ASR_IMDA_PART4_30_ASR_v2_1527.wav differ diff --git a/audio_samples/11_ASR_IMDA_PART4_30_ASR_v2_3771.wav b/audio_samples/11_ASR_IMDA_PART4_30_ASR_v2_3771.wav new file mode 100644 index 0000000000000000000000000000000000000000..f8c81dfdd16b9167a273a3964a6a3dd47de80d98 Binary files /dev/null and b/audio_samples/11_ASR_IMDA_PART4_30_ASR_v2_3771.wav differ diff --git a/audio_samples/12_ASR_IMDA_PART4_30_ASR_v2_103.wav b/audio_samples/12_ASR_IMDA_PART4_30_ASR_v2_103.wav new file mode 100644 index 0000000000000000000000000000000000000000..186c8c090c344c63e0ea828cc12a51207df58aff Binary files /dev/null and b/audio_samples/12_ASR_IMDA_PART4_30_ASR_v2_103.wav differ diff --git a/audio_samples/13_ASR_IMDA_PART5_30_ASR_v2_1446.wav b/audio_samples/13_ASR_IMDA_PART5_30_ASR_v2_1446.wav new file mode 100644 index 0000000000000000000000000000000000000000..0e606bfd770bb85abab4957d0380fef221c254c4 Binary files /dev/null and b/audio_samples/13_ASR_IMDA_PART5_30_ASR_v2_1446.wav differ diff --git a/audio_samples/14_ASR_IMDA_PART5_30_ASR_v2_2281.wav b/audio_samples/14_ASR_IMDA_PART5_30_ASR_v2_2281.wav new file mode 100644 index 0000000000000000000000000000000000000000..cbf2fad1cc780f4a7d691974033dcef7fd0fcef0 Binary files /dev/null and b/audio_samples/14_ASR_IMDA_PART5_30_ASR_v2_2281.wav differ diff --git a/audio_samples/15_ASR_IMDA_PART5_30_ASR_v2_4388.wav b/audio_samples/15_ASR_IMDA_PART5_30_ASR_v2_4388.wav new file mode 100644 index 0000000000000000000000000000000000000000..8ea95dd7dbab7762154da8588b7ead701317e956 Binary files /dev/null and b/audio_samples/15_ASR_IMDA_PART5_30_ASR_v2_4388.wav differ diff --git a/audio_samples/16_ASR_IMDA_PART6_30_ASR_v2_576.wav b/audio_samples/16_ASR_IMDA_PART6_30_ASR_v2_576.wav new file mode 100644 index 0000000000000000000000000000000000000000..df7b46f0a8f4f93b280cd9c91e486956e5d89b11 Binary files /dev/null and b/audio_samples/16_ASR_IMDA_PART6_30_ASR_v2_576.wav differ diff --git a/audio_samples/17_ASR_IMDA_PART6_30_ASR_v2_1413.wav 
b/audio_samples/17_ASR_IMDA_PART6_30_ASR_v2_1413.wav new file mode 100644 index 0000000000000000000000000000000000000000..69c2b3ee39465bc58efe769fde69c95c9d5092fc Binary files /dev/null and b/audio_samples/17_ASR_IMDA_PART6_30_ASR_v2_1413.wav differ diff --git a/audio_samples/18_ASR_IMDA_PART6_30_ASR_v2_2834.wav b/audio_samples/18_ASR_IMDA_PART6_30_ASR_v2_2834.wav new file mode 100644 index 0000000000000000000000000000000000000000..1d89e648d87d2bc193f728ac86b54ea7a4e07634 Binary files /dev/null and b/audio_samples/18_ASR_IMDA_PART6_30_ASR_v2_2834.wav differ diff --git a/audio_samples/19_ASR_AIShell_zh_ASR_v2_5044.wav b/audio_samples/19_ASR_AIShell_zh_ASR_v2_5044.wav new file mode 100644 index 0000000000000000000000000000000000000000..b296224725ec5acf74a02304f6beb6a7723d2c89 Binary files /dev/null and b/audio_samples/19_ASR_AIShell_zh_ASR_v2_5044.wav differ diff --git a/audio_samples/1_ASR_IMDA_PART1_ASR_v2_141.wav b/audio_samples/1_ASR_IMDA_PART1_ASR_v2_141.wav new file mode 100644 index 0000000000000000000000000000000000000000..17c5fc99647aaa658eabe035b40f97f8ea7638d1 Binary files /dev/null and b/audio_samples/1_ASR_IMDA_PART1_ASR_v2_141.wav differ diff --git a/audio_samples/20_ASR_LIBRISPEECH_CLEAN_ASR_V2_833.wav b/audio_samples/20_ASR_LIBRISPEECH_CLEAN_ASR_V2_833.wav new file mode 100644 index 0000000000000000000000000000000000000000..1ec609efd1c3790487c3c0aec77e5e3e5b0c3eda Binary files /dev/null and b/audio_samples/20_ASR_LIBRISPEECH_CLEAN_ASR_V2_833.wav differ diff --git a/audio_samples/25_ST_COVOST2_ZH-CN_EN_ST_V2_4567.wav b/audio_samples/25_ST_COVOST2_ZH-CN_EN_ST_V2_4567.wav new file mode 100644 index 0000000000000000000000000000000000000000..55c435f3b228e136e3c1047a4b43e992b9acfc0f Binary files /dev/null and b/audio_samples/25_ST_COVOST2_ZH-CN_EN_ST_V2_4567.wav differ diff --git a/audio_samples/26_ST_COVOST2_EN_ZH-CN_ST_V2_5422.wav b/audio_samples/26_ST_COVOST2_EN_ZH-CN_ST_V2_5422.wav new file mode 100644 index 0000000000000000000000000000000000000000..f2780b3b7da1d553f59f4f29256b4e848049cf52 Binary files /dev/null and b/audio_samples/26_ST_COVOST2_EN_ZH-CN_ST_V2_5422.wav differ diff --git a/audio_samples/27_ST_COVOST2_EN_ZH-CN_ST_V2_6697.wav b/audio_samples/27_ST_COVOST2_EN_ZH-CN_ST_V2_6697.wav new file mode 100644 index 0000000000000000000000000000000000000000..234f811d4c60ab67659f06bcd1db481a11648ca9 Binary files /dev/null and b/audio_samples/27_ST_COVOST2_EN_ZH-CN_ST_V2_6697.wav differ diff --git a/audio_samples/28_SI_ALPACA-GPT4-AUDIO_SI_V2_299.wav b/audio_samples/28_SI_ALPACA-GPT4-AUDIO_SI_V2_299.wav new file mode 100644 index 0000000000000000000000000000000000000000..239fff4d4cfcf2653e00d97ca842f334bd31ed18 Binary files /dev/null and b/audio_samples/28_SI_ALPACA-GPT4-AUDIO_SI_V2_299.wav differ diff --git a/audio_samples/29_SI_ALPACA-GPT4-AUDIO_SI_V2_750.wav b/audio_samples/29_SI_ALPACA-GPT4-AUDIO_SI_V2_750.wav new file mode 100644 index 0000000000000000000000000000000000000000..35d9dfbdc9ca3169a05c50a548cb5836adc65d52 Binary files /dev/null and b/audio_samples/29_SI_ALPACA-GPT4-AUDIO_SI_V2_750.wav differ diff --git a/audio_samples/2_ASR_IMDA_PART1_ASR_v2_2258.wav b/audio_samples/2_ASR_IMDA_PART1_ASR_v2_2258.wav new file mode 100644 index 0000000000000000000000000000000000000000..1b3ff08f36d5e02043445bd8c0f37b73cdd9f59c Binary files /dev/null and b/audio_samples/2_ASR_IMDA_PART1_ASR_v2_2258.wav differ diff --git a/audio_samples/30_SI_ALPACA-GPT4-AUDIO_SI_V2_1454.wav b/audio_samples/30_SI_ALPACA-GPT4-AUDIO_SI_V2_1454.wav new file mode 100644 index 
0000000000000000000000000000000000000000..d84f6abdca95d5bfa3f292f45b370c243bf79f86 Binary files /dev/null and b/audio_samples/30_SI_ALPACA-GPT4-AUDIO_SI_V2_1454.wav differ diff --git a/audio_samples/31_SI_OPENHERMES-AUDIO_SI_V2_673.wav b/audio_samples/31_SI_OPENHERMES-AUDIO_SI_V2_673.wav new file mode 100644 index 0000000000000000000000000000000000000000..e0d9a7f61f0a8b0137bc8c5ddd4d03c02686b49b Binary files /dev/null and b/audio_samples/31_SI_OPENHERMES-AUDIO_SI_V2_673.wav differ diff --git a/audio_samples/32_SQA_CN_COLLEDGE_ENTRANCE_ENGLISH_TEST_SQA_V2_572.wav b/audio_samples/32_SQA_CN_COLLEDGE_ENTRANCE_ENGLISH_TEST_SQA_V2_572.wav new file mode 100644 index 0000000000000000000000000000000000000000..4f0aadf1e9ac1e100c052fa9df0760651e2b2c4f Binary files /dev/null and b/audio_samples/32_SQA_CN_COLLEDGE_ENTRANCE_ENGLISH_TEST_SQA_V2_572.wav differ diff --git a/audio_samples/33_SQA_IMDA_PART3_30_SQA_V2_2310.wav b/audio_samples/33_SQA_IMDA_PART3_30_SQA_V2_2310.wav new file mode 100644 index 0000000000000000000000000000000000000000..c2858560478a1b51a6085e0f54a34d4bbca30b8e Binary files /dev/null and b/audio_samples/33_SQA_IMDA_PART3_30_SQA_V2_2310.wav differ diff --git a/audio_samples/34_SQA_IMDA_PART3_30_SQA_V2_3621.wav b/audio_samples/34_SQA_IMDA_PART3_30_SQA_V2_3621.wav new file mode 100644 index 0000000000000000000000000000000000000000..e4f53b20b6210ef6bba708ea1bccb9ad787caf22 Binary files /dev/null and b/audio_samples/34_SQA_IMDA_PART3_30_SQA_V2_3621.wav differ diff --git a/audio_samples/35_SQA_IMDA_PART3_30_SQA_V2_4062.wav b/audio_samples/35_SQA_IMDA_PART3_30_SQA_V2_4062.wav new file mode 100644 index 0000000000000000000000000000000000000000..8e18d39cdceaa84abc9dff3f002a0c6502c30b69 Binary files /dev/null and b/audio_samples/35_SQA_IMDA_PART3_30_SQA_V2_4062.wav differ diff --git a/audio_samples/36_DS_IMDA_PART4_30_DS_V2_849.wav b/audio_samples/36_DS_IMDA_PART4_30_DS_V2_849.wav new file mode 100644 index 0000000000000000000000000000000000000000..6b381a7b04f312f0b317bd3b6a0581155aeaf4c1 Binary files /dev/null and b/audio_samples/36_DS_IMDA_PART4_30_DS_V2_849.wav differ diff --git a/audio_samples/39_Paralingual_IEMOCAP_ER_V2_91.wav b/audio_samples/39_Paralingual_IEMOCAP_ER_V2_91.wav new file mode 100644 index 0000000000000000000000000000000000000000..738c14bf9ff890820659be0ad4d27ec5576ea7c4 Binary files /dev/null and b/audio_samples/39_Paralingual_IEMOCAP_ER_V2_91.wav differ diff --git a/audio_samples/3_ASR_IMDA_PART1_ASR_v2_2265.wav b/audio_samples/3_ASR_IMDA_PART1_ASR_v2_2265.wav new file mode 100644 index 0000000000000000000000000000000000000000..507bca925cbe5c433d1021c89f8f5c2108fc00d6 Binary files /dev/null and b/audio_samples/3_ASR_IMDA_PART1_ASR_v2_2265.wav differ diff --git a/audio_samples/40_Paralingual_IEMOCAP_ER_V2_567.wav b/audio_samples/40_Paralingual_IEMOCAP_ER_V2_567.wav new file mode 100644 index 0000000000000000000000000000000000000000..6709d5a7b3509690d89d222e8a75120b0a9c4d35 Binary files /dev/null and b/audio_samples/40_Paralingual_IEMOCAP_ER_V2_567.wav differ diff --git a/audio_samples/42_Paralingual_IEMOCAP_GR_V2_320.wav b/audio_samples/42_Paralingual_IEMOCAP_GR_V2_320.wav new file mode 100644 index 0000000000000000000000000000000000000000..593e18ad1ff04af7877072ba964c323786ba580e Binary files /dev/null and b/audio_samples/42_Paralingual_IEMOCAP_GR_V2_320.wav differ diff --git a/audio_samples/43_Paralingual_IEMOCAP_GR_V2_129.wav b/audio_samples/43_Paralingual_IEMOCAP_GR_V2_129.wav new file mode 100644 index 
0000000000000000000000000000000000000000..cd143063c19ca28fb3820ded2f1caa2cda0a8861 Binary files /dev/null and b/audio_samples/43_Paralingual_IEMOCAP_GR_V2_129.wav differ diff --git a/audio_samples/45_Paralingual_IMDA_PART3_30_GR_V2_12312.wav b/audio_samples/45_Paralingual_IMDA_PART3_30_GR_V2_12312.wav new file mode 100644 index 0000000000000000000000000000000000000000..af4fe23487085a3047ad1f0f56b824a6a75907f4 Binary files /dev/null and b/audio_samples/45_Paralingual_IMDA_PART3_30_GR_V2_12312.wav differ diff --git a/audio_samples/47_Paralingual_IMDA_PART3_30_NR_V2_10479.wav b/audio_samples/47_Paralingual_IMDA_PART3_30_NR_V2_10479.wav new file mode 100644 index 0000000000000000000000000000000000000000..20a685bb51cd1670280e104e1f06987e471657bb Binary files /dev/null and b/audio_samples/47_Paralingual_IMDA_PART3_30_NR_V2_10479.wav differ diff --git a/audio_samples/49_Paralingual_MELD_ER_V2_676.wav b/audio_samples/49_Paralingual_MELD_ER_V2_676.wav new file mode 100644 index 0000000000000000000000000000000000000000..a614033adb66d5d8b5a0054530336876c0d61d86 Binary files /dev/null and b/audio_samples/49_Paralingual_MELD_ER_V2_676.wav differ diff --git a/audio_samples/4_ASR_IMDA_PART2_ASR_v2_999.wav b/audio_samples/4_ASR_IMDA_PART2_ASR_v2_999.wav new file mode 100644 index 0000000000000000000000000000000000000000..48bfb135fc3eb12814801c49abd0b8250178ad86 Binary files /dev/null and b/audio_samples/4_ASR_IMDA_PART2_ASR_v2_999.wav differ diff --git a/audio_samples/50_Paralingual_MELD_ER_V2_692.wav b/audio_samples/50_Paralingual_MELD_ER_V2_692.wav new file mode 100644 index 0000000000000000000000000000000000000000..69f435f7308b5090f2668d22c1f324d30dd8857e Binary files /dev/null and b/audio_samples/50_Paralingual_MELD_ER_V2_692.wav differ diff --git a/audio_samples/51_Paralingual_VOXCELEB1_GR_V2_2148.wav b/audio_samples/51_Paralingual_VOXCELEB1_GR_V2_2148.wav new file mode 100644 index 0000000000000000000000000000000000000000..42d4d89846cfcd0c6bb0de173f584ad2b6d6d131 Binary files /dev/null and b/audio_samples/51_Paralingual_VOXCELEB1_GR_V2_2148.wav differ diff --git a/audio_samples/53_Paralingual_VOXCELEB1_NR_V2_2286.wav b/audio_samples/53_Paralingual_VOXCELEB1_NR_V2_2286.wav new file mode 100644 index 0000000000000000000000000000000000000000..ce05d92f8004d6054d39fae59f4d3a34c3b80e49 Binary files /dev/null and b/audio_samples/53_Paralingual_VOXCELEB1_NR_V2_2286.wav differ diff --git a/audio_samples/55_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_2.wav b/audio_samples/55_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_2.wav new file mode 100644 index 0000000000000000000000000000000000000000..f8513f46825e7b386b1f00f058d249044dac82d2 Binary files /dev/null and b/audio_samples/55_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_2.wav differ diff --git a/audio_samples/56_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_415.wav b/audio_samples/56_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_415.wav new file mode 100644 index 0000000000000000000000000000000000000000..f95f167ebe177b0db82f346f9dbd2c51eb828ec1 Binary files /dev/null and b/audio_samples/56_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_415.wav differ diff --git a/audio_samples/57_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_460.wav b/audio_samples/57_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_460.wav new file mode 100644 index 0000000000000000000000000000000000000000..c2decc6d21300257c3fc74b6718f1898dedbf4e2 Binary files /dev/null and b/audio_samples/57_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_460.wav differ diff --git a/audio_samples/5_ASR_IMDA_PART2_ASR_v2_2241.wav b/audio_samples/5_ASR_IMDA_PART2_ASR_v2_2241.wav new file mode 100644 index 
0000000000000000000000000000000000000000..55063388c14bd69df6a8023e5a65e4c9c3a01fb5 Binary files /dev/null and b/audio_samples/5_ASR_IMDA_PART2_ASR_v2_2241.wav differ diff --git a/audio_samples/6_ASR_IMDA_PART2_ASR_v2_3409.wav b/audio_samples/6_ASR_IMDA_PART2_ASR_v2_3409.wav new file mode 100644 index 0000000000000000000000000000000000000000..daf99a1877bae21a5ab72147a7a6359c8953e242 Binary files /dev/null and b/audio_samples/6_ASR_IMDA_PART2_ASR_v2_3409.wav differ diff --git a/audio_samples/7_ASR_IMDA_PART3_30_ASR_v2_2269.wav b/audio_samples/7_ASR_IMDA_PART3_30_ASR_v2_2269.wav new file mode 100644 index 0000000000000000000000000000000000000000..5e439cf43817a436692e132e194bcf2b43332126 Binary files /dev/null and b/audio_samples/7_ASR_IMDA_PART3_30_ASR_v2_2269.wav differ diff --git a/audio_samples/8_ASR_IMDA_PART3_30_ASR_v2_1698.wav b/audio_samples/8_ASR_IMDA_PART3_30_ASR_v2_1698.wav new file mode 100644 index 0000000000000000000000000000000000000000..e0929f09849acb481f80ca007bf257a9d937c035 Binary files /dev/null and b/audio_samples/8_ASR_IMDA_PART3_30_ASR_v2_1698.wav differ diff --git a/audio_samples/9_ASR_IMDA_PART3_30_ASR_v2_2474.wav b/audio_samples/9_ASR_IMDA_PART3_30_ASR_v2_2474.wav new file mode 100644 index 0000000000000000000000000000000000000000..11e66f37907da37aa2d90a492e407bc3a7a20bb1 Binary files /dev/null and b/audio_samples/9_ASR_IMDA_PART3_30_ASR_v2_2474.wav differ diff --git a/pages/agent.py b/pages/agent.py new file mode 100644 index 0000000000000000000000000000000000000000..36a9a35623385096495fc8d00d77b7302d9f4a5c --- /dev/null +++ b/pages/agent.py @@ -0,0 +1,3 @@ +from src.content.agent import agent_page + +agent_page() \ No newline at end of file diff --git a/pages/playground.py b/pages/playground.py new file mode 100644 index 0000000000000000000000000000000000000000..da5d8c7953bac6ca10d1fc69c8e755b0284f616d --- /dev/null +++ b/pages/playground.py @@ -0,0 +1,4 @@ +from src.content.playground import playground_page + + +playground_page() diff --git a/pages/voice_chat.py b/pages/voice_chat.py new file mode 100644 index 0000000000000000000000000000000000000000..c0627ef4f40fc78b1fc85ed7b9db34a0ee603a12 --- /dev/null +++ b/pages/voice_chat.py @@ -0,0 +1,4 @@ +from src.content.voice_chat import voice_chat_page + + +voice_chat_page() \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..26e328f99799199c667b3f8c88bb84ebd0867bc4 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,9 @@ +librosa==0.10.2.post1 +streamlit==1.40.2 +openai==1.57.1 +streamlit_mic_recorder==0.0.8 +sshtunnel +accelerate==1.3.0 +FlagEmbedding==1.3.3 +sentence-transformers==3.4.0 +sentencepiece==0.1.99 \ No newline at end of file diff --git a/src/__pycache__/generation.cpython-312.pyc b/src/__pycache__/generation.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..525e04254207d4c1ff8a14a92486fe900c378ac2 Binary files /dev/null and b/src/__pycache__/generation.cpython-312.pyc differ diff --git a/src/__pycache__/logger.cpython-312.pyc b/src/__pycache__/logger.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..257d66383837c2f0aadc2da313b4af3322ba8e17 Binary files /dev/null and b/src/__pycache__/logger.cpython-312.pyc differ diff --git a/src/__pycache__/retrieval.cpython-312.pyc b/src/__pycache__/retrieval.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3090438e9e49cd0137120011c6117e2a6190687d Binary files /dev/null and 
b/src/__pycache__/retrieval.cpython-312.pyc differ diff --git a/src/__pycache__/utils.cpython-312.pyc b/src/__pycache__/utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..111dcd593cab34442f7888dff10fdbca31ce48a1 Binary files /dev/null and b/src/__pycache__/utils.cpython-312.pyc differ diff --git a/src/content/__pycache__/agent.cpython-312.pyc b/src/content/__pycache__/agent.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f4bf1b6424ebf500d2391a6d2463b25992c137c9 Binary files /dev/null and b/src/content/__pycache__/agent.cpython-312.pyc differ diff --git a/src/content/__pycache__/common.cpython-312.pyc b/src/content/__pycache__/common.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a8ec8ccd7a2832d57236b04c29d3636b5b066bd1 Binary files /dev/null and b/src/content/__pycache__/common.cpython-312.pyc differ diff --git a/src/content/__pycache__/playground.cpython-312.pyc b/src/content/__pycache__/playground.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..41dadcb02c960193685c51fa63b2fcd1ba30b2ec Binary files /dev/null and b/src/content/__pycache__/playground.cpython-312.pyc differ diff --git a/src/content/agent.py b/src/content/agent.py new file mode 100644 index 0000000000000000000000000000000000000000..ca6b0b1a60d38b96c0699561fa969ed16dfc3d42 --- /dev/null +++ b/src/content/agent.py @@ -0,0 +1,282 @@ +import numpy as np +import streamlit as st +import requests +import os +from src.retrieval import STANDARD_QUERIES + +from src.content.common import ( + MODEL_NAMES, + AUDIO_SAMPLES_W_INSTRUCT, + AGENT_DIALOGUE_STATES, + reset_states, + update_voice_instruction_state, + init_state_section, + header_section, + sidebar_fragment, + successful_example_section, + audio_attach_dialogue, + retrive_response_with_ui +) + +API_BASE_URL = os.getenv('API_BASE_URL') + +LLM_NO_AUDIO_PROMPT_TEMPLATE = """{user_question}""" + + +LLM_PROMPT_TEMPLATE = """User asked a question about the audio clip. + +## User Question +{user_question} + +{audio_information_prompt}Please reply to user's question with a friendly, accurate, and helpful answer.""" + + +AUDIO_INFO_TEMPLATE = """Here are some information about this audio clip. + +## Audio Information +{audio_information} + +However, the audio analysis may or may not contain relevant information to the user question, please only reply the user with the relevant information. 
+ +""" + + +AUDIO_ANALYSIS_STATUS = "MERaLiON-AudioLLM Analysis" + + +AG_CONVERSATION_STATES = dict( + ag_messages=[], + ag_model_messages=[], + ag_visited_query_indices=[], +) + + +def bottom_input_section(): + bottom_cols = st.columns([0.03, 0.03, 0.91, 0.03]) + with bottom_cols[0]: + st.button( + ':material/delete:', + disabled=st.session_state.disprompt, + on_click=lambda: reset_states(AGENT_DIALOGUE_STATES) + ) + + with bottom_cols[1]: + if st.button(":material/add:", disabled=st.session_state.disprompt): + audio_attach_dialogue( + audio_array_state="ag_audio_array", + audio_base64_state="ag_audio_base64", + restore_state=AG_CONVERSATION_STATES + ) + + with bottom_cols[2]: + if chat_input := st.chat_input( + placeholder="Instruction...", + disabled=st.session_state.disprompt, + on_submit=lambda: st.session_state.update(disprompt=True) + ): + st.session_state.new_prompt = chat_input + + with bottom_cols[3]: + uploaded_voice = st.audio_input( + label="voice_instruction", + label_visibility="collapsed", + disabled=st.session_state.disprompt, + on_change=lambda: st.session_state.update( + disprompt=True, + on_record_voice_instruction=True + ), + key='voice_instruction' + ) + + if uploaded_voice and st.session_state.on_record_voice_instruction: + voice_bytes = uploaded_voice.read() + update_voice_instruction_state(voice_bytes) + st.session_state.on_record_voice_instruction = False + + +def _prepare_final_prompt_with_ui(one_time_prompt): + if st.session_state.ag_audio_array.shape[0] == 0: + return LLM_NO_AUDIO_PROMPT_TEMPLATE.format(user_question=one_time_prompt) + + with st.spinner("Searching appropriate querys..."): + print(one_time_prompt) + response = requests.get( + f"{API_BASE_URL}retrieve_relevant_docs", + params={"user_question": one_time_prompt} + ) + print(response) + relevant_query_indices = response.json() + + if len(st.session_state.ag_messages) <= 2: + relevant_query_indices.append(0) + + relevant_query_indices = list( + set(relevant_query_indices).difference(st.session_state.ag_visited_query_indices) + ) + + st.session_state.ag_visited_query_indices.extend(relevant_query_indices) + + if not relevant_query_indices: + return LLM_PROMPT_TEMPLATE.format( + user_question=one_time_prompt, + audio_information_prompt="" + ) + + audio_info = [] + with st.status(AUDIO_ANALYSIS_STATUS, expanded=False) as status: + for i, standard_idx in enumerate(relevant_query_indices): + new_label = ( + f"{AUDIO_ANALYSIS_STATUS}: " + f"{STANDARD_QUERIES[standard_idx]['ui_text']} " + f"({i+1}/{len(relevant_query_indices)})" + ) + + status.update(label=new_label, state="running") + error_msg, warnings, response = retrive_response_with_ui( + model_name=MODEL_NAMES["audiollm"]["vllm_name"], + text_input=STANDARD_QUERIES[standard_idx]["query_text"], + array_audio_input=st.session_state.ag_audio_array, + base64_audio_input=st.session_state.ag_audio_base64, + prefix=f"**{STANDARD_QUERIES[standard_idx]['ui_text']}**: ", + stream=True, + show_warning=i==0 + ) + audio_info.append(STANDARD_QUERIES[standard_idx]["response_prefix_text"] + response) + + st.session_state.ag_messages[-1]["process"].append({ + "error": error_msg, + "warnings": warnings, + "content": response + }) + + status.update(label=AUDIO_ANALYSIS_STATUS, state="complete") + + audio_information_prompt = AUDIO_INFO_TEMPLATE.format( + audio_information="\n".join(audio_info) + ) + + return LLM_PROMPT_TEMPLATE.format( + user_question=one_time_prompt, + audio_information_prompt=audio_information_prompt + ) + + +def conversation_section(): + 
chat_message_container = st.container(height=480) + if st.session_state.ag_audio_array.size: + with chat_message_container.chat_message("user"): + st.audio(st.session_state.ag_audio_array, format="audio/wav", sample_rate=16000) + + for message in st.session_state.ag_messages: + with chat_message_container.chat_message(name=message["role"]): + if message.get("error"): + st.error(message["error"]) + for warning_msg in message.get("warnings", []): + st.warning(warning_msg) + if process := message.get("process", []): + with st.status(AUDIO_ANALYSIS_STATUS, expanded=False, state="complete"): + for proc in process: + if proc.get("error"): + st.error(proc["error"]) + for proc_warning_msg in proc.get("warnings", []): + st.warning(proc_warning_msg) + if proc.get("content"): + st.write(proc["content"]) + if message.get("content"): + st.write(message["content"]) + + with st._bottom: + bottom_input_section() + + if (not st.session_state.new_prompt) and (not st.session_state.new_vi_base64): + return + + one_time_prompt = st.session_state.new_prompt + one_time_vi_array = st.session_state.new_vi_array + one_time_vi_base64 = st.session_state.new_vi_base64 + + st.session_state.update( + new_prompt="", + new_vi_array=np.array([]), + new_vi_base64="", + ) + + with chat_message_container.chat_message("user"): + if one_time_vi_base64: + with st.spinner("Transcribing..."): + error_msg, warnings, one_time_prompt = retrive_response_with_ui( + model_name=MODEL_NAMES["audiollm"]["vllm_name"], + text_input="Write out the dialogue as text.", + array_audio_input=one_time_vi_array, + base64_audio_input=one_time_vi_base64, + stream=False, + normalise_response=True + ) + else: + error_msg, warnings = "", [] + st.write(one_time_prompt) + + st.session_state.ag_messages.append({ + "role": "user", + "error": error_msg, + "warnings": warnings, + "content": one_time_prompt + }) + + with chat_message_container.chat_message("assistant"): + assistant_message = {"role": "assistant", "process": []} + st.session_state.ag_messages.append(assistant_message) + + final_prompt = _prepare_final_prompt_with_ui(one_time_prompt) + + llm_response_prefix = f"**{MODEL_NAMES['llm']['ui_name']}**: " + error_msg, warnings, response = retrive_response_with_ui( + model_name=MODEL_NAMES["llm"]["vllm_name"], + text_input=final_prompt, + array_audio_input=st.session_state.ag_audio_array, + base64_audio_input="", + prefix=llm_response_prefix, + stream=True, + history=st.session_state.ag_model_messages, + show_warning=False + ) + + assistant_message.update({ + "error": error_msg, + "warnings": warnings, + "content": response + }) + + pure_response = response.replace(llm_response_prefix, "") + st.session_state.ag_model_messages.extend([ + {"role": "user", "content": final_prompt}, + {"role": "assistant", "content": pure_response} + ]) + + st.session_state.disprompt=False + st.rerun(scope="app") + + +def agent_page(): + init_state_section() + header_section( + component_name="Chatbot", + description=""" It is implemented by connecting multiple AI models, + offers more flexibility, and supports multi-round conversation.""", + concise_description=""" It is implemented by connecting multiple AI models and + support multi-round conversation.""", + icon="👥" + ) + + with st.sidebar: + sidebar_fragment() + + audio_sample_names = [name for name in AUDIO_SAMPLES_W_INSTRUCT.keys() if "Paral" in name] + + successful_example_section( + audio_sample_names, + audio_array_state="ag_audio_array", + audio_base64_state="ag_audio_base64", + 
restore_state=AG_CONVERSATION_STATES + ) + conversation_section() \ No newline at end of file diff --git a/src/content/common.py b/src/content/common.py new file mode 100644 index 0000000000000000000000000000000000000000..81d678a59c6b417b6ef71df8844b7011bb1a2f27 --- /dev/null +++ b/src/content/common.py @@ -0,0 +1,591 @@ +import os +import copy +import base64 +import itertools +from collections import OrderedDict +from typing import List, Optional + +import numpy as np +import streamlit as st +import re +import requests + +from src.logger import load_logger +from src.utils import array_to_bytes, bytes_to_array, postprocess_voice_transcription +from src.generation import FIXED_GENERATION_CONFIG, MAX_AUDIO_LENGTH + +API_BASE_URL = os.getenv('API_BASE_URL') + +PLAYGROUND_DIALOGUE_STATES = dict( + pg_audio_base64='', + pg_audio_array=np.array([]), + pg_messages=[] +) + + +VOICE_CHAT_DIALOGUE_STATES = dict( + vc_audio_base64='', + vc_audio_array=np.array([]), + vc_messages=[], + vc_model_messages=[] +) + + +AGENT_DIALOGUE_STATES = dict( + ag_audio_base64='', + ag_audio_array=np.array([]), + ag_visited_query_indices=[], + ag_messages=[], + ag_model_messages=[] +) + + +COMMON_DIALOGUE_STATES = dict( + disprompt=False, + new_prompt="", + new_vi_array=np.array([]), + new_vi_base64="", + on_select=False, + on_upload=False, + on_record=False, + on_select_quick_action=False, + on_record_voice_instruction=False +) + + +DEFAULT_DIALOGUE_STATE_DICTS = [ + PLAYGROUND_DIALOGUE_STATES, + VOICE_CHAT_DIALOGUE_STATES, + AGENT_DIALOGUE_STATES, + COMMON_DIALOGUE_STATES +] + + +MODEL_NAMES = OrderedDict({ + "llm": { + "vllm_name": "MERaLiON-Gemma", + "model_name": "MERaLiON-Gemma", + "ui_name": "LLM" + }, + "audiollm": { + "vllm_name": "MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION-it", + "model_name": "MERaLiON-AudioLLM-Whisper-SEA-LION-it", + "ui_name": "AudioLLM" + } +}) + + +AUDIO_SAMPLES_W_INSTRUCT = { + "7_ASR_IMDA_PART3_30_ASR_v2_2269": { + "apperance": "7. Automatic Speech Recognition task: conversation in Singapore accent", + "instructions": [ + "Need this talk written down, please." + ] + }, + "11_ASR_IMDA_PART4_30_ASR_v2_3771": { + "apperance": "11. Automatic Speech Recognition task: conversation with Singlish code-switch", + "instructions": [ + "Write out the dialogue as text." + ] + }, + "12_ASR_IMDA_PART4_30_ASR_v2_103": { + "apperance": "12. Automatic Speech Recognition task: conversation with Singlish code-switch", + "instructions": [ + "Write out the dialogue as text." + ] + }, + "17_ASR_IMDA_PART6_30_ASR_v2_1413": { + "apperance": "17. Automatic Speech Recognition task: conversation in Singapore accent", + "instructions": [ + "Record the spoken word in text form." + ] + }, + "32_SQA_CN_COLLEDGE_ENTRANCE_ENGLISH_TEST_SQA_V2_572": { + "apperance": "32. Spoken Question Answering task: general speech", + "instructions": [ + "What does the man think the woman should do at 4:00." + ] + }, + "33_SQA_IMDA_PART3_30_SQA_V2_2310": { + "apperance": "33. Spoken Question Answering task: conversation in Singapore accent", + "instructions": [ + "Does Speaker2's wife cook for Speaker2 when they are at home." + ] + }, + "34_SQA_IMDA_PART3_30_SQA_V2_3621": { + "apperance": "34. Spoken Question Answering task: conversation in Singapore accent", + "instructions": [ + "Does the phrase \"#gai-gai#\" have a meaning in Chinese or Hokkien language." + ] + }, + "35_SQA_IMDA_PART3_30_SQA_V2_4062": { + "apperance": "35. 
Spoken Question Answering task: conversation in Singapore accent", + "instructions": [ + "What is the color of the vase mentioned in the dialogue." + ] + }, + "36_DS_IMDA_PART4_30_DS_V2_849": { + "apperance": "36. Spoken Dialogue Summarization task: conversation with Singlish code-switch", + "instructions": [ + "Condense the dialogue into a concise summary highlighting major topics and conclusions." + ] + }, + "39_Paralingual_IEMOCAP_ER_V2_91": { + "apperance": "39. Paralinguistics task: general speech", + "instructions": [ + "Based on the speaker's speech patterns, what do you think they are feeling." + ] + }, + "40_Paralingual_IEMOCAP_ER_V2_567": { + "apperance": "40. Paralinguistics task: general speech", + "instructions": [ + "Based on the speaker's speech patterns, what do you think they are feeling." + ] + }, + "42_Paralingual_IEMOCAP_GR_V2_320": { + "apperance": "42. Paralinguistics task: general speech", + "instructions": [ + "Is it possible for you to identify whether the speaker in this recording is male or female." + ] + }, + "47_Paralingual_IMDA_PART3_30_NR_V2_10479": { + "apperance": "47. Paralinguistics task: conversation in Singapore accent", + "instructions": [ + "Can you guess which ethnic group this person is from based on their accent." + ] + }, + "49_Paralingual_MELD_ER_V2_676": { + "apperance": "49. Paralinguistics task: general speech", + "instructions": [ + "What emotions do you think the speaker is expressing." + ] + }, + "50_Paralingual_MELD_ER_V2_692": { + "apperance": "50. Paralinguistics task: general speech", + "instructions": [ + "Based on the speaker's speech patterns, what do you think they are feeling." + ] + }, + "51_Paralingual_VOXCELEB1_GR_V2_2148": { + "apperance": "51. Paralinguistics task: general speech", + "instructions": [ + "May I know the gender of the speaker." + ] + }, + "53_Paralingual_VOXCELEB1_NR_V2_2286": { + "apperance": "53. Paralinguistics task: general speech", + "instructions": [ + "What's the nationality identity of the speaker." + ] + }, + "55_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_2": { + "apperance": "55. Spoken Question Answering task: general speech", + "instructions": [ + "What impact would the growth of the healthcare sector have on the country's economy in terms of employment and growth." + ] + }, + "56_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_415": { + "apperance": "56. Spoken Question Answering task: general speech", + "instructions": [ + "Based on the statement, can you summarize the speaker's position on the recent controversial issues in Singapore." + ] + }, + "57_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_460": { + "apperance": "57. Spoken Question Answering task: general speech", + "instructions": [ + "How does the author respond to parents' worries about masks in schools." + ] + }, + "1_ASR_IMDA_PART1_ASR_v2_141": { + "apperance": "1. Automatic Speech Recognition task: phonetically balanced reading", + "instructions": [ + "Turn the spoken language into a text format.", + "Please translate the content into Chinese." + ] + }, + "2_ASR_IMDA_PART1_ASR_v2_2258": { + "apperance": "2. Automatic Speech Recognition task: phonetically balanced reading", + "instructions": [ + "Turn the spoken language into a text format.", + "Please translate the content into Chinese." + ] + }, + "3_ASR_IMDA_PART1_ASR_v2_2265": { + "apperance": "3. Automatic Speech Recognition task: phonetically balanced reading", + "instructions": [ + "Turn the spoken language into a text format." + ] + }, + "4_ASR_IMDA_PART2_ASR_v2_999": { + "apperance": "4. 
Automatic Speech Recognition task: reading in Singapore context", + "instructions": [ + "Translate the spoken words into text format." + ] + }, + "5_ASR_IMDA_PART2_ASR_v2_2241": { + "apperance": "5. Automatic Speech Recognition task: reading in Singapore context", + "instructions": [ + "Translate the spoken words into text format." + ] + }, + "6_ASR_IMDA_PART2_ASR_v2_3409": { + "apperance": "6. Automatic Speech Recognition task: reading in Singapore context", + "instructions": [ + "Translate the spoken words into text format." + ] + }, + "8_ASR_IMDA_PART3_30_ASR_v2_1698": { + "apperance": "8. Automatic Speech Recognition task: conversation in Singapore accent", + "instructions": [ + "Need this talk written down, please." + ] + }, + "9_ASR_IMDA_PART3_30_ASR_v2_2474": { + "apperance": "9. Automatic Speech Recognition task: conversation in Singapore accent", + "instructions": [ + "Need this talk written down, please." + ] + }, + "10_ASR_IMDA_PART4_30_ASR_v2_1527": { + "apperance": "10. Automatic Speech Recognition task: conversation with Singlish code-switch", + "instructions": [ + "Write out the dialogue as text." + ] + }, + "13_ASR_IMDA_PART5_30_ASR_v2_1446": { + "apperance": "13. Automatic Speech Recognition task: conversation in Singapore accent", + "instructions": [ + "Translate this vocal recording into a textual format." + ] + }, + "14_ASR_IMDA_PART5_30_ASR_v2_2281": { + "apperance": "14. Automatic Speech Recognition task: conversation in Singapore accent", + "instructions": [ + "Translate this vocal recording into a textual format." + ] + }, + "15_ASR_IMDA_PART5_30_ASR_v2_4388": { + "apperance": "15. Automatic Speech Recognition task: conversation in Singapore accent", + "instructions": [ + "Translate this vocal recording into a textual format." + ] + }, + "16_ASR_IMDA_PART6_30_ASR_v2_576": { + "apperance": "16. Automatic Speech Recognition task: conversation in Singapore accent", + "instructions": [ + "Record the spoken word in text form." + ] + }, + "18_ASR_IMDA_PART6_30_ASR_v2_2834": { + "apperance": "18. Automatic Speech Recognition task: conversation in Singapore accent", + "instructions": [ + "Record the spoken word in text form." + ] + }, + "19_ASR_AIShell_zh_ASR_v2_5044": { + "apperance": "19. Automatic Speech Recognition task: speech in Chinese ", + "instructions": [ + "Transform the oral presentation into a text document." + ] + }, + "20_ASR_LIBRISPEECH_CLEAN_ASR_V2_833": { + "apperance": "20. Automatic Speech Recognition task: general speech", + "instructions": [ + "Please provide a written transcription of the speech." + ] + }, + "25_ST_COVOST2_ZH-CN_EN_ST_V2_4567": { + "apperance": "25. Speech Translation task: Chinese to English", + "instructions": [ + "Please translate the given speech to English." + ] + }, + "26_ST_COVOST2_EN_ZH-CN_ST_V2_5422": { + "apperance": "26. Speech Translation task: English to Chinese", + "instructions": [ + "Please translate the given speech to Chinese." + ] + }, + "27_ST_COVOST2_EN_ZH-CN_ST_V2_6697": { + "apperance": "27. Speech Translation task: English to Chinese", + "instructions": [ + "Please translate the given speech to Chinese." + ] + }, + "28_SI_ALPACA-GPT4-AUDIO_SI_V2_299": { + "apperance": "28. Speech Instruction task: general speech", + "instructions": [ + "Please follow the instruction in the speech." + ] + }, + "29_SI_ALPACA-GPT4-AUDIO_SI_V2_750": { + "apperance": "29. Speech Instruction task: general speech", + "instructions": [ + "Please follow the instruction in the speech." 
+ ] + }, + "30_SI_ALPACA-GPT4-AUDIO_SI_V2_1454": { + "apperance": "30. Speech Instruction task: general speech", + "instructions": [ + "Please follow the instruction in the speech." + ] + } +} + + +# exec(os.getenv('APP_CONFIGS')) + + +def reset_states(*state_dicts): + for states in state_dicts: + st.session_state.update(copy.deepcopy(states)) + st.session_state.update(copy.deepcopy(COMMON_DIALOGUE_STATES)) + + +def process_audio_bytes(audio_bytes): + origin_audio_array = bytes_to_array(audio_bytes) + truncated_audio_array = origin_audio_array[: MAX_AUDIO_LENGTH*16000] + truncated_audio_bytes = array_to_bytes(truncated_audio_array) + audio_base64 = base64.b64encode(truncated_audio_bytes).decode('utf-8') + + return origin_audio_array, audio_base64 + + +def update_voice_instruction_state(voice_bytes): + st.session_state.new_vi_array, st.session_state.new_vi_base64 = \ + process_audio_bytes(voice_bytes) + + +def init_state_section(): + st.set_page_config(page_title='MERaLiON-AudioLLM', page_icon = "🔥", layout='wide') + + st.markdown( + ( + '' + ), + unsafe_allow_html=True + ) + + if "logger" not in st.session_state: + st.session_state.logger = load_logger() + st.session_state.session_id = st.session_state.logger.register_session() + + # if "server" not in st.session_state: + # st.session_state.server = start_server() + + # if "client_mapper" not in st.session_state: + # st.session_state.client_mapper = load_model() + + # if "retriever" not in st.session_state: + # st.session_state.retriever = load_retriever() + + for key, value in FIXED_GENERATION_CONFIG.items(): + if key not in st.session_state: + st.session_state[key]=copy.deepcopy(value) + + for states in DEFAULT_DIALOGUE_STATE_DICTS: + for key, value in states.items(): + if key not in st.session_state: + st.session_state[key]=copy.deepcopy(value) + + +def header_section(component_name, description="", concise_description="", icon="🤖"): + st.markdown( + f"

# MERaLiON-AudioLLM {component_name} {icon}", + unsafe_allow_html=True + ) + + st.markdown( + f"""
This {component_name.lower()} is based on + MERaLiON-AudioLLM, + developed by I2R, A*STAR, in collaboration with AISG, Singapore. + {description}
""", + unsafe_allow_html=True + ) + + st.markdown( + f"""
This {component_name.lower()} is based on + MERaLiON-AudioLLM.{concise_description}
""", + unsafe_allow_html=True + ) + + +@st.fragment +def sidebar_fragment(): + with st.container(height=256, border=False): + st.page_link("pages/playground.py", disabled=st.session_state.disprompt, label="🚀 Playground") + st.page_link("pages/agent.py", disabled=st.session_state.disprompt, label="👥 Chatbot") + st.page_link("pages/voice_chat.py", disabled=st.session_state.disprompt, label="🗣️ Voice Chat") + + st.divider() + + st.slider(label='Temperature', min_value=0.0, max_value=2.0, value=0.1, key='temperature') + + st.slider(label='Top P', min_value=0.0, max_value=1.0, value=0.9, key='top_p') + + st.slider(label="Repetition Penalty", min_value=1.0, max_value=1.2, value=1.1, key="repetition_penalty") + + +@st.fragment +def successful_example_section(audio_sample_names, audio_array_state, audio_base64_state, restore_state={}): + st.markdown(":fire: **Successful Tasks and Examples**") + + sample_name = st.selectbox( + label="**Select Audio:**", + label_visibility="collapsed", + options=audio_sample_names, + format_func=lambda o: AUDIO_SAMPLES_W_INSTRUCT[o]["apperance"], + index=None, + placeholder="Select an audio sample:", + on_change=lambda: st.session_state.update( + on_select=True, + disprompt=True, + **copy.deepcopy(restore_state) + ), + key='select') + + if sample_name and st.session_state.on_select: + audio_bytes = open(f"audio_samples/{sample_name}.wav", "rb").read() + st.session_state.update( + on_select=False, + new_prompt=AUDIO_SAMPLES_W_INSTRUCT[sample_name]["instructions"][0] + ) + st.session_state[audio_array_state], st.session_state[audio_base64_state] = \ + process_audio_bytes(audio_bytes) + st.rerun(scope="app") + + +@st.dialog("Specify audio context for analysis") +def audio_attach_dialogue(audio_array_state, audio_base64_state, restore_state={}): + st.markdown("**Upload**") + + uploaded_file = st.file_uploader( + label="**Upload Audio:**", + label_visibility="collapsed", + type=['wav', 'mp3'], + on_change=lambda: st.session_state.update( + on_upload=True, + **copy.deepcopy(restore_state) + ), + key='upload' + ) + + if uploaded_file and st.session_state.on_upload: + audio_bytes = uploaded_file.read() + st.session_state[audio_array_state], st.session_state[audio_base64_state] = \ + process_audio_bytes(audio_bytes) + st.session_state.on_upload = False + st.rerun() + + st.markdown("**Record**") + + uploaded_file = st.audio_input( + label="**Record Audio:**", + label_visibility="collapsed", + on_change=lambda: st.session_state.update( + on_record=True, + **copy.deepcopy(restore_state) + ), + key='record' + ) + + if uploaded_file and st.session_state.on_record: + audio_bytes = uploaded_file.read() + st.session_state[audio_array_state], st.session_state[audio_base64_state] = \ + process_audio_bytes(audio_bytes) + st.session_state.on_record = False + st.rerun() + + +def retrive_response_with_ui( + model_name: str, + text_input: str, + array_audio_input: np.ndarray, + base64_audio_input: str, + prefix: str = "", + stream: bool = True, + normalise_response: bool = False, + history: Optional[List] = None, + show_warning: bool = True, + **kwargs + ): + + if history is None: + history = [] + + # Prepare request data + request_data = { + "text_input": str(text_input), + "model_name": str(model_name), + "array_audio_input": array_audio_input.tolist(), # Convert numpy array to list + "base64_audio_input": str(base64_audio_input) if base64_audio_input else None, + "history": list(history) if history else None, + "stream": bool(stream), + "max_completion_tokens": 
int(st.session_state.max_completion_tokens), + "temperature": float(st.session_state.temperature), + "top_p": float(st.session_state.top_p), + "repetition_penalty": float(st.session_state.repetition_penalty), + "top_k": int(st.session_state.top_k), + "length_penalty": float(st.session_state.length_penalty), + "seed": int(st.session_state.seed), + "extra_params": {} + } + + # print(request_data) + print(model_name) + + error_msg = "" + warnings = [] + response = "" + + try: + if stream: + # Streaming response + response_stream = requests.post(f"{API_BASE_URL}chat", json=request_data, stream=True) + response_stream.raise_for_status() + + response_obj = itertools.chain([prefix], (chunk.decode() for chunk in response_stream)) + response = st.write_stream(response_obj) + else: + # Non-streaming response + api_response = requests.post(f"{API_BASE_URL}chat", json=request_data) + api_response.raise_for_status() + result = api_response.json() + + if "warnings" in result: + warnings = result["warnings"] + + response = result.get("response", "") + if normalise_response: + response = postprocess_voice_transcription(response) + response = prefix + response + st.write(response) + + except requests.exceptions.RequestException as e: + error_msg = f"API request failed: {str(e)}" + st.error(error_msg) + + if show_warning: + for warning_msg in warnings: + st.warning(warning_msg) + + st.session_state.logger.register_query( + session_id=st.session_state.session_id, + base64_audio=base64_audio_input, + text_input=text_input, + history=history, + params=request_data["extra_params"], + response=response, + warnings=warnings, + error_msg=error_msg + ) + + return error_msg, warnings, response \ No newline at end of file diff --git a/src/content/playground.py b/src/content/playground.py new file mode 100644 index 0000000000000000000000000000000000000000..f4416039d7095750dd5968325b6f3449e673b0df --- /dev/null +++ b/src/content/playground.py @@ -0,0 +1,229 @@ +import numpy as np +import streamlit as st + +from src.content.common import ( + MODEL_NAMES, + AUDIO_SAMPLES_W_INSTRUCT, + PLAYGROUND_DIALOGUE_STATES, + reset_states, + update_voice_instruction_state, + init_state_section, + header_section, + sidebar_fragment, + successful_example_section, + audio_attach_dialogue, + retrive_response_with_ui +) + + +QUICK_ACTIONS = [ + { + "name": "**Summary**", + "instruction": "Please summarise this speech.", + "width": 10, + }, + { + "name": "**Transcript**", + "instruction": "Please transcribe this speech.", + "width": 9.5, + } +] + + +PG_CONVERSATION_STATES = dict( + pg_messages=[], +) + + +@st.fragment +def select_model_variants_fradment(): + display_mapper = { + value["vllm_name"]: value["ui_name"] + for key, value in MODEL_NAMES.items() + if "audiollm" in key + } + + st.selectbox( + label=":fire: Explore more MERaLiON-AudioLLM variants!", + options=list(display_mapper.keys()), + index=0, + format_func=lambda o: display_mapper[o], + key="pg_model_name", + placeholder=":fire: Explore more MERaLiON-AudioLLM variants!", + disabled=st.session_state.disprompt, + ) + + +def bottom_input_section(): + select_model_variants_fradment() + + bottom_cols = st.columns([0.03, 0.03, 0.91, 0.03]) + with bottom_cols[0]: + st.button( + ':material/delete:', + disabled=st.session_state.disprompt, + on_click=lambda: reset_states(PLAYGROUND_DIALOGUE_STATES) + ) + + with bottom_cols[1]: + if st.button(":material/add:", disabled=st.session_state.disprompt): + audio_attach_dialogue( + audio_array_state="pg_audio_array", + 
audio_base64_state="pg_audio_base64", + restore_state=PG_CONVERSATION_STATES + ) + + with bottom_cols[2]: + if chat_input := st.chat_input( + placeholder="Instruction...", + disabled=st.session_state.disprompt, + on_submit=lambda: st.session_state.update( + disprompt=True, + **PG_CONVERSATION_STATES + ) + ): + st.session_state.new_prompt = chat_input + + with bottom_cols[3]: + uploaded_voice = st.audio_input( + label="voice_instruction", + label_visibility="collapsed", + disabled=st.session_state.disprompt, + on_change=lambda: st.session_state.update( + disprompt=True, + on_record_voice_instruction=True, + **PG_CONVERSATION_STATES + ), + key='voice_instruction' + ) + + if uploaded_voice and st.session_state.on_record_voice_instruction: + voice_bytes = uploaded_voice.read() + update_voice_instruction_state(voice_bytes) + st.session_state.on_record_voice_instruction = False + + +@st.fragment +def quick_actions_fragment(): + action_cols_spec = [_["width"] for _ in QUICK_ACTIONS] + action_cols = st.columns(action_cols_spec) + + for idx, action in enumerate(QUICK_ACTIONS): + action_cols[idx].button( + action["name"], + args=(action["instruction"],), + disabled=st.session_state.disprompt, + on_click=lambda p: st.session_state.update( + disprompt=True, + pg_messages=[], + new_prompt=p, + on_select_quick_action=True + ) + ) + + if st.session_state.on_select_quick_action: + st.session_state.on_select_quick_action = False + st.rerun(scope="app") + + +def conversation_section(): + if st.session_state.pg_audio_array.size: + with st.chat_message("user"): + st.audio(st.session_state.pg_audio_array, format="audio/wav", sample_rate=16000) + quick_actions_fragment() + + for message in st.session_state.pg_messages: + with st.chat_message(message["role"]): + if message.get("error"): + st.error(message["error"]) + for warning_msg in message.get("warnings", []): + st.warning(warning_msg) + if message.get("content"): + st.write(message["content"]) + + with st._bottom: + bottom_input_section() + + if (not st.session_state.new_prompt) and (not st.session_state.new_vi_base64): + return + + one_time_prompt = st.session_state.new_prompt + one_time_vi_array = st.session_state.new_vi_array + one_time_vi_base64 = st.session_state.new_vi_base64 + + st.session_state.update( + new_prompt="", + new_vi_array=np.array([]), + new_vi_base64="", + pg_messages=[] + ) + + with st.chat_message("user"): + if one_time_vi_base64: + with st.spinner("Transcribing..."): + error_msg, warnings, one_time_prompt = retrive_response_with_ui( + model_name=MODEL_NAMES["audiollm"]["vllm_name"], + text_input="Write out the dialogue as text.", + array_audio_input=one_time_vi_array, + base64_audio_input=one_time_vi_base64, + stream=False, + normalise_response=True + ) + else: + error_msg, warnings = "", [] + st.write(one_time_prompt) + + st.session_state.pg_messages.append({ + "role": "user", + "error": error_msg, + "warnings": warnings, + "content": one_time_prompt + }) + + with st.chat_message("assistant"): + with st.spinner("Thinking..."): + error_msg, warnings, response = retrive_response_with_ui( + model_name=st.session_state.pg_model_name, + text_input=one_time_prompt, + array_audio_input=st.session_state.pg_audio_array, + base64_audio_input=st.session_state.pg_audio_base64, + stream=True + ) + + st.session_state.pg_messages.append({ + "role": "assistant", + "error": error_msg, + "warnings": warnings, + "content": response + }) + + st.session_state.disprompt=False + st.rerun(scope="app") + + +def playground_page(): + init_state_section() + 
header_section( + component_name="Playground", + description=""" It is tailored for Singapore’s multilingual and multicultural landscape. + MERaLiON-AudioLLM supports + Automatic Speech Recognition, + Speech Translation, + Spoken Question Answering, + Spoken Dialogue Summarization, + Speech Instruction, and + Paralinguistics tasks.""", + concise_description="" + ) + + with st.sidebar: + sidebar_fragment() + + audio_sample_names = [name for name in AUDIO_SAMPLES_W_INSTRUCT.keys()] + successful_example_section( + audio_sample_names, + audio_array_state="pg_audio_array", + audio_base64_state="pg_audio_base64", + restore_state=PG_CONVERSATION_STATES + ) + conversation_section() \ No newline at end of file diff --git a/src/content/voice_chat.py b/src/content/voice_chat.py new file mode 100644 index 0000000000000000000000000000000000000000..3d0237f47b61a90e2140ca75461a9575bafcee11 --- /dev/null +++ b/src/content/voice_chat.py @@ -0,0 +1,154 @@ +import numpy as np +import streamlit as st + +from src.generation import ( + prepare_multimodal_content, + change_multimodal_content +) +from src.content.common import ( + MODEL_NAMES, + VOICE_CHAT_DIALOGUE_STATES, + reset_states, + process_audio_bytes, + init_state_section, + header_section, + sidebar_fragment, + retrive_response_with_ui +) + + +# TODO: change this. +DEFAULT_PROMPT = "Based on the information in this user’s voice, please reply to the user in a friendly and helpful way." +MAX_VC_ROUNDS = 5 + + +def bottom_input_section(): + bottom_cols = st.columns([0.03, 0.97]) + with bottom_cols[0]: + st.button( + ':material/delete:', + disabled=st.session_state.disprompt, + on_click=lambda: reset_states(VOICE_CHAT_DIALOGUE_STATES) + ) + + with bottom_cols[1]: + uploaded_file = st.audio_input( + label="record audio", + label_visibility="collapsed", + disabled=st.session_state.disprompt, + on_change=lambda: st.session_state.update( + on_record=True, + disprompt=True + ), + key='record' + ) + + if uploaded_file and st.session_state.on_record: + audio_bytes = uploaded_file.read() + st.session_state.vc_audio_array, st.session_state.vc_audio_base64 = \ + process_audio_bytes(audio_bytes) + st.session_state.update( + on_record=False, + ) + + +@st.fragment +def system_prompt_fragment(): + with st.expander("System Prompt"): + st.text_area( + label="Insert system instructions or background knowledge here.", + label_visibility="collapsed", + disabled=st.session_state.disprompt, + max_chars=5000, + key="system_prompt", + value=DEFAULT_PROMPT, + ) + + +def conversation_section(): + chat_message_container = st.container(height=480) + for message in st.session_state.vc_messages: + with chat_message_container.chat_message(message["role"]): + if message.get("error"): + st.error(message["error"]) + for warning_msg in message.get("warnings", []): + st.warning(warning_msg) + if message.get("audio", np.array([])).shape[0]: + st.audio(message["audio"], format="audio/wav", sample_rate=16000) + if message.get("content"): + st.write(message["content"]) + + with st._bottom: + bottom_input_section() + + if not st.session_state.vc_audio_base64: + return + + if len(st.session_state.vc_messages) >= MAX_VC_ROUNDS * 2: + st.toast(f":warning: max conversation rounds ({MAX_VC_ROUNDS}) reached!") + return + + one_time_prompt = DEFAULT_PROMPT + one_time_array = st.session_state.vc_audio_array + one_time_base64 = st.session_state.vc_audio_base64 + st.session_state.update( + vc_audio_array=np.array([]), + vc_audio_base64="", + ) + + with chat_message_container.chat_message("user"): + 
st.audio(one_time_array, format="audio/wav", sample_rate=16000) + + st.session_state.vc_messages.append({"role": "user", "audio": one_time_array}) + + if not st.session_state.vc_model_messages: + one_time_prompt = st.session_state.system_prompt + else: + st.session_state.vc_model_messages[0]["content"] = change_multimodal_content( + st.session_state.vc_model_messages[0]["content"], + text_input=st.session_state.system_prompt + ) + + with chat_message_container.chat_message("assistant"): + with st.spinner("Thinking..."): + error_msg, warnings, response = retrive_response_with_ui( + model_name=MODEL_NAMES["audiollm"]["vllm_name"], + text_input=one_time_prompt, + array_audio_input=one_time_array, + base64_audio_input=one_time_base64, + stream=True, + history=st.session_state.vc_model_messages + ) + + st.session_state.vc_messages.append({ + "role": "assistant", + "error": error_msg, + "warnings": warnings, + "content": response + }) + + mm_content = prepare_multimodal_content(one_time_prompt, one_time_base64) + st.session_state.vc_model_messages.extend([ + {"role": "user", "content": mm_content}, + {"role": "assistant", "content": response} + ]) + + st.session_state.disprompt=False + st.rerun(scope="app") + + +def voice_chat_page(): + init_state_section() + header_section( + component_name="Voice Chat", + description=""" Currently supports up to 5 rounds of conversations. + Feel free to talk about anything.""", + concise_description=" Currently supports up to 5 rounds of conversations.", + icon="🗣️" + ) + + with st.sidebar: + sidebar_fragment() + + system_prompt_fragment() + conversation_section() \ No newline at end of file diff --git a/src/exceptions.py b/src/exceptions.py new file mode 100644 index 0000000000000000000000000000000000000000..a939af4bd90c5f30609d62b695c305dab1d6c674 --- /dev/null +++ b/src/exceptions.py @@ -0,0 +1,2 @@ +class NoAudioException(Exception): + pass \ No newline at end of file diff --git a/src/generation.py b/src/generation.py new file mode 100644 index 0000000000000000000000000000000000000000..debef67a3c05081a8cf69142554b8f1b760ef5e4 --- /dev/null +++ b/src/generation.py @@ -0,0 +1,47 @@ +FIXED_GENERATION_CONFIG = dict( + max_completion_tokens=1024, + top_k=50, + length_penalty=1.0, + seed=42 +) + +MAX_AUDIO_LENGTH = 120 + + + +def prepare_multimodal_content(text_input, base64_audio_input): + return [ + { + "type": "text", + "text": f"Text instruction: {text_input}" + }, + { + "type": "audio_url", + "audio_url": { + "url": f"data:audio/ogg;base64,{base64_audio_input}" + }, + }, + ] + + +def change_multimodal_content( + original_content, + text_input="", + base64_audio_input=""): + + # Since python 3.7 dictionary is ordered. 
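+    # prepare_multimodal_content builds a two-element list with the text part at index 0 and the audio part at index 1, so the entries are updated by position here.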
+ if text_input: + original_content[0] = { + "type": "text", + "text": f"Text instruction: {text_input}" + } + + if base64_audio_input: + original_content[1] = { + "type": "audio_url", + "audio_url": { + "url": f"data:audio/ogg;base64,{base64_audio_input}" + } + } + + return original_content diff --git a/src/logger.py b/src/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..426bee2e36d246e79fd1e990c9181893987b9ef7 --- /dev/null +++ b/src/logger.py @@ -0,0 +1,111 @@ +import io +import os +import time +import json +from threading import Thread, Lock + +import streamlit as st +from huggingface_hub import HfApi + +from src.utils import get_current_strftime + + +logger_lock = Lock() + + +def threaded(fn): + def wrapper(*args, **kwargs): + thread = Thread(target=fn, args=args, kwargs=kwargs) + thread.start() + return thread + return wrapper + + +class Logger: + def __init__(self): + self.app_id = get_current_strftime() + self.session_increment = 0 + self.query_increment = 0 + self.sync_interval = 180 + + self.session_data = [] + self.query_data = [] + self.audio_data = [] + + self.sync_data() + + def register_session(self) -> str: + new_session_id = f"{self.app_id}+{self.session_increment}" + with logger_lock: + self.session_data.append({ + "session_id": new_session_id, + "creation_time": get_current_strftime() + }) + + self.session_increment += 1 + return new_session_id + + def register_query(self, + session_id, + base64_audio, + text_input, + response, + **kwargs + ): + new_query_id = self.query_increment + current_time = get_current_strftime() + + with logger_lock: + current_query_data = { + "session_id": session_id, + "query_id": new_query_id, + "creation_time": current_time, + "text": text_input, + "response": response, + } + current_query_data.update(kwargs) + self.query_data.append(current_query_data) + + self.audio_data.append({ + "session_id": session_id, + "query_id": new_query_id, + "creation_time": current_time, + "audio": base64_audio, + }) + self.query_increment += 1 + + + @threaded + def sync_data(self): + api = HfApi() + + while True: + time.sleep(self.sync_interval) + + for data_name in ["session_data", "query_data", "audio_data"]: + with logger_lock: + last_data = getattr(self, data_name, []) + setattr(self, data_name, []) + + if not last_data: + continue + + buffer = io.BytesIO() + for row in last_data: + row_str = json.dumps(row, ensure_ascii=False)+"\n" + buffer.write(row_str.encode("utf-8")) + + api.upload_file( + path_or_fileobj=buffer, + path_in_repo=f"{data_name}/{get_current_strftime()}.json", + repo_id=os.getenv("LOGGING_REPO_NAME"), + repo_type="dataset", + token=os.getenv('HF_TOKEN') + ) + + buffer.close() + + +@st.cache_resource() +def load_logger(): + return Logger() \ No newline at end of file diff --git a/src/retrieval.py b/src/retrieval.py new file mode 100644 index 0000000000000000000000000000000000000000..cf033e938cd527c0533329ffc41e434912c78786 --- /dev/null +++ b/src/retrieval.py @@ -0,0 +1,38 @@ +STANDARD_QUERIES = [ + { + "query_text": "Please transcribe this speech.", + "doc_text": "Listen to a speech and write down exactly what is being said in text form. It's essentially converting spoken words into written words. Provide the exact transcription of the given audio. 
Record whatever the speaker has said into written text.", + "response_prefix_text": "The transcription of the speech is: ", + "ui_text": "speech transcription" + }, + { + "query_text": "Please describe what happened in this audio.", + "doc_text": "Text captions describing the sound events and environments in the audio clips, describing the events and actions that happened in the audio.", + "response_prefix_text": "Events in this audio clip: ", + "ui_text": "audio caption" + }, + { + "query_text": "May I know the gender of the speakers?", + "doc_text": "Identify the gender, male or female, based on pitch, formants, harmonics, and prosody features, and other speech pattern differences between genders.", + "response_prefix_text": "By analyzing pitch, formants, harmonics, and prosody features, which reflect physiological and speech pattern differences between genders: ", + "ui_text": "gender recognition" + }, + { + "query_text": "May I know the nationality of the speakers?", + "doc_text": "Discover the speakers' nationality, country, or place of origin from their accent, pronunciation patterns, and other language-specific speech features influenced by cultural and linguistic backgrounds.", + "response_prefix_text": "By analyzing accent, pronunciation patterns, intonation, rhythm, phoneme usage, and language-specific speech features influenced by cultural and linguistic backgrounds: ", + "ui_text": "nationality recognition" + }, + { + "query_text": "Can you guess which ethnic group this person is from based on their accent?", + "doc_text": "Discover the speakers' ethnic group, home country, or place of origin from their accent, tone, and other vocal characteristics influenced by cultural, regional, and linguistic factors.", + "response_prefix_text": "By analyzing speech features like accent, tone, intonation, phoneme variations, and vocal characteristics influenced by cultural, regional, and linguistic factors: ", + "ui_text": "ethnic group recognition" + }, + { + "query_text": "What do you think the speakers are feeling?", + "doc_text": "What do you think the speakers are feeling? 
Please identify speakers' emotions by analyzing vocal features like pitch, tone, volume, speech rate, rhythm, and spectral energy, which reflect emotional states such as happiness, anger, sadness, or fear.", + "response_prefix_text": "By analyzing vocal features like pitch, tone, volume, speech rate, rhythm, and spectral energy: ", + "ui_text": "emotion recognition" + }, +] \ No newline at end of file diff --git a/src/utils.py b/src/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..efd7011f2abc1ada13463d7dec3945f2f991ed76 --- /dev/null +++ b/src/utils.py @@ -0,0 +1,31 @@ +import io +import re +from datetime import datetime +from scipy.io.wavfile import write + +import librosa + + +def get_current_strftime(): + return datetime.now().strftime(r'%d-%m-%y-%H-%M-%S') + + +def bytes_to_array(audio_bytes): + audio_array, _ = librosa.load( + io.BytesIO(audio_bytes), + sr=16000 + ) + return audio_array + + +def array_to_bytes(audio_array): + bytes_wav = bytes() + byte_io = io.BytesIO(bytes_wav) + write(byte_io, 16000, audio_array) + return byte_io.read() + + +def postprocess_voice_transcription(text): + text = re.sub("<.*>:?|\(.*\)|\[.*\]", "", text) + text = re.sub("\s+", " ", text).strip() + return text \ No newline at end of file diff --git a/style/app_style.css b/style/app_style.css new file mode 100644 index 0000000000000000000000000000000000000000..405194228b62cf9c87586085fd4a174ddf37e20c --- /dev/null +++ b/style/app_style.css @@ -0,0 +1,150 @@ +div[data-testid="stMainBlockContainer"] { + padding-top: 2rem; + padding-bottom: 1rem; +} + +div[data-testid="stMainBlockContainer"]:has( div[height="480"][data-testid="stVerticalBlockBorderWrapper"]) { + height: calc(100% - 90px); +} + +div[data-testid="stMainBlockContainer"]>div[data-testid="stVerticalBlockBorderWrapper"] { + height: 100%; +} + +div[data-testid="stMainBlockContainer"]>div[data-testid="stVerticalBlockBorderWrapper"]>div { + height: 100%; +} + +div[data-testid="stMainBlockContainer"]>div[data-testid="stVerticalBlockBorderWrapper"]>div>div { + height: 100%; +} + +div[data-testid="stMainBlockContainer"] div[data-testid="stAudioInput"]>div { + max-height: 3rem; +} + +div[data-testid="stMainBlockContainer"] h1 { + padding-top: 0.25rem; +} + +div[class="sidebar-intro"] p { + margin-bottom: 0.75rem; +} + +[class='stAudio'] { + max-width: 500px !important; + margin: auto !important; +} + +div[data-testid="stChatMessage"]:has(> div[data-testid="stChatMessageAvatarUser"]) { + flex-direction: row-reverse; + text-align: right; +} + +div[height="480"][data-testid="stVerticalBlockBorderWrapper"] { + height: 100%; + min-height: 20px; +} + +/* audio quick actions */ + +div[data-testid="stChatMessage"] div[data-testid="stVerticalBlock"]:has( audio[data-testid="stAudio"]) { + gap: 2px; +} + +div[data-testid="stChatMessage"] div[data-testid="stHorizontalBlock"]:has(> div[data-testid="stColumn"]) { + flex-direction: row-reverse; + gap: 4px; +} + +div[data-testid="stChatMessage"] div[data-testid="stHorizontalBlock"]>div[data-testid="stColumn"]:has( div[data-testid="stButton"]) { + width: 6rem; + min-width: 6rem; + flex: 0 0 6rem; +} + +/* File uploader */ + +section[data-testid='stFileUploaderDropzone'] { + padding:6px 2rem; +} + +section[data-testid='stFileUploaderDropzone']>button { + display:none; +} + +div[data-testid="stFileUploaderDropzoneInstructions"]>div>span { + display:none; +} + +div[data-testid="stBottomBlockContainer"] { + padding-bottom: 2rem; +} + +/* Chat input component at the bottom */ + 
+div[data-testid="stBottomBlockContainer"] div[data-testid="stHorizontalBlock"]:has(> div[data-testid="stColumn"]) { + gap: 4px; +} + +div[data-testid="stBottomBlockContainer"] div[data-testid="stColumn"]:has( div[data-testid="stButton"]):first-of-type { + width: 42px; + min-width: 42px; + flex: 0 0 42px; +} + +div[data-testid="stBottomBlockContainer"] div[data-testid="stColumn"]:has( div[data-testid="stButton"]):nth-of-type(2) { + width: 42px; + min-width: 42px; + flex: 0 0 42px; +} + +div[data-testid="stBottomBlockContainer"] div[data-testid="stColumn"]:has( div[data-testid="stChatInput"]) { + width: 10rem; + min-width: 10rem; + flex: 1 1 10rem; +} + +div[data-testid="stBottomBlockContainer"] div[data-testid="stColumn"]:has( div[data-testid="stAudioInput"]) { + width: 10rem; + min-width: 10rem; + flex: 1 1 10rem; +} + +div[data-testid="stBottomBlockContainer"] div[data-testid="stAudioInput"]>div { + max-height: 40px; +} + +/* Mic Button */ + +div[data-testid="stBottomBlockContainer"]:has( div[data-testid="stChatInput"]) div[data-testid="stAudioInput"]>div { + display: block; + padding: 0; + margin: auto; +} + +div[data-testid="stBottomBlockContainer"]:has( div[data-testid="stChatInput"]) div[data-testid="stAudioInput"]>div>div:last-of-type { + display:none; +} + +div[data-testid="stBottomBlockContainer"]:has( div[data-testid="stChatInput"]) div[data-testid="stAudioInput"]>div>div:nth-of-type(2) { + margin:auto; +} + +div[data-testid="stBottomBlockContainer"]:has( div[data-testid="stChatInput"]) div[data-testid="stAudioInput"]>div>div:nth-of-type(2)>span:last-of-type { + display:none; +} + +div[data-testid="stBottomBlockContainer"]:has( div[data-testid="stChatInput"]) div[data-testid="stAudioInput"]>div>div:nth-of-type(2)>span:only-of-type { + display:block; +} + +div[data-testid="stBottomBlockContainer"]:has( div[data-testid="stChatInput"]) div[data-testid="stAudioInput"]>div>span { + display:none; +} + +div[data-testid="stBottomBlockContainer"]:has( div[data-testid="stChatInput"]) div[data-testid="stColumn"]:has( div[data-testid="stAudioInput"]) { + width: 24px; + min-width: 24px; + flex: 0 0 24px; +} \ No newline at end of file diff --git a/style/normal_window.css b/style/normal_window.css new file mode 100644 index 0000000000000000000000000000000000000000..f66ffb8ca6da843c656cdb7ec5f6b4fac2283552 --- /dev/null +++ b/style/normal_window.css @@ -0,0 +1,18 @@ +@media(min-width: 800px) { + div[data-testid="stMainBlockContainer"] { + padding-left: 5rem; + padding-right: 5rem; + } + + div[data-testid="stBottomBlockContainer"] { + padding-left: 5rem; + padding-right: 5rem; + } +} + + +@media(min-width: 800px) and (min-height: 800px) { + div[class="main-intro-small-window"] { + display: none; + } +} \ No newline at end of file diff --git a/style/small_window.css b/style/small_window.css new file mode 100644 index 0000000000000000000000000000000000000000..63875f8d6a6ef7872462517ddc5cdee3859ec342 --- /dev/null +++ b/style/small_window.css @@ -0,0 +1,25 @@ +@media(max-width: 800px) { + div[data-testid="stMainBlockContainer"] { + padding-left: 1rem; + padding-right: 1rem; + } + + div[data-testid="stBottomBlockContainer"] { + padding-left: 1rem; + padding-right: 1rem; + } + + div[data-testid="stSidebarCollapsedControl"] button[data-testid="stBaseButton-headerNoPadding"]::after { + content: "More Use Cases" + } +} + +@media(max-width: 800px) or (max-height: 800px) { + div[data-testid="stMainBlockContainer"] div[data-testid="stVerticalBlock"]>div[data-testid="stElementContainer"]:has( 
div[data-testid="stHeadingWithActionElements"]) { + display: none; + } + + div[class="main-intro-normal-window"] { + display: none; + } +} \ No newline at end of file
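For reference, a minimal sketch (not part of the diff) of how the multimodal payload built by src/generation.py's prepare_multimodal_content could be sent to the serving backend directly. It assumes an OpenAI-compatible server (e.g. vLLM) that accepts "audio_url" content parts and whose base URL is available in an API_BASE_URL environment variable; the served model name, the sample path, and the sampling values are illustrative placeholders and do not reproduce the app's actual retrive_response_with_ui call.

import base64
import os

from openai import OpenAI

from src.generation import prepare_multimodal_content

# Assumption: an OpenAI-compatible endpoint (e.g. vLLM) is exposed at API_BASE_URL.
client = OpenAI(
    base_url=os.environ["API_BASE_URL"],
    api_key=os.getenv("API_KEY", "EMPTY"),  # many self-hosted servers ignore the key
)

# "sample.wav" is a placeholder path; the app itself records audio, converts it
# with utils.array_to_bytes, and base64-encodes the resulting bytes.
with open("sample.wav", "rb") as f:
    base64_audio = base64.b64encode(f.read()).decode("utf-8")

# Build the same [text, audio_url] content list the voice chat stores in
# st.session_state.vc_model_messages.
content = prepare_multimodal_content("Please transcribe this speech.", base64_audio)

response = client.chat.completions.create(
    model="<served-model-name>",  # the app passes MODEL_NAMES["audiollm-it"]["vllm_name"]
    messages=[{"role": "user", "content": content}],
    max_tokens=1024,  # mirrors FIXED_GENERATION_CONFIG["max_completion_tokens"]
    seed=42,          # mirrors FIXED_GENERATION_CONFIG["seed"]
)
print(response.choices[0].message.content)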