KurtDu committed on
Commit
6ca668d
·
verified ·
1 Parent(s): a79b208

Upload 21 files

Browse files
data/crosstalk.json ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": "crosstalk_audio_0",
4
+ "input_path": "/input/crosstalk/audio_0.mp3",
5
+ "text": "请说一段经典的相声,题材可以是关于医生和病人的趣事。",
6
+ "task": "Crosstalk ability",
7
+ "task_description": "Can the model perform a skit, playing both roles in a comedic dialogue?",
8
+ "output_path_4o": "/output/ChatGPT-4o/crosstalk/audio_0/audio_0.wav",
9
+ "output_path_miniomni": "/output/Mini-Omni/crosstalk/00.wav",
10
+ "output_path_speechgpt": "/output/SpeechGPT/crosstalk/answer_0.wav",
11
+ "output_path_funaudio": "/output/FunAudioLLM/crosstalk/audio_0.wav",
12
+ "language": "Chinese",
13
+ "category": "Entertainment",
14
+ "output_path_4o_cascade": "/output/cascade/crosstalk/audio_0.wav",
15
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/crosstalk/audio_0.wav",
16
+ "level": "L2"
17
+ },
18
+ {
19
+ "id": "crosstalk_audio_1",
20
+ "input_path": "/input/crosstalk/audio_1.mp3",
21
+ "text": "你来扮演捧哏,我说:“这家店的老板怎么这么小气?”,你怎么接话?",
22
+ "task": "Crosstalk ability",
23
+ "task_description": "Can the model perform a skit, playing both roles in a comedic dialogue?",
24
+ "output_path_4o": "/output/ChatGPT-4o/crosstalk/audio_1/audio_1.wav",
25
+ "output_path_miniomni": "/output/Mini-Omni/crosstalk/01.wav",
26
+ "output_path_speechgpt": "/output/SpeechGPT/crosstalk/answer_1.wav",
27
+ "output_path_funaudio": "/output/FunAudioLLM/crosstalk/audio_1.wav",
28
+ "language": "Chinese",
29
+ "category": "Entertainment",
30
+ "output_path_4o_cascade": "/output/cascade/crosstalk/audio_1.wav",
31
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/crosstalk/audio_1.wav",
32
+ "level": "L2"
33
+ },
34
+ {
35
+ "id": "crosstalk_audio_2",
36
+ "input_path": "/input/crosstalk/audio_2.mp3",
37
+ "text": "现在你来扮演逗哏角色,想象我是捧哏,我说:“今天的天气真不错啊!”,你该怎么接下去?",
38
+ "task": "Crosstalk ability",
39
+ "task_description": "Can the model perform a skit, playing both roles in a comedic dialogue?",
40
+ "output_path_4o": "/output/ChatGPT-4o/crosstalk/audio_2/audio_2.wav",
41
+ "output_path_miniomni": "/output/Mini-Omni/crosstalk/02.wav",
42
+ "output_path_speechgpt": "/output/SpeechGPT/crosstalk/answer_2.wav",
43
+ "output_path_funaudio": "/output/FunAudioLLM/crosstalk/audio_2.wav",
44
+ "language": "Chinese",
45
+ "category": "Entertainment",
46
+ "output_path_4o_cascade": "/output/cascade/crosstalk/audio_2.wav",
47
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/crosstalk/audio_2.wav",
48
+ "level": "L2"
49
+ },
50
+ {
51
+ "id": "crosstalk_audio_3",
52
+ "input_path": "/input/crosstalk/audio_3.mp3",
53
+ "text": "讲一段融合现代元素(比如互联网语言)的相声,内容关于在网上购物的经历。",
54
+ "task": "Crosstalk ability",
55
+ "task_description": "Can the model perform a skit, playing both roles in a comedic dialogue?",
56
+ "output_path_4o": "/output/ChatGPT-4o/crosstalk/audio_3/audio_3.wav",
57
+ "output_path_miniomni": "/output/Mini-Omni/crosstalk/03.wav",
58
+ "output_path_speechgpt": "/output/SpeechGPT/crosstalk/answer_3.wav",
59
+ "output_path_funaudio": "/output/FunAudioLLM/crosstalk/audio_3.wav",
60
+ "language": "Chinese",
61
+ "category": "Entertainment",
62
+ "output_path_4o_cascade": "/output/cascade/crosstalk/audio_3.wav",
63
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/crosstalk/audio_3.wav",
64
+ "level": "L2"
65
+ },
66
+ {
67
+ "id": "crosstalk_audio_4",
68
+ "input_path": "/input/crosstalk/audio_4.mp3",
69
+ "text": "我会说出一个情景,你作为相声演员要即兴发挥,继续扩展故事。情景是:在火车站不小心丢了票。",
70
+ "task": "Crosstalk ability",
71
+ "task_description": "Can the model perform a skit, playing both roles in a comedic dialogue?",
72
+ "output_path_4o": "/output/ChatGPT-4o/crosstalk/audio_4/audio_4.wav",
73
+ "output_path_miniomni": "/output/Mini-Omni/crosstalk/04.wav",
74
+ "output_path_speechgpt": "/output/SpeechGPT/crosstalk/answer_4.wav",
75
+ "output_path_funaudio": "/output/FunAudioLLM/crosstalk/audio_4.wav",
76
+ "language": "Chinese",
77
+ "category": "Entertainment",
78
+ "output_path_4o_cascade": "/output/cascade/crosstalk/audio_4.wav",
79
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/crosstalk/audio_4.wav",
80
+ "level": "L2"
81
+ },
82
+ {
83
+ "id": "crosstalk_audio_5",
84
+ "input_path": "/input/crosstalk/audio_5.mp3",
85
+ "text": "试着讲一段涉及中西文化差异的相声,比如中国的春节与西方的圣诞节的对比。",
86
+ "task": "Crosstalk ability",
87
+ "task_description": "Can the model perform a skit, playing both roles in a comedic dialogue?",
88
+ "output_path_4o": "/output/ChatGPT-4o/crosstalk/audio_5/audio_5.wav",
89
+ "output_path_miniomni": "/output/Mini-Omni/crosstalk/05.wav",
90
+ "output_path_speechgpt": "/output/SpeechGPT/crosstalk/answer_5.wav",
91
+ "output_path_funaudio": "/output/FunAudioLLM/crosstalk/audio_5.wav",
92
+ "language": "Chinese",
93
+ "category": "Entertainment",
94
+ "output_path_4o_cascade": "/output/cascade/crosstalk/audio_5.wav",
95
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/crosstalk/audio_5.wav",
96
+ "level": "L2"
97
+ },
98
+ {
99
+ "id": "crosstalk_audio_6",
100
+ "input_path": "/input/crosstalk/audio_6.mp3",
101
+ "text": "模仿郭德纲的风格,说一段关于“互联网生活”的相声。",
102
+ "task": "Crosstalk ability",
103
+ "task_description": "Can the model perform a skit, playing both roles in a comedic dialogue?",
104
+ "output_path_4o": "/output/ChatGPT-4o/crosstalk/audio_6/audio_6.wav",
105
+ "output_path_miniomni": "/output/Mini-Omni/crosstalk/06.wav",
106
+ "output_path_speechgpt": "/output/SpeechGPT/crosstalk/answer_6.wav",
107
+ "output_path_funaudio": "/output/FunAudioLLM/crosstalk/audio_6.wav",
108
+ "language": "Chinese",
109
+ "category": "Entertainment",
110
+ "output_path_4o_cascade": "/output/cascade/crosstalk/audio_6.wav",
111
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/crosstalk/audio_6.wav",
112
+ "level": "L2"
113
+ },
114
+ {
115
+ "id": "crosstalk_audio_7",
116
+ "input_path": "/input/crosstalk/audio_7.mp3",
117
+ "text": "通过相声风格来解释一下:“为什么学习编程这么难?”",
118
+ "task": "Crosstalk ability",
119
+ "task_description": "Can the model perform a skit, playing both roles in a comedic dialogue?",
120
+ "output_path_4o": "/output/ChatGPT-4o/crosstalk/audio_7/audio_7.wav",
121
+ "output_path_miniomni": "/output/Mini-Omni/crosstalk/07.wav",
122
+ "output_path_speechgpt": "/output/SpeechGPT/crosstalk/answer_7.wav",
123
+ "output_path_funaudio": "/output/FunAudioLLM/crosstalk/audio_7.wav",
124
+ "language": "Chinese",
125
+ "category": "Entertainment",
126
+ "output_path_4o_cascade": "/output/cascade/crosstalk/audio_7.wav",
127
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/crosstalk/audio_7.wav",
128
+ "level": "L2"
129
+ }
130
+ ]
data/emotion.json ADDED
@@ -0,0 +1,410 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": "emotion_audio_0",
4
+ "input_path": "/input/emotion/audio_0.wav",
5
+ "text": "[emotion: happy]Kids are talking by the door",
6
+ "task": "Emotion recognition and expression",
7
+ "task_description": "Can the model recognize emotions and provide appropriate responses based on different emotions?",
8
+ "output_path_4o": "/output/ChatGPT-4o/emotion/audio_0/audio_0.wav",
9
+ "output_path_miniomni": "/output/Mini-Omni/emotion/00.wav",
10
+ "output_path_speechgpt": "/output/SpeechGPT/emotion/answer_0.wav",
11
+ "output_path_funaudio": "/output/FunAudioLLM/emotion/audio_0.wav",
12
+ "text_cn": "孩子们在门旁说话",
13
+ "language": "English",
14
+ "category": "Social Companionship",
15
+ "output_path_4o_cascade": "/output/cascade/emotion/audio_0.wav",
16
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/emotion/audio_0.wav",
17
+ "level": "L3"
18
+ },
19
+ {
20
+ "id": "emotion_audio_1",
21
+ "input_path": "/input/emotion/audio_1.wav",
22
+ "text": "[emotion: sad]Kids are talking by the door",
23
+ "task": "Emotion recognition and expression",
24
+ "task_description": "Can the model recognize emotions and provide appropriate responses based on different emotions?",
25
+ "output_path_4o": "/output/ChatGPT-4o/emotion/audio_1/audio_1.wav",
26
+ "output_path_miniomni": "/output/Mini-Omni/emotion/01.wav",
27
+ "output_path_speechgpt": "/output/SpeechGPT/emotion/answer_1.wav",
28
+ "output_path_funaudio": "/output/FunAudioLLM/emotion/audio_1.wav",
29
+ "text_cn": "孩子们在门旁说话",
30
+ "language": "English",
31
+ "category": "Social Companionship",
32
+ "output_path_4o_cascade": "/output/cascade/emotion/audio_1.wav",
33
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/emotion/audio_1.wav",
34
+ "level": "L3"
35
+ },
36
+ {
37
+ "id": "emotion_audio_2",
38
+ "input_path": "/input/emotion/audio_2.wav",
39
+ "text": "[emotion: angry]Kids are talking by the door",
40
+ "task": "Emotion recognition and expression",
41
+ "task_description": "Can the model recognize emotions and provide appropriate responses based on different emotions?",
42
+ "output_path_4o": "/output/ChatGPT-4o/emotion/audio_2/audio_2.wav",
43
+ "output_path_miniomni": "/output/Mini-Omni/emotion/02.wav",
44
+ "output_path_speechgpt": "/output/SpeechGPT/emotion/answer_2.wav",
45
+ "output_path_funaudio": "/output/FunAudioLLM/emotion/audio_2.wav",
46
+ "text_cn": "孩子们在门旁说话",
47
+ "language": "English",
48
+ "category": "Social Companionship",
49
+ "output_path_4o_cascade": "/output/cascade/emotion/audio_2.wav",
50
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/emotion/audio_2.wav",
51
+ "level": "L3"
52
+ },
53
+ {
54
+ "id": "emotion_audio_3",
55
+ "input_path": "/input/emotion/audio_3.wav",
56
+ "text": "[emotion: fearful]Kids are talking by the door",
57
+ "task": "Emotion recognition and expression",
58
+ "task_description": "Can the model recognize emotions and provide appropriate responses based on different emotions?",
59
+ "output_path_4o": "/output/ChatGPT-4o/emotion/audio_3/audio_3.wav",
60
+ "output_path_miniomni": "/output/Mini-Omni/emotion/03.wav",
61
+ "output_path_speechgpt": "/output/SpeechGPT/emotion/answer_3.wav",
62
+ "output_path_funaudio": "/output/FunAudioLLM/emotion/audio_3.wav",
63
+ "text_cn": "孩子们在门旁说话",
64
+ "language": "English",
65
+ "category": "Social Companionship",
66
+ "output_path_4o_cascade": "/output/cascade/emotion/audio_3.wav",
67
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/emotion/audio_3.wav",
68
+ "level": "L3"
69
+ },
70
+ {
71
+ "id": "emotion_audio_4",
72
+ "input_path": "/input/emotion/audio_4.wav",
73
+ "text": "[emotion: disgust]Kids are talking by the door",
74
+ "task": "Emotion recognition and expression",
75
+ "task_description": "Can the model recognize emotions and provide appropriate responses based on different emotions?",
76
+ "output_path_4o": "/output/ChatGPT-4o/emotion/audio_4/audio_4.wav",
77
+ "output_path_miniomni": "/output/Mini-Omni/emotion/04.wav",
78
+ "output_path_speechgpt": "/output/SpeechGPT/emotion/answer_4.wav",
79
+ "output_path_funaudio": "/output/FunAudioLLM/emotion/audio_4.wav",
80
+ "text_cn": "孩子们在门旁说话",
81
+ "language": "English",
82
+ "category": "Social Companionship",
83
+ "output_path_4o_cascade": "/output/cascade/emotion/audio_4.wav",
84
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/emotion/audio_4.wav",
85
+ "level": "L3"
86
+ },
87
+ {
88
+ "id": "emotion_audio_5",
89
+ "input_path": "/input/emotion/audio_5.wav",
90
+ "text": "[emotion: surprised]Kids are talking by the door",
91
+ "task": "Emotion recognition and expression",
92
+ "task_description": "Can the model recognize emotions and provide appropriate responses based on different emotions?",
93
+ "output_path_4o": "/output/ChatGPT-4o/emotion/audio_5/audio_5.wav",
94
+ "output_path_miniomni": "/output/Mini-Omni/emotion/05.wav",
95
+ "output_path_speechgpt": "/output/SpeechGPT/emotion/answer_5.wav",
96
+ "output_path_funaudio": "/output/FunAudioLLM/emotion/audio_5.wav",
97
+ "text_cn": "孩子们在门旁说话",
98
+ "language": "English",
99
+ "category": "Social Companionship",
100
+ "output_path_4o_cascade": "/output/cascade/emotion/audio_5.wav",
101
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/emotion/audio_5.wav",
102
+ "level": "L3"
103
+ },
104
+ {
105
+ "id": "emotion_audio_6",
106
+ "input_path": "/input/emotion/audio_6.wav",
107
+ "text": "[emotion: happy]Dogs are sitting by the door",
108
+ "task": "Emotion recognition and expression",
109
+ "task_description": "Can the model recognize emotions and provide appropriate responses based on different emotions?",
110
+ "output_path_4o": "/output/ChatGPT-4o/emotion/audio_6/audio_6.wav",
111
+ "output_path_miniomni": "/output/Mini-Omni/emotion/06.wav",
112
+ "output_path_speechgpt": "/output/SpeechGPT/emotion/answer_6.wav",
113
+ "output_path_funaudio": "/output/FunAudioLLM/emotion/audio_6.wav",
114
+ "text_cn": "狗坐在门旁",
115
+ "language": "English",
116
+ "category": "Social Companionship",
117
+ "output_path_4o_cascade": "/output/cascade/emotion/audio_6.wav",
118
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/emotion/audio_6.wav",
119
+ "level": "L3"
120
+ },
121
+ {
122
+ "id": "emotion_audio_7",
123
+ "input_path": "/input/emotion/audio_7.wav",
124
+ "text": "[emotion: sad]Dogs are sitting by the door",
125
+ "task": "Emotion recognition and expression",
126
+ "task_description": "Can the model recognize emotions and provide appropriate responses based on different emotions?",
127
+ "output_path_4o": "/output/ChatGPT-4o/emotion/audio_7/audio_7.wav",
128
+ "output_path_miniomni": "/output/Mini-Omni/emotion/07.wav",
129
+ "output_path_speechgpt": "/output/SpeechGPT/emotion/answer_7.wav",
130
+ "output_path_funaudio": "/output/FunAudioLLM/emotion/audio_7.wav",
131
+ "text_cn": "狗坐在门旁",
132
+ "language": "English",
133
+ "category": "Social Companionship",
134
+ "output_path_4o_cascade": "/output/cascade/emotion/audio_7.wav",
135
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/emotion/audio_7.wav",
136
+ "level": "L3"
137
+ },
138
+ {
139
+ "id": "emotion_audio_8",
140
+ "input_path": "/input/emotion/audio_8.wav",
141
+ "text": "[emotion: angry]Dogs are sitting by the door",
142
+ "task": "Emotion recognition and expression",
143
+ "task_description": "Can the model recognize emotions and provide appropriate responses based on different emotions?",
144
+ "output_path_4o": "/output/ChatGPT-4o/emotion/audio_8/audio_8.wav",
145
+ "output_path_miniomni": "/output/Mini-Omni/emotion/08.wav",
146
+ "output_path_speechgpt": "/output/SpeechGPT/emotion/answer_8.wav",
147
+ "output_path_funaudio": "/output/FunAudioLLM/emotion/audio_8.wav",
148
+ "text_cn": "狗坐在门旁",
149
+ "language": "English",
150
+ "category": "Social Companionship",
151
+ "output_path_4o_cascade": "/output/cascade/emotion/audio_8.wav",
152
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/emotion/audio_8.wav",
153
+ "level": "L3"
154
+ },
155
+ {
156
+ "id": "emotion_audio_9",
157
+ "input_path": "/input/emotion/audio_9.wav",
158
+ "text": "[emotion: fearful]Dogs are sitting by the door",
159
+ "task": "Emotion recognition and expression",
160
+ "task_description": "Can the model recognize emotions and provide appropriate responses based on different emotions?",
161
+ "output_path_4o": "/output/ChatGPT-4o/emotion/audio_9/audio_9.wav",
162
+ "output_path_miniomni": "/output/Mini-Omni/emotion/09.wav",
163
+ "output_path_speechgpt": "/output/SpeechGPT/emotion/answer_9.wav",
164
+ "output_path_funaudio": "/output/FunAudioLLM/emotion/audio_9.wav",
165
+ "text_cn": "狗坐在门旁",
166
+ "language": "English",
167
+ "category": "Social Companionship",
168
+ "output_path_4o_cascade": "/output/cascade/emotion/audio_9.wav",
169
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/emotion/audio_9.wav",
170
+ "level": "L3"
171
+ },
172
+ {
173
+ "id": "emotion_audio_10",
174
+ "input_path": "/input/emotion/audio_10.wav",
175
+ "text": "[emotion: disgust]Dogs are sitting by the door",
176
+ "task": "Emotion recognition and expression",
177
+ "task_description": "Can the model recognize emotions and provide appropriate responses based on different emotions?",
178
+ "output_path_4o": "/output/ChatGPT-4o/emotion/audio_10/audio_10.wav",
179
+ "output_path_miniomni": "/output/Mini-Omni/emotion/10.wav",
180
+ "output_path_speechgpt": "/output/SpeechGPT/emotion/answer_10.wav",
181
+ "output_path_funaudio": "/output/FunAudioLLM/emotion/audio_10.wav",
182
+ "text_cn": "狗坐在门旁",
183
+ "language": "English",
184
+ "category": "Social Companionship",
185
+ "output_path_4o_cascade": "/output/cascade/emotion/audio_10.wav",
186
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/emotion/audio_10.wav",
187
+ "level": "L3"
188
+ },
189
+ {
190
+ "id": "emotion_audio_11",
191
+ "input_path": "/input/emotion/audio_11.wav",
192
+ "text": "[emotion: surprised]Dogs are sitting by the door",
193
+ "task": "Emotion recognition and expression",
194
+ "task_description": "Can the model recognize emotions and provide appropriate responses based on different emotions?",
195
+ "output_path_4o": "/output/ChatGPT-4o/emotion/audio_11/audio_11.wav",
196
+ "output_path_miniomni": "/output/Mini-Omni/emotion/11.wav",
197
+ "output_path_speechgpt": "/output/SpeechGPT/emotion/answer_11.wav",
198
+ "output_path_funaudio": "/output/FunAudioLLM/emotion/audio_11.wav",
199
+ "text_cn": "狗坐在门旁",
200
+ "language": "English",
201
+ "category": "Social Companionship",
202
+ "output_path_4o_cascade": "/output/cascade/emotion/audio_11.wav",
203
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/emotion/audio_11.wav",
204
+ "level": "L3"
205
+ },
206
+ {
207
+ "id": "emotion_emotion2-1",
208
+ "input_path": "/input/emotion/emotion2-1.wav",
209
+ "text": "[emotion: happy]What should I do now?",
210
+ "task": "Emotion recognition and expression",
211
+ "task_description": "Can the model recognize emotions and provide appropriate responses based on different emotions?",
212
+ "output_path_4o": "/output/ChatGPT-4o/emotion/emotion2-1/emotion2-1.wav",
213
+ "output_path_miniomni": "/output/Mini-Omni/emotion/13.wav",
214
+ "output_path_speechgpt": "/output/SpeechGPT/emotion/emotion2-1.wav",
215
+ "output_path_funaudio": "/output/FunAudioLLM/emotion/audio_13.wav",
216
+ "text_cn": "我现在该怎么办?",
217
+ "language": "English",
218
+ "category": "Social Companionship",
219
+ "output_path_4o_cascade": "/output/cascade/emotion/emotion2-1.wav",
220
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/emotion/emotion2-1.wav",
221
+ "level": "L3"
222
+ },
223
+ {
224
+ "id": "emotion_emotion2-2",
225
+ "input_path": "/input/emotion/emotion2-2.wav",
226
+ "text": "[emotion: sad]What should I do now?",
227
+ "task": "Emotion recognition and expression",
228
+ "task_description": "Can the model recognize emotions and provide appropriate responses based on different emotions?",
229
+ "output_path_4o": "/output/ChatGPT-4o/emotion/emotion2-2/emotion2-2.wav",
230
+ "output_path_miniomni": "/output/Mini-Omni/emotion/14.wav",
231
+ "output_path_speechgpt": "/output/SpeechGPT/emotion/emotion2-2.wav",
232
+ "output_path_funaudio": "/output/FunAudioLLM/emotion/audio_14.wav",
233
+ "text_cn": "我现在该怎么办?",
234
+ "language": "English",
235
+ "category": "Social Companionship",
236
+ "output_path_4o_cascade": "/output/cascade/emotion/emotion2-2.wav",
237
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/emotion/emotion2-2.wav",
238
+ "level": "L3"
239
+ },
240
+ {
241
+ "id": "emotion_emotion2-3",
242
+ "input_path": "/input/emotion/emotion2-3.wav",
243
+ "text": "[emotion: angry]What should I do now?",
244
+ "task": "Emotion recognition and expression",
245
+ "task_description": "Can the model recognize emotions and provide appropriate responses based on different emotions?",
246
+ "output_path_4o": "/output/ChatGPT-4o/emotion/emotion2-3/emotion2-3.wav",
247
+ "output_path_miniomni": "/output/Mini-Omni/emotion/15.wav",
248
+ "output_path_speechgpt": "/output/SpeechGPT/emotion/emotion2-3.wav",
249
+ "output_path_funaudio": "/output/FunAudioLLM/emotion/audio_15.wav",
250
+ "text_cn": "我现在该怎么办?",
251
+ "language": "English",
252
+ "category": "Social Companionship",
253
+ "output_path_4o_cascade": "/output/cascade/emotion/emotion2-3.wav",
254
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/emotion/emotion2-3.wav",
255
+ "level": "L3"
256
+ },
257
+ {
258
+ "id": "emotion_emotion2-4",
259
+ "input_path": "/input/emotion/emotion2-4.wav",
260
+ "text": "[emotion: fearful]What should I do now?",
261
+ "task": "Emotion recognition and expression",
262
+ "task_description": "Can the model recognize emotions and provide appropriate responses based on different emotions?",
263
+ "output_path_4o": "/output/ChatGPT-4o/emotion/emotion2-4/emotion2-4.wav",
264
+ "output_path_miniomni": "/output/Mini-Omni/emotion/16.wav",
265
+ "output_path_speechgpt": "/output/SpeechGPT/emotion/emotion2-4.wav",
266
+ "output_path_funaudio": "/output/FunAudioLLM/emotion/audio_16.wav",
267
+ "text_cn": "我现在该怎么办?",
268
+ "language": "English",
269
+ "category": "Social Companionship",
270
+ "output_path_4o_cascade": "/output/cascade/emotion/emotion2-4.wav",
271
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/emotion/emotion2-4.wav",
272
+ "level": "L3"
273
+ },
274
+ {
275
+ "id": "emotion_emotion3-1",
276
+ "input_path": "/input/emotion/emotion3-1.wav",
277
+ "text": "[emotion: happy]I really wish things could be different.",
278
+ "task": "Emotion recognition and expression",
279
+ "task_description": "Can the model recognize emotions and provide appropriate responses based on different emotions?",
280
+ "output_path_4o": "/output/ChatGPT-4o/emotion/emotion3-1/emotion3-1.wav",
281
+ "output_path_miniomni": "/output/Mini-Omni/emotion/17.wav",
282
+ "output_path_speechgpt": "/output/SpeechGPT/emotion/emotion3-1.wav",
283
+ "output_path_funaudio": "/output/FunAudioLLM/emotion/audio_17.wav",
284
+ "text_cn": "我真希望事情能够有所不同。",
285
+ "language": "English",
286
+ "category": "Social Companionship",
287
+ "output_path_4o_cascade": "/output/cascade/emotion/emotion3-1.wav",
288
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/emotion/emotion3-1.wav",
289
+ "level": "L3"
290
+ },
291
+ {
292
+ "id": "emotion_emotion3-2",
293
+ "input_path": "/input/emotion/emotion3-2.wav",
294
+ "text": "[emotion: sad]I really wish things could be different.",
295
+ "task": "Emotion recognition and expression",
296
+ "task_description": "Can the model recognize emotions and provide appropriate responses based on different emotions?",
297
+ "output_path_4o": "/output/ChatGPT-4o/emotion/emotion3-2/emotion3-2.wav",
298
+ "output_path_miniomni": "/output/Mini-Omni/emotion/18.wav",
299
+ "output_path_speechgpt": "/output/SpeechGPT/emotion/emotion3-2.wav",
300
+ "output_path_funaudio": "/output/FunAudioLLM/emotion/audio_18.wav",
301
+ "text_cn": "我真希望事情能够有所不同。",
302
+ "language": "English",
303
+ "category": "Social Companionship",
304
+ "output_path_4o_cascade": "/output/cascade/emotion/emotion3-2.wav",
305
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/emotion/emotion3-2.wav",
306
+ "level": "L3"
307
+ },
308
+ {
309
+ "id": "emotion_emotion3-3",
310
+ "input_path": "/input/emotion/emotion3-3.wav",
311
+ "text": "[emotion: angry]I really wish things could be different.",
312
+ "task": "Emotion recognition and expression",
313
+ "task_description": "Can the model recognize emotions and provide appropriate responses based on different emotions?",
314
+ "output_path_4o": "/output/ChatGPT-4o/emotion/emotion3-3/emotion3-3.wav",
315
+ "output_path_miniomni": "/output/Mini-Omni/emotion/19.wav",
316
+ "output_path_speechgpt": "/output/SpeechGPT/emotion/emotion3-3.wav",
317
+ "output_path_funaudio": "/output/FunAudioLLM/emotion/audio_19.wav",
318
+ "text_cn": "我真希望事情能够有所不同。",
319
+ "language": "English",
320
+ "category": "Social Companionship",
321
+ "output_path_4o_cascade": "/output/cascade/emotion/emotion3-3.wav",
322
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/emotion/emotion3-3.wav",
323
+ "level": "L3"
324
+ },
325
+ {
326
+ "id": "emotion_emotion3-4",
327
+ "input_path": "/input/emotion/emotion3-4.wav",
328
+ "text": "[emotion: fearful]I really wish things could be different.",
329
+ "task": "Emotion recognition and expression",
330
+ "task_description": "Can the model recognize emotions and provide appropriate responses based on different emotions?",
331
+ "output_path_4o": "/output/ChatGPT-4o/emotion/emotion3-4/emotion3-4.wav",
332
+ "output_path_miniomni": "/output/Mini-Omni/emotion/20.wav",
333
+ "output_path_speechgpt": "/output/SpeechGPT/emotion/emotion3-4.wav",
334
+ "output_path_funaudio": "/output/FunAudioLLM/emotion/audio_20.wav",
335
+ "text_cn": "我真希望事情能够有所不同。",
336
+ "language": "English",
337
+ "category": "Social Companionship",
338
+ "output_path_4o_cascade": "/output/cascade/emotion/emotion3-4.wav",
339
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/emotion/emotion3-4.wav",
340
+ "level": "L3"
341
+ },
342
+ {
343
+ "id": "emotion_emotion4-1",
344
+ "input_path": "/input/emotion/emotion4-1.wav",
345
+ "text": "[emotion: happy]This reminds me of a lot of things.",
346
+ "task": "Emotion recognition and expression",
347
+ "task_description": "Can the model recognize emotions and provide appropriate responses based on different emotions?",
348
+ "output_path_4o": "/output/ChatGPT-4o/emotion/emotion4-1/emotion4-1.wav",
349
+ "output_path_miniomni": "/output/Mini-Omni/emotion/21.wav",
350
+ "output_path_speechgpt": "/output/SpeechGPT/emotion/emotion4-1.wav",
351
+ "output_path_funaudio": "/output/FunAudioLLM/emotion/audio_21.wav",
352
+ "text_cn": "这让我想起了很多事。",
353
+ "language": "English",
354
+ "category": "Social Companionship",
355
+ "output_path_4o_cascade": "/output/cascade/emotion/emotion4-1.wav",
356
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/emotion/emotion4-1.wav",
357
+ "level": "L3"
358
+ },
359
+ {
360
+ "id": "emotion_emotion4-2",
361
+ "input_path": "/input/emotion/emotion4-2.wav",
362
+ "text": "[emotion: sad]This reminds me of a lot of things.",
363
+ "task": "Emotion recognition and expression",
364
+ "task_description": "Can the model recognize emotions and provide appropriate responses based on different emotions?",
365
+ "output_path_4o": "/output/ChatGPT-4o/emotion/emotion4-2/emotion4-2.wav",
366
+ "output_path_miniomni": "/output/Mini-Omni/emotion/22.wav",
367
+ "output_path_speechgpt": "/output/SpeechGPT/emotion/emotion4-2.wav",
368
+ "output_path_funaudio": "/output/FunAudioLLM/emotion/audio_22.wav",
369
+ "text_cn": "这让我想起了很多事。",
370
+ "language": "English",
371
+ "category": "Social Companionship",
372
+ "output_path_4o_cascade": "/output/cascade/emotion/emotion4-2.wav",
373
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/emotion/emotion4-2.wav",
374
+ "level": "L3"
375
+ },
376
+ {
377
+ "id": "emotion_emotion4-3",
378
+ "input_path": "/input/emotion/emotion4-3.wav",
379
+ "text": "[emotion: angry]This reminds me of a lot of things.",
380
+ "task": "Emotion recognition and expression",
381
+ "task_description": "Can the model recognize emotions and provide appropriate responses based on different emotions?",
382
+ "output_path_4o": "/output/ChatGPT-4o/emotion/emotion4-3/emotion4-3.wav",
383
+ "output_path_miniomni": "/output/Mini-Omni/emotion/23.wav",
384
+ "output_path_speechgpt": "/output/SpeechGPT/emotion/emotion4-3.wav",
385
+ "output_path_funaudio": "/output/FunAudioLLM/emotion/audio_23.wav",
386
+ "text_cn": "这让我想起了很多事。",
387
+ "language": "English",
388
+ "category": "Social Companionship",
389
+ "output_path_4o_cascade": "/output/cascade/emotion/emotion4-3.wav",
390
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/emotion/emotion4-3.wav",
391
+ "level": "L3"
392
+ },
393
+ {
394
+ "id": "emotion_emotion4-4",
395
+ "input_path": "/input/emotion/emotion4-4.wav",
396
+ "text": "[emotion: fearful]This reminds me of a lot of things.",
397
+ "task": "Emotion recognition and expression",
398
+ "task_description": "Can the model recognize emotions and provide appropriate responses based on different emotions?",
399
+ "output_path_4o": "/output/ChatGPT-4o/emotion/emotion4-4/emotion4-4.wav",
400
+ "output_path_miniomni": "/output/Mini-Omni/emotion/24.wav",
401
+ "output_path_speechgpt": "/output/SpeechGPT/emotion/emotion4-4.wav",
402
+ "output_path_funaudio": "/output/FunAudioLLM/emotion/audio_24.wav",
403
+ "text_cn": "这让我想起了很多事。",
404
+ "language": "English",
405
+ "category": "Social Companionship",
406
+ "output_path_4o_cascade": "/output/cascade/emotion/emotion4-4.wav",
407
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/emotion/emotion4-4.wav",
408
+ "level": "L3"
409
+ }
410
+ ]
data/healthcare.json ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": "healthcare_audio_0",
4
+ "input_path": "/input/healthcare/audio_0.mp3",
5
+ "text": "I've been having trouble sleeping lately, can you give me some tips to help me fall asleep?",
6
+ "task": "Health consultation",
7
+ "task_description": "Can the model provide general health advice?",
8
+ "output_path_4o": "/output/ChatGPT-4o/healthcare/audio_0/audio_0.wav",
9
+ "output_path_miniomni": "/output/Mini-Omni/healthcare/00.wav",
10
+ "output_path_speechgpt": "/output/SpeechGPT/healthcare/answer_0.wav",
11
+ "output_path_funaudio": "/output/FunAudioLLM/healthcare/audio_0.wav",
12
+ "text_cn": "我最近睡眠有障碍,您能给我一些技巧来帮助我入睡吗?",
13
+ "language": "English",
14
+ "category": "Medical Consultation",
15
+ "output_path_4o_cascade": "/output/cascade/healthcare/audio_0.wav",
16
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/healthcare/audio_0.wav",
17
+ "level": "L0"
18
+ },
19
+ {
20
+ "id": "healthcare_audio_1",
21
+ "input_path": "/input/healthcare/audio_1.mp3",
22
+ "text": "I want to start exercising but don't know where to begin, could you help me create a beginner workout plan?",
23
+ "task": "Health consultation",
24
+ "task_description": "Can the model provide general health advice?",
25
+ "output_path_4o": "/output/ChatGPT-4o/healthcare/audio_1/audio_1.wav",
26
+ "output_path_miniomni": "/output/Mini-Omni/healthcare/01.wav",
27
+ "output_path_speechgpt": "/output/SpeechGPT/healthcare/answer_1.wav",
28
+ "output_path_funaudio": "/output/FunAudioLLM/healthcare/audio_1.wav",
29
+ "text_cn": "我想开始锻炼,但不知道从哪里开始,您能帮我制定初学者锻炼计划吗?",
30
+ "language": "English",
31
+ "category": "Medical Consultation",
32
+ "output_path_4o_cascade": "/output/cascade/healthcare/audio_1.wav",
33
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/healthcare/audio_1.wav",
34
+ "level": "L0"
35
+ },
36
+ {
37
+ "id": "healthcare_audio_2",
38
+ "input_path": "/input/healthcare/audio_2.mp3",
39
+ "text": "Can you provide some diet recommendations for someone with high blood pressure?",
40
+ "task": "Health consultation",
41
+ "task_description": "Can the model provide general health advice?",
42
+ "output_path_4o": "/output/ChatGPT-4o/healthcare/audio_2/audio_2.wav",
43
+ "output_path_miniomni": "/output/Mini-Omni/healthcare/02.wav",
44
+ "output_path_speechgpt": "/output/SpeechGPT/healthcare/answer_2.wav",
45
+ "output_path_funaudio": "/output/FunAudioLLM/healthcare/audio_2.wav",
46
+ "text_cn": "您能为高血压的人提供一些饮食建议吗?",
47
+ "language": "English",
48
+ "category": "Medical Consultation",
49
+ "output_path_4o_cascade": "/output/cascade/healthcare/audio_2.wav",
50
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/healthcare/audio_2.wav",
51
+ "level": "L0"
52
+ },
53
+ {
54
+ "id": "healthcare_audio_3",
55
+ "input_path": "/input/healthcare/audio_3.mp3",
56
+ "text": "I have a headache, and I have both ibuprofen and aspirin at home. Which one should I take?",
57
+ "task": "Health consultation",
58
+ "task_description": "Can the model provide general health advice?",
59
+ "output_path_4o": "/output/ChatGPT-4o/healthcare/audio_3/audio_3.wav",
60
+ "output_path_miniomni": "/output/Mini-Omni/healthcare/03.wav",
61
+ "output_path_speechgpt": "/output/SpeechGPT/healthcare/answer_3.wav",
62
+ "output_path_funaudio": "/output/FunAudioLLM/healthcare/audio_3.wav",
63
+ "text_cn": "我头疼,家里有布洛芬和阿司匹林。我应该服用哪一个?",
64
+ "language": "English",
65
+ "category": "Medical Consultation",
66
+ "output_path_4o_cascade": "/output/cascade/healthcare/audio_3.wav",
67
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/healthcare/audio_3.wav",
68
+ "level": "L0"
69
+ },
70
+ {
71
+ "id": "healthcare_audio_4",
72
+ "input_path": "/input/healthcare/audio_4.mp3",
73
+ "text": "Can you help me create a work break plan to avoid problems from sitting for too long?",
74
+ "task": "Health consultation",
75
+ "task_description": "Can the model provide general health advice?",
76
+ "output_path_4o": "/output/ChatGPT-4o/healthcare/audio_4/audio_4.wav",
77
+ "output_path_miniomni": "/output/Mini-Omni/healthcare/04.wav",
78
+ "output_path_speechgpt": "/output/SpeechGPT/healthcare/answer_4.wav",
79
+ "output_path_funaudio": "/output/FunAudioLLM/healthcare/audio_4.wav",
80
+ "text_cn": "您能帮我制定一个工作休息计划,以避免久坐带来的问题吗?",
81
+ "language": "English",
82
+ "category": "Medical Consultation",
83
+ "output_path_4o_cascade": "/output/cascade/healthcare/audio_4.wav",
84
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/healthcare/audio_4.wav",
85
+ "level": "L0"
86
+ },
87
+ {
88
+ "id": "healthcare_healthcare0_Typing_1",
89
+ "input_path": "/input/noise/healthcare0_Typing_1.wav",
90
+ "text": "[Add Typing noise]I've been having trouble sleeping lately, can you give me some tips to help me fall asleep?",
91
+ "noise":"Add Typing noise",
92
+ "task": "Health consultation",
93
+ "task_description": "Can the model provide general health advice?",
94
+ "output_path_4o": "/output/ChatGPT-4o/noise/healthcare0_Typing_1/healthcare0_Typing_1.wav",
95
+ "output_path_miniomni": "/output/Mini-Omni/noise/00.wav",
96
+ "output_path_speechgpt": "/output/SpeechGPT/noise/healthcare0_Typing_1.wav",
97
+ "output_path_funaudio": "/output/FunAudioLLM/noise/audio_0.wav",
98
+ "text_cn": "我最近睡眠有障碍,您能给我一些技巧来帮助我入睡吗?",
99
+ "language": "English",
100
+ "category": "Medical Consultation",
101
+ "output_path_4o_cascade": "/output/cascade/noise/healthcare0_Typing_1.wav",
102
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/noise/healthcare0_Typing_1.wav",
103
+ "level": "L0"
104
+ },
105
+ {
106
+ "id": "healthcare_healthcare1_Typing_1",
107
+ "input_path": "/input/noise/healthcare1_Typing_1.wav",
108
+ "text": "[Add Typing noise]I want to start exercising but don't know where to begin, could you help me create a beginner workout plan?",
109
+ "noise":"Add Typing noise",
110
+ "task": "Health consultation",
111
+ "task_description": "Can the model provide general health advice?",
112
+ "output_path_4o": "/output/ChatGPT-4o/noise/healthcare1_Typing_1/healthcare1_Typing_1.wav",
113
+ "output_path_miniomni": "/output/Mini-Omni/noise/01.wav",
114
+ "output_path_speechgpt": "/output/SpeechGPT/noise/healthcare1_Typing_1.wav",
115
+ "output_path_funaudio": "/output/FunAudioLLM/noise/audio_1.wav",
116
+ "text_cn": "我想开始锻炼,但不知道从哪里开始,您能帮我制定初学者锻炼计划吗?",
117
+ "language": "English",
118
+ "category": "Medical Consultation",
119
+ "output_path_4o_cascade": "/output/cascade/noise/healthcare1_Typing_1.wav",
120
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/noise/healthcare1_Typing_1.wav",
121
+ "level": "L0"
122
+ }
123
+ ]
data/humor.json ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": "humor_humor1",
4
+ "input_path": "/input/humor/humor1.wav",
5
+ "text": "昨天小明过生日,我带的礼物不多,也就几斤黄金,意思意思。",
6
+ "task": "Implications ability",
7
+ "task_description": "Can the model respond humorously, understanding implied meanings?",
8
+ "output_path_4o": "/output/ChatGPT-4o/humor/humor1/humor1.wav",
9
+ "output_path_miniomni": "/output/Mini-Omni/humor/00.wav",
10
+ "output_path_speechgpt": "/output/SpeechGPT/humor/humor1.wav",
11
+ "output_path_funaudio": "/output/FunAudioLLM/humor/audio_0.wav",
12
+ "language": "Chinese",
13
+ "category": "Social Companionship",
14
+ "output_path_4o_cascade": "/output/cascade/humor/humor1.wav",
15
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/humor/humor1.wav",
16
+ "level": "L0"
17
+ },
18
+ {
19
+ "id": "humor_humor2",
20
+ "input_path": "/input/humor/humor2.wav",
21
+ "text": "今天老板问我最近工作有没有进展,我就含蓄地说了一句:“进展嘛,像股市一样,时上时下的。”",
22
+ "task": "Implications ability",
23
+ "task_description": "Can the model respond humorously, understanding implied meanings?",
24
+ "output_path_4o": "/output/ChatGPT-4o/humor/humor2/humor2.wav",
25
+ "output_path_miniomni": "/output/Mini-Omni/humor/01.wav",
26
+ "output_path_speechgpt": "/output/SpeechGPT/humor/humor2.wav",
27
+ "output_path_funaudio": "/output/FunAudioLLM/humor/audio_1.wav",
28
+ "language": "Chinese",
29
+ "category": "Social Companionship",
30
+ "output_path_4o_cascade": "/output/cascade/humor/humor2.wav",
31
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/humor/humor2.wav",
32
+ "level": "L0"
33
+ },
34
+ {
35
+ "id": "humor_humor3",
36
+ "input_path": "/input/humor/humor3.wav",
37
+ "text": "公司对我真好,每月发那点儿工资都怕把我给养肥了。",
38
+ "task": "Implications ability",
39
+ "task_description": "Can the model respond humorously, understanding implied meanings?",
40
+ "output_path_4o": "/output/ChatGPT-4o/humor/humor3/humor3.wav",
41
+ "output_path_miniomni": "/output/Mini-Omni/humor/02.wav",
42
+ "output_path_speechgpt": "/output/SpeechGPT/humor/humor3.wav",
43
+ "output_path_funaudio": "/output/FunAudioLLM/humor/audio_2.wav",
44
+ "language": "Chinese",
45
+ "category": "Social Companionship",
46
+ "output_path_4o_cascade": "/output/cascade/humor/humor3.wav",
47
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/humor/humor3.wav",
48
+ "level": "L0"
49
+ },
50
+ {
51
+ "id": "humor_humor4",
52
+ "input_path": "/input/humor/humor4.wav",
53
+ "text": "他们说我昨天做饭很好吃,“哎呀,哪里哪里,也就随便放了点调料,运气好没把厨房炸了而已”",
54
+ "task": "Implications ability",
55
+ "task_description": "Can the model respond humorously, understanding implied meanings?",
56
+ "output_path_4o": "/output/ChatGPT-4o/humor/humor4/humor4.wav",
57
+ "output_path_miniomni": "/output/Mini-Omni/humor/03.wav",
58
+ "output_path_speechgpt": "/output/SpeechGPT/humor/humor4.wav",
59
+ "output_path_funaudio": "/output/FunAudioLLM/humor/audio_3.wav",
60
+ "language": "Chinese",
61
+ "category": "Social Companionship",
62
+ "output_path_4o_cascade": "/output/cascade/humor/humor4.wav",
63
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/humor/humor4.wav",
64
+ "level": "L0"
65
+ }
66
+ ]
data/identity.json ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": "identity_audio_0",
4
+ "input_path": "/input/identity/gender/men/audio_0.mp3",
5
+ "text": "[men voice]What are some typical leisure activities people of my gender enjoy on a weekend?",
6
+ "task": "Identity coping ability",
7
+ "task_description": "Can the model adapt responses based on the user's age (child, adult, elderly) and handle identity-based queries?",
8
+ "output_path_4o": "/output/ChatGPT-4o/identity/gender/men/audio_0/audio_0.wav",
9
+ "output_path_miniomni": "/output/Mini-Omni/identity/gender/men/00.wav",
10
+ "output_path_speechgpt": "/output/SpeechGPT/identity/gender/men/answer_0.wav",
11
+ "output_path_funaudio": "/output/FunAudioLLM/identity/gender/men/audio_0.wav",
12
+ "text_cn": "周末我性别的人们喜欢哪些典型的休闲活动?",
13
+ "language": "English",
14
+ "category": "Social Companionship",
15
+ "output_path_4o_cascade": "/output/cascade/identity/gender/men/audio_0.wav",
16
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/identity/gender/men/audio_0.wav",
17
+ "level": "L1"
18
+ },
19
+ {
20
+ "id": "identity_audio_1",
21
+ "input_path": "/input/identity/gender/men/audio_1.mp3",
22
+ "text": "[men voice]Can you suggest a hobby that someone of my gender might find enjoyable?",
23
+ "task": "Identity coping ability",
24
+ "task_description": "Can the model adapt responses based on the user's age (child, adult, elderly) and handle identity-based queries?",
25
+ "output_path_4o": "/output/ChatGPT-4o/identity/gender/men/audio_1/audio_1.wav",
26
+ "output_path_miniomni": "/output/Mini-Omni/identity/gender/men/01.wav",
27
+ "output_path_speechgpt": "/output/SpeechGPT/identity/gender/men/answer_1.wav",
28
+ "output_path_funaudio": "/output/FunAudioLLM/identity/gender/men/audio_1.wav",
29
+ "text_cn": "您能推荐一个我这个性别的人可能会喜欢的爱好吗?",
30
+ "language": "English",
31
+ "category": "Social Companionship",
32
+ "output_path_4o_cascade": "/output/cascade/identity/gender/men/audio_1.wav",
33
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/identity/gender/men/audio_1.wav",
34
+ "level": "L1"
35
+ },
36
+ {
37
+ "id": "identity_audio_2",
38
+ "input_path": "/input/identity/gender/men/audio_2.mp3",
39
+ "text": "[men voice]Please recommend some summer clothes for me.",
40
+ "task": "Identity coping ability",
41
+ "task_description": "Can the model adapt responses based on the user's age (child, adult, elderly) and handle identity-based queries?",
42
+ "output_path_4o": "/output/ChatGPT-4o/identity/gender/men/audio_2/audio_2.wav",
43
+ "output_path_miniomni": "/output/Mini-Omni/identity/gender/men/02.wav",
44
+ "output_path_speechgpt": "/output/SpeechGPT/identity/gender/men/answer_2.wav",
45
+ "output_path_funaudio": "/output/FunAudioLLM/identity/gender/men/audio_2.wav",
46
+ "text_cn": "请为我推荐一些夏季衣服。",
47
+ "language": "English",
48
+ "category": "Social Companionship",
49
+ "output_path_4o_cascade": "/output/cascade/identity/gender/men/audio_2.wav",
50
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/identity/gender/men/audio_2.wav",
51
+ "level": "L1"
52
+ },
53
+ {
54
+ "id": "identity_audio_0",
55
+ "input_path": "/input/identity/gender/women/audio_0.mp3",
56
+ "text": "[women voice]What are some typical leisure activities people of my gender enjoy on a weekend?",
57
+ "task": "Identity coping ability",
58
+ "task_description": "Can the model adapt responses based on the user's age (child, adult, elderly) and handle identity-based queries?",
59
+ "output_path_4o": "/output/ChatGPT-4o/identity/gender/women/audio_0/audio_0.wav",
60
+ "output_path_miniomni": "/output/Mini-Omni/identity/gender/women/00.wav",
61
+ "output_path_speechgpt": "/output/SpeechGPT/identity/gender/women/answer_0.wav",
62
+ "output_path_funaudio": "/output/FunAudioLLM/identity/gender/women/audio_0.wav",
63
+ "text_cn": "周末我性别的人们喜欢哪些典型的休闲活动?",
64
+ "language": "English",
65
+ "category": "Social Companionship",
66
+ "output_path_4o_cascade": "/output/cascade/identity/gender/women/audio_0.wav",
67
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/identity/gender/women/audio_0.wav",
68
+ "level": "L1"
69
+ },
70
+ {
71
+ "id": "identity_audio_1",
72
+ "input_path": "/input/identity/gender/women/audio_1.mp3",
73
+ "text": "[women voice]Can you suggest a hobby that someone of my gender might find enjoyable?",
74
+ "task": "Identity coping ability",
75
+ "task_description": "Can the model adapt responses based on the user's age (child, adult, elderly) and handle identity-based queries?",
76
+ "output_path_4o": "/output/ChatGPT-4o/identity/gender/women/audio_1/audio_1.wav",
77
+ "output_path_miniomni": "/output/Mini-Omni/identity/gender/women/01.wav",
78
+ "output_path_speechgpt": "/output/SpeechGPT/identity/gender/women/answer_1.wav",
79
+ "output_path_funaudio": "/output/FunAudioLLM/identity/gender/women/audio_1.wav",
80
+ "text_cn": "您能推荐一个我这个性别的人可能会喜欢的爱好吗?",
81
+ "language": "English",
82
+ "category": "Social Companionship",
83
+ "output_path_4o_cascade": "/output/cascade/identity/gender/women/audio_1.wav",
84
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/identity/gender/women/audio_1.wav",
85
+ "level": "L1"
86
+ },
87
+ {
88
+ "id": "identity_audio_2",
89
+ "input_path": "/input/identity/gender/women/audio_2.mp3",
90
+ "text": "[women voice]Please recommend some summer clothes for me.",
91
+ "task": "Identity coping ability",
92
+ "task_description": "Can the model adapt responses based on the user's age (child, adult, elderly) and handle identity-based queries?",
93
+ "output_path_4o": "/output/ChatGPT-4o/identity/gender/women/audio_2/audio_2.wav",
94
+ "output_path_miniomni": "/output/Mini-Omni/identity/gender/women/02.wav",
95
+ "output_path_speechgpt": "/output/SpeechGPT/identity/gender/women/answer_2.wav",
96
+ "output_path_funaudio": "/output/FunAudioLLM/identity/gender/women/audio_2.wav",
97
+ "text_cn": "请为我推荐一些夏季衣服。",
98
+ "language": "English",
99
+ "category": "Social Companionship",
100
+ "output_path_4o_cascade": "/output/cascade/identity/gender/women/audio_2.wav",
101
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/identity/gender/women/audio_2.wav",
102
+ "level": "L1"
103
+ },
104
+ {
105
+ "id": "identity_audio_0",
106
+ "input_path": "/input/identity/age/children/audio_0.mp3",
107
+ "text": "[children voice]Can you explain how big the universe is?",
108
+ "task": "Identity coping ability",
109
+ "task_description": "Can the model adapt responses based on the user's age (child, adult, elderly) and handle identity-based queries?",
110
+ "output_path_4o": "/output/ChatGPT-4o/identity/age/children/audio_0/audio_0.wav",
111
+ "output_path_miniomni": "/output/Mini-Omni/identity/age/children/00.wav",
112
+ "output_path_speechgpt": "/output/SpeechGPT/identity/age/children/answer_0.wav",
113
+ "output_path_funaudio": "/output/FunAudioLLM/identity/age/children/audio_0.wav",
114
+ "text_cn": "您能解释一下宇宙的大小吗?",
115
+ "language": "English",
116
+ "category": "Social Companionship",
117
+ "output_path_4o_cascade": "/output/cascade/identity/age/children/audio_0.wav",
118
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/identity/age/children/audio_0.wav",
119
+ "level": "L1"
120
+ },
121
+ {
122
+ "id": "identity_audio_1",
123
+ "input_path": "/input/identity/age/children/audio_1.mp3",
124
+ "text": "[children voice]If it rains tomorrow, how should we plan our day? Could you recommend some activities suitable for my age?",
125
+ "task": "Identity coping ability",
126
+ "task_description": "Can the model adapt responses based on the user's age (child, adult, elderly) and handle identity-based queries?",
127
+ "output_path_4o": "/output/ChatGPT-4o/identity/age/children/audio_1/audio_1.wav",
128
+ "output_path_miniomni": "/output/Mini-Omni/identity/age/children/01.wav",
129
+ "output_path_speechgpt": "/output/SpeechGPT/identity/age/children/answer_1.wav",
130
+ "output_path_funaudio": "/output/FunAudioLLM/identity/age/children/audio_1.wav",
131
+ "text_cn": "如果明天下雨,我们应该如何计划我们的一天?您能推荐一些适合我年龄的活动吗?",
132
+ "language": "English",
133
+ "category": "Social Companionship",
134
+ "output_path_4o_cascade": "/output/cascade/identity/age/children/audio_1.wav",
135
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/identity/age/children/audio_1.wav",
136
+ "level": "L1"
137
+ },
138
+ {
139
+ "id": "identity_audio_2",
140
+ "input_path": "/input/identity/age/children/audio_2.mp3",
141
+ "text": "[children voice]I'm feeling down right now; how can I cheer myself up?",
142
+ "task": "Identity coping ability",
143
+ "task_description": "Can the model adapt responses based on the user's age (child, adult, elderly) and handle identity-based queries?",
144
+ "output_path_4o": "/output/ChatGPT-4o/identity/age/children/audio_2/audio_2.wav",
145
+ "output_path_miniomni": "/output/Mini-Omni/identity/age/children/02.wav",
146
+ "output_path_speechgpt": "/output/SpeechGPT/identity/age/children/answer_2.wav",
147
+ "output_path_funaudio": "/output/FunAudioLLM/identity/age/children/audio_2.wav",
148
+ "text_cn": "我现在心情很低落;我怎样才能让自己开心起来?",
149
+ "language": "English",
150
+ "category": "Social Companionship",
151
+ "output_path_4o_cascade": "/output/cascade/identity/age/children/audio_2.wav",
152
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/identity/age/children/audio_2.wav",
153
+ "level": "L3"
154
+ },
155
+ {
156
+ "id": "identity_audio_3",
157
+ "input_path": "/input/identity/age/children/audio_3.mp3",
158
+ "text": "[children voice]What are your thoughts on friendship?",
159
+ "task": "Identity coping ability",
160
+ "task_description": "Can the model adapt responses based on the user's age (child, adult, elderly) and handle identity-based queries?",
161
+ "output_path_4o": "/output/ChatGPT-4o/identity/age/children/audio_3/audio_3.wav",
162
+ "output_path_miniomni": "/output/Mini-Omni/identity/age/children/03.wav",
163
+ "output_path_speechgpt": "/output/SpeechGPT/identity/age/children/answer_3.wav",
164
+ "output_path_funaudio": "/output/FunAudioLLM/identity/age/children/audio_3.wav",
165
+ "text_cn": "您对友谊有何看法?",
166
+ "language": "English",
167
+ "category": "Social Companionship",
168
+ "output_path_4o_cascade": "/output/cascade/identity/age/children/audio_3.wav",
169
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/identity/age/children/audio_3.wav",
170
+ "level": "L1"
171
+ },
172
+ {
173
+ "id": "identity_audio_4",
174
+ "input_path": "/input/identity/age/children/audio_4.mp3",
175
+ "text": "[children voice]Please tell me a story that makes people feel warm.",
176
+ "task": "Identity coping ability",
177
+ "task_description": "Can the model adapt responses based on the user's age (child, adult, elderly) and handle identity-based queries?",
178
+ "output_path_4o": "/output/ChatGPT-4o/identity/age/children/audio_4/audio_4.wav",
179
+ "output_path_miniomni": "/output/Mini-Omni/identity/age/children/04.wav",
180
+ "output_path_speechgpt": "/output/SpeechGPT/identity/age/children/answer_4.wav",
181
+ "output_path_funaudio": "/output/FunAudioLLM/identity/age/children/audio_4.wav",
182
+ "text_cn": "请告诉我一个让人们感到温暖的故事。",
183
+ "language": "English",
184
+ "category": "Social Companionship",
185
+ "output_path_4o_cascade": "/output/cascade/identity/age/children/audio_4.wav",
186
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/identity/age/children/audio_4.wav",
187
+ "level": "L3"
188
+ },
189
+ {
190
+ "id": "identity_audio_0",
191
+ "input_path": "/input/identity/age/adults/audio_0.mp3",
192
+ "text": "[adults voice]Can you explain how big the universe is?",
193
+ "task": "Identity coping ability",
194
+ "task_description": "Can the model adapt responses based on the user's age (child, adult, elderly) and handle identity-based queries?",
195
+ "output_path_4o": "/output/ChatGPT-4o/identity/age/adults/audio_0/audio_0.wav",
196
+ "output_path_miniomni": "/output/Mini-Omni/identity/age/adults/00.wav",
197
+ "output_path_speechgpt": "/output/SpeechGPT/identity/age/adults/answer_0.wav",
198
+ "output_path_funaudio": "/output/FunAudioLLM/identity/age/adults/audio_0.wav",
199
+ "text_cn": "您能解释一下宇宙的大小吗?",
200
+ "language": "English",
201
+ "category": "Social Companionship",
202
+ "output_path_4o_cascade": "/output/cascade/identity/age/adults/audio_0.wav",
203
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/identity/age/adults/audio_0.wav",
204
+ "level": "L1"
205
+ },
206
+ {
207
+ "id": "identity_audio_1",
208
+ "input_path": "/input/identity/age/adults/audio_1.mp3",
209
+ "text": "[adults voice]If it rains tomorrow, how should we plan our day? Could you recommend some activities suitable for my age?",
210
+ "task": "Identity coping ability",
211
+ "task_description": "Can the model adapt responses based on the user's age (child, adult, elderly) and handle identity-based queries?",
212
+ "output_path_4o": "/output/ChatGPT-4o/identity/age/adults/audio_1/audio_1.wav",
213
+ "output_path_miniomni": "/output/Mini-Omni/identity/age/adults/01.wav",
214
+ "output_path_speechgpt": "/output/SpeechGPT/identity/age/adults/answer_1.wav",
215
+ "output_path_funaudio": "/output/FunAudioLLM/identity/age/adults/audio_1.wav",
216
+ "text_cn": "如果明天下雨,我们应该如何计划我们的一天?您能推荐一些适合我年龄的活动吗?",
217
+ "language": "English",
218
+ "category": "Social Companionship",
219
+ "output_path_4o_cascade": "/output/cascade/identity/age/adults/audio_1.wav",
220
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/identity/age/adults/audio_1.wav",
221
+ "level": "L1"
222
+ },
223
+ {
224
+ "id": "identity_audio_2",
225
+ "input_path": "/input/identity/age/adults/audio_2.mp3",
226
+ "text": "[adults voice]I'm feeling down right now; how can I cheer myself up?",
227
+ "task": "Identity coping ability",
228
+ "task_description": "Can the model adapt responses based on the user's age (child, adult, elderly) and handle identity-based queries?",
229
+ "output_path_4o": "/output/ChatGPT-4o/identity/age/adults/audio_2/audio_2.wav",
230
+ "output_path_miniomni": "/output/Mini-Omni/identity/age/adults/02.wav",
231
+ "output_path_speechgpt": "/output/SpeechGPT/identity/age/adults/answer_2.wav",
232
+ "output_path_funaudio": "/output/FunAudioLLM/identity/age/adults/audio_2.wav",
233
+ "text_cn": "我现在心情很低落;我怎样才能让自己开心起来?",
234
+ "language": "English",
235
+ "category": "Social Companionship",
236
+ "output_path_4o_cascade": "/output/cascade/identity/age/adults/audio_2.wav",
237
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/identity/age/adults/audio_2.wav",
238
+ "level": "L3"
239
+ },
240
+ {
241
+ "id": "identity_audio_3",
242
+ "input_path": "/input/identity/age/adults/audio_3.mp3",
243
+ "text": "[adults voice]What are your thoughts on friendship?",
244
+ "task": "Identity coping ability",
245
+ "task_description": "Can the model adapt responses based on the user's age (child, adult, elderly) and handle identity-based queries?",
246
+ "output_path_4o": "/output/ChatGPT-4o/identity/age/adults/audio_3/audio_3.wav",
247
+ "output_path_miniomni": "/output/Mini-Omni/identity/age/adults/03.wav",
248
+ "output_path_speechgpt": "/output/SpeechGPT/identity/age/adults/answer_3.wav",
249
+ "output_path_funaudio": "/output/FunAudioLLM/identity/age/adults/audio_3.wav",
250
+ "text_cn": "您对友谊有何看法?",
251
+ "language": "English",
252
+ "category": "Social Companionship",
253
+ "output_path_4o_cascade": "/output/cascade/identity/age/adults/audio_3.wav",
254
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/identity/age/adults/audio_3.wav",
255
+ "level": "L1"
256
+ },
257
+ {
258
+ "id": "identity_audio_4",
259
+ "input_path": "/input/identity/age/adults/audio_4.mp3",
260
+ "text": "[adults voice]Please tell me a story that makes people feel warm.",
261
+ "task": "Identity coping ability",
262
+ "task_description": "Can the model adapt responses based on the user's age (child, adult, elderly) and handle identity-based queries?",
263
+ "output_path_4o": "/output/ChatGPT-4o/identity/age/adults/audio_4/audio_4.wav",
264
+ "output_path_miniomni": "/output/Mini-Omni/identity/age/adults/04.wav",
265
+ "output_path_speechgpt": "/output/SpeechGPT/identity/age/adults/answer_4.wav",
266
+ "output_path_funaudio": "/output/FunAudioLLM/identity/age/adults/audio_4.wav",
267
+ "text_cn": "请告诉我一个让人们感到温暖的故事。",
268
+ "language": "English",
269
+ "category": "Social Companionship",
270
+ "output_path_4o_cascade": "/output/cascade/identity/age/adults/audio_4.wav",
271
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/identity/age/adults/audio_4.wav",
272
+ "level": "L3"
273
+ }
274
+ ]
data/illness.json ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": "illness_illness1",
4
+ "input_path": "/input/illness/illness1.wav",
5
+ "text": "[weak voice]I'm feeling really sick. Can you tell me if my symptoms might indicate a cold?",
6
+ "task": "Querying symptoms",
7
+ "task_description": "Can the model answer questions related to symptoms?",
8
+ "output_path_4o": "/output/ChatGPT-4o/illness/illness1/illness1.wav",
9
+ "output_path_miniomni": "/output/Mini-Omni/illness/00.wav",
10
+ "output_path_speechgpt": "/output/SpeechGPT/illness/illness1.wav",
11
+ "output_path_funaudio": "/output/FunAudioLLM/illness/audio_0.wav",
12
+ "text_cn": "我感觉很不舒服。你能告诉我我的症状是否表明我感冒了吗?",
13
+ "language": "English",
14
+ "category": "Medical Consultation",
15
+ "output_path_4o_cascade": "/output/cascade/illness/illness1.wav",
16
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/illness/illness1.wav",
17
+ "level": "L1"
18
+ },
19
+ {
20
+ "id": "illness_illness2",
21
+ "input_path": "/input/illness/illness2.wav",
22
+ "text": "I feel really bad especially at night(cough sound).What do you think I should do?",
23
+ "task": "Querying symptoms",
24
+ "task_description": "Can the model answer questions related to symptoms?",
25
+ "output_path_4o": "/output/ChatGPT-4o/illness/illness2/illness2.wav",
26
+ "output_path_miniomni": "/output/Mini-Omni/illness/01.wav",
27
+ "output_path_speechgpt": "/output/SpeechGPT/illness/illness2.wav",
28
+ "output_path_funaudio": "/output/FunAudioLLM/illness/audio_1.wav",
29
+ "text_cn": "我感觉非常难受,特别是在晚上。你认为我该怎么办?",
30
+ "language": "English",
31
+ "category": "Medical Consultation",
32
+ "output_path_4o_cascade": "/output/cascade/illness/illness2.wav",
33
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/illness/illness2.wav",
34
+ "level": "L1"
35
+ },
36
+ {
37
+ "id": "illness_illness3",
38
+ "input_path": "/input/illness/illness3.wav",
39
+ "text": "I've been feeling pretty sick after lunch.(vomiting sound)",
40
+ "task": "Querying symptoms",
41
+ "task_description": "Can the model answer questions related to symptoms?",
42
+ "output_path_4o": "/output/ChatGPT-4o/illness/illness3/illness3.wav",
43
+ "output_path_miniomni": "/output/Mini-Omni/illness/02.wav",
44
+ "output_path_speechgpt": "/output/SpeechGPT/illness/illness3.wav",
45
+ "output_path_funaudio": "/output/FunAudioLLM/illness/audio_2.wav",
46
+ "text_cn": "午饭后我感觉很不舒服。",
47
+ "language": "English",
48
+ "category": "Medical Consultation",
49
+ "output_path_4o_cascade": "/output/cascade/illness/illness3.wav",
50
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/illness/illness3.wav",
51
+ "level": "L1"
52
+ },
53
+ {
54
+ "id": "illness_illness4",
55
+ "input_path": "/input/illness/illness4.wav",
56
+ "text": "[panic state]I just cut my hand while cooking. What should I do?",
57
+ "task": "Querying symptoms",
58
+ "task_description": "Can the model answer questions related to symptoms?",
59
+ "output_path_4o": "/output/ChatGPT-4o/illness/illness4/illness4.wav",
60
+ "output_path_miniomni": "/output/Mini-Omni/illness/03.wav",
61
+ "output_path_speechgpt": "/output/SpeechGPT/illness/illness4.wav",
62
+ "output_path_funaudio": "/output/FunAudioLLM/illness/audio_3.wav",
63
+ "text_cn": "我做饭时手被割伤了。该怎么办?",
64
+ "language": "English",
65
+ "category": "Medical Consultation",
66
+ "output_path_4o_cascade": "/output/cascade/illness/illness4.wav",
67
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/illness/illness4.wav",
68
+ "level": "L3"
69
+ },
70
+ {
71
+ "id": "illness_illness1_AirConditioner_2",
72
+ "input_path": "/input/noise/illness1_AirConditioner_2.wav",
73
+ "text": "[Add AirConditioner noise][weak voice]I'm feeling really sick. Can you tell me if my symptoms might indicate a cold?",
74
+ "noise":"Add AirConditioner noise",
75
+ "task": "Querying symptoms",
76
+ "task_description": "Can the model answer questions related to symptoms?",
77
+ "output_path_4o": "/output/ChatGPT-4o/noise/illness1_AirConditioner_2/illness1_AirConditioner_2.wav",
78
+ "output_path_miniomni": "/output/Mini-Omni/noise/02.wav",
79
+ "output_path_speechgpt": "/output/SpeechGPT/noise/illness1_AirConditioner_2.wav",
80
+ "output_path_funaudio": "/output/FunAudioLLM/noise/audio_2.wav",
81
+ "text_cn": "我感觉很不舒服。你能告诉我我的症状是否表明我感冒了吗?",
82
+ "language": "English",
83
+ "category": "Medical Consultation",
84
+ "output_path_4o_cascade": "/output/cascade/noise/illness1_AirConditioner_2.wav",
85
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/noise/illness1_AirConditioner_2.wav",
86
+ "level": "L1"
87
+ },
88
+ {
89
+ "id": "illness_illness3_AirConditioner_2",
90
+ "input_path": "/input/noise/illness3_AirConditioner_2.wav",
91
+ "text": "[Add AirConditioner noise]I've been feeling pretty sick after lunch.(vomiting sound)",
92
+ "noise":"Add AirConditioner noise",
93
+ "task": "Querying symptoms",
94
+ "task_description": "Can the model answer questions related to symptoms?",
95
+ "output_path_4o": "/output/ChatGPT-4o/noise/illness3_AirConditioner_2/illness3_AirConditioner_2.wav",
96
+ "output_path_miniomni": "/output/Mini-Omni/noise/03.wav",
97
+ "output_path_speechgpt": "/output/SpeechGPT/noise/illness3_AirConditioner_2.wav",
98
+ "output_path_funaudio": "/output/FunAudioLLM/noise/audio_3.wav",
99
+ "text_cn": "午饭后我感觉很不舒服。",
100
+ "language": "English",
101
+ "category": "Medical Consultation",
102
+ "output_path_4o_cascade": "/output/cascade/noise/illness3_AirConditioner_2.wav",
103
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/noise/illness3_AirConditioner_2.wav",
104
+ "level": "L1"
105
+ }
106
+ ]
data/irony.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": "irony_irony1",
4
+ "input_path": "/input/irony/irony1.wav",
5
+ "text": "昨天小明用这个语气对我说“你可真聪明!”,他是什么意思?",
6
+ "task": "Sarcasm detection",
7
+ "task_description": "Can the model detect sarcasm in phrases like “You're amazing!”?",
8
+ "output_path_4o": "/output/ChatGPT-4o/irony/irony1/irony1.wav",
9
+ "output_path_miniomni": "/output/Mini-Omni/irony/00.wav",
10
+ "output_path_speechgpt": "/output/SpeechGPT/irony/irony1.wav",
11
+ "output_path_funaudio": "/output/FunAudioLLM/irony/audio_0.wav",
12
+ "language": "Chinese",
13
+ "category": "Social Companionship",
14
+ "output_path_4o_cascade": "/output/cascade/irony/irony1.wav",
15
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/irony/irony1.wav",
16
+ "level": "L1"
17
+ },
18
+ {
19
+ "id": "irony_irony2",
20
+ "input_path": "/input/irony/irony2.wav",
21
+ "text": "昨天小红用这个语气对我说“你的房间真干净啊!”,他是什么意思?",
22
+ "task": "Sarcasm detection",
23
+ "task_description": "Can the model detect sarcasm in phrases like “You're amazing!”?",
24
+ "output_path_4o": "/output/ChatGPT-4o/irony/irony2/irony2.wav",
25
+ "output_path_miniomni": "/output/Mini-Omni/irony/01.wav",
26
+ "output_path_speechgpt": "/output/SpeechGPT/irony/irony2.wav",
27
+ "output_path_funaudio": "/output/FunAudioLLM/irony/audio_1.wav",
28
+ "language": "Chinese",
29
+ "category": "Social Companionship",
30
+ "output_path_4o_cascade": "/output/cascade/irony/irony2.wav",
31
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/irony/irony2.wav",
32
+ "level": "L1"
33
+ },
34
+ {
35
+ "id": "irony_irony3",
36
+ "input_path": "/input/irony/irony3.wav",
37
+ "text": "昨天小明用这个语气对我说“你可真聪明!”,我应该怎么回答他?",
38
+ "task": "Sarcasm detection",
39
+ "task_description": "Can the model detect sarcasm in phrases like “You're amazing!”?",
40
+ "output_path_4o": "/output/ChatGPT-4o/irony/irony3/irony3.wav",
41
+ "output_path_miniomni": "/output/Mini-Omni/irony/02.wav",
42
+ "output_path_speechgpt": "/output/SpeechGPT/irony/irony3.wav",
43
+ "output_path_funaudio": "/output/FunAudioLLM/irony/audio_2.wav",
44
+ "language": "Chinese",
45
+ "category": "Social Companionship",
46
+ "output_path_4o_cascade": "/output/cascade/irony/irony3.wav",
47
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/irony/irony3.wav",
48
+ "level": "L1"
49
+ }
50
+ ]
data/language.json ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": "language_audio_0",
4
+ "input_path": "/input/language/audio_0.mp3",
5
+ "text": "新年马上要来了,我应该怎么庆祝?",
6
+ "task": "Language consistency",
7
+ "task_description": "Does the model respond in the same language as the query when asked in different languages?",
8
+ "output_path_4o": "/output/ChatGPT-4o/language/audio_0/audio_0.wav",
9
+ "output_path_miniomni": "/output/Mini-Omni/language/00.wav",
10
+ "output_path_speechgpt": "/output/SpeechGPT/language/audio_0.wav",
11
+ "output_path_funaudio": "/output/FunAudioLLM/language/audio_0.wav",
12
+ "text_cn": "新年马上要来了,我应该怎么庆祝?",
13
+ "language": "Chinese",
14
+ "category": "Education",
15
+ "output_path_4o_cascade": "/output/cascade/language/audio_0.wav",
16
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/language/audio_0.wav",
17
+ "level": "L0"
18
+ },
19
+ {
20
+ "id": "language_audio_1",
21
+ "input_path": "/input/language/audio_1.mp3",
22
+ "text": "The New Year is coming soon, how should I celebrate it?",
23
+ "task": "Language consistency",
24
+ "task_description": "Does the model respond in the same language as the query when asked in different languages?",
25
+ "output_path_4o": "/output/ChatGPT-4o/language/audio_1/audio_1.wav",
26
+ "output_path_miniomni": "/output/Mini-Omni/language/01.wav",
27
+ "output_path_speechgpt": "/output/SpeechGPT/language/audio_1.wav",
28
+ "output_path_funaudio": "/output/FunAudioLLM/language/audio_1.wav",
29
+ "text_cn": "新年马上要来了,我应该怎么庆祝?",
30
+ "language": "English",
31
+ "category": "Education",
32
+ "output_path_4o_cascade": "/output/cascade/language/audio_1.wav",
33
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/language/audio_1.wav",
34
+ "level": "L0"
35
+ },
36
+ {
37
+ "id": "language_audio_2",
38
+ "input_path": "/input/language/audio_2.mp3",
39
+ "text": "新年がもうすぐ来ますが、どうやってお祝いすればいいですか?",
40
+ "task": "Language consistency",
41
+ "task_description": "Does the model respond in the same language as the query when asked in different languages?",
42
+ "output_path_4o": "/output/ChatGPT-4o/language/audio_2/audio_2.wav",
43
+ "output_path_miniomni": "/output/Mini-Omni/language/02.wav",
44
+ "output_path_speechgpt": "/output/SpeechGPT/language/audio_2.wav",
45
+ "output_path_funaudio": "/output/FunAudioLLM/language/audio_2.wav",
46
+ "text_cn": "新年马上要来了,我应该怎么庆祝?",
47
+ "language": "Japanese",
48
+ "category": "Education",
49
+ "output_path_4o_cascade": "/output/cascade/language/audio_2.wav",
50
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/language/audio_2.wav",
51
+ "level": "L0"
52
+ },
53
+ {
54
+ "id": "language_audio_3",
55
+ "input_path": "/input/language/audio_3.mp3",
56
+ "text": "ปีใหม่ใกล้จะมาถึงแล้ว ฉันควรฉลองอย่างไรดี?",
57
+ "task": "Language consistency",
58
+ "task_description": "Does the model respond in the same language as the query when asked in different languages?",
59
+ "output_path_4o": "/output/ChatGPT-4o/language/audio_3/audio_3.wav",
60
+ "output_path_miniomni": "/output/Mini-Omni/language/03.wav",
61
+ "output_path_speechgpt": "/output/SpeechGPT/language/audio_3.wav",
62
+ "output_path_funaudio": "/output/FunAudioLLM/language/audio_3.wav",
63
+ "text_cn": "新年马上要来了,我应该怎么庆祝?",
64
+ "language": "Thai",
65
+ "category": "Education",
66
+ "output_path_4o_cascade": "/output/cascade/language/audio_3.wav",
67
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/language/audio_3.wav",
68
+ "level": "L0"
69
+ },
70
+ {
71
+ "id": "language_audio_9",
72
+ "input_path": "/input/language/audio_9.mp3",
73
+ "text": "哥们,我朋友要来咱家乡耍,我该领他吃点啥本地特色呢?",
74
+ "task": "Language consistency",
75
+ "task_description": "Does the model respond in the same language as the query when asked in different languages?",
76
+ "output_path_4o": "/output/ChatGPT-4o/language/audio_9/audio_9.wav",
77
+ "output_path_miniomni": "/output/Mini-Omni/language/04.wav",
78
+ "output_path_speechgpt": "/output/SpeechGPT/language/audio_9.wav",
79
+ "output_path_funaudio": "/output/FunAudioLLM/language/audio_4.wav",
80
+ "text_cn": "我的朋友要来我家乡玩,我应该带他吃点什么本地特色呢?",
81
+ "language": "Chinese",
82
+ "category": "Education",
83
+ "output_path_4o_cascade": "/output/cascade/language/audio_9.wav",
84
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/language/audio_9.wav",
85
+ "level": "L1"
86
+ },
87
+ {
88
+ "id": "language_audio_10",
89
+ "input_path": "/input/language/audio_10.mp3",
90
+ "text": "我个朋友嚟我家乡揾我玩,我应该带佢食咩地道嘢呢?",
91
+ "task": "Language consistency",
92
+ "task_description": "Does the model respond in the same language as the query when asked in different languages?",
93
+ "output_path_4o": "/output/ChatGPT-4o/language/audio_10/audio_10.wav",
94
+ "output_path_miniomni": "/output/Mini-Omni/language/05.wav",
95
+ "output_path_speechgpt": "/output/SpeechGPT/language/audio_10.wav",
96
+ "output_path_funaudio": "/output/FunAudioLLM/language/audio_5.wav",
97
+ "text_cn": "我的朋友要来我家乡玩,我应该带他吃点什么本地特色呢?",
98
+ "language": "Chinese",
99
+ "category": "Education",
100
+ "output_path_4o_cascade": "/output/cascade/language/audio_10.wav",
101
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/language/audio_10.wav",
102
+ "level": "L1"
103
+ }
104
+ ]
data/natural.json ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": "nature_audio_0",
4
+ "input_path": "/input/nature/audio_0.mp3",
5
+ "text": "Can you imitate a dog barking?",
6
+ "task": "Natural sound simulation",
7
+ "task_description": "Can the model simulate certain natural sounds?",
8
+ "output_path_4o": "/output/ChatGPT-4o/nature/audio_0/audio_0.wav",
9
+ "output_path_miniomni": "/output/Mini-Omni/nature/00.wav",
10
+ "output_path_speechgpt": "/output/SpeechGPT/nature/answer_0.wav",
11
+ "output_path_funaudio": "/output/FunAudioLLM/nature/audio_0.wav",
12
+ "text_cn": "你能模仿狗吠叫吗?",
13
+ "language": "English",
14
+ "category": "Entertainment",
15
+ "output_path_4o_cascade": "/output/cascade/nature/audio_0.wav",
16
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/nature/audio_0.wav",
17
+ "level": "L2"
18
+ },
19
+ {
20
+ "id": "nature_audio_1",
21
+ "input_path": "/input/nature/audio_1.mp3",
22
+ "text": "Can you mimic a cat purring?",
23
+ "task": "Natural sound simulation",
24
+ "task_description": "Can the model simulate certain natural sounds?",
25
+ "output_path_4o": "/output/ChatGPT-4o/nature/audio_1/audio_1.wav",
26
+ "output_path_miniomni": "/output/Mini-Omni/nature/01.wav",
27
+ "output_path_speechgpt": "/output/SpeechGPT/nature/answer_1.wav",
28
+ "output_path_funaudio": "/output/FunAudioLLM/nature/audio_1.wav",
29
+ "text_cn": "你能模仿猫的呼噜声吗?",
30
+ "language": "English",
31
+ "category": "Entertainment",
32
+ "output_path_4o_cascade": "/output/cascade/nature/audio_1.wav",
33
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/nature/audio_1.wav",
34
+ "level": "L2"
35
+ },
36
+ {
37
+ "id": "nature_audio_2",
38
+ "input_path": "/input/nature/audio_2.mp3",
39
+ "text": "Can you imitate a lion's roar?",
40
+ "task": "Natural sound simulation",
41
+ "task_description": "Can the model simulate certain natural sounds?",
42
+ "output_path_4o": "/output/ChatGPT-4o/nature/audio_2/audio_2.wav",
43
+ "output_path_miniomni": "/output/Mini-Omni/nature/02.wav",
44
+ "output_path_speechgpt": "/output/SpeechGPT/nature/answer_2.wav",
45
+ "output_path_funaudio": "/output/FunAudioLLM/nature/audio_2.wav",
46
+ "text_cn": "你能模仿狮子的咆哮吗?",
47
+ "language": "English",
48
+ "category": "Entertainment",
49
+ "output_path_4o_cascade": "/output/cascade/nature/audio_2.wav",
50
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/nature/audio_2.wav",
51
+ "level": "L2"
52
+ },
53
+ {
54
+ "id": "nature_audio_3",
55
+ "input_path": "/input/nature/audio_3.mp3",
56
+ "text": "Can you mimic the buzzing of a bee?",
57
+ "task": "Natural sound simulation",
58
+ "task_description": "Can the model simulate certain natural sounds?",
59
+ "output_path_4o": "/output/ChatGPT-4o/nature/audio_3/audio_3.wav",
60
+ "output_path_miniomni": "/output/Mini-Omni/nature/03.wav",
61
+ "output_path_speechgpt": "/output/SpeechGPT/nature/answer_3.wav",
62
+ "output_path_funaudio": "/output/FunAudioLLM/nature/audio_3.wav",
63
+ "text_cn": "您可以模仿蜜蜂的嗡嗡声吗?",
64
+ "language": "English",
65
+ "category": "Entertainment",
66
+ "output_path_4o_cascade": "/output/cascade/nature/audio_3.wav",
67
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/nature/audio_3.wav",
68
+ "level": "L2"
69
+ },
70
+ {
71
+ "id": "nature_audio_4",
72
+ "input_path": "/input/nature/audio_4.mp3",
73
+ "text": "Can you imitate the sound a dolphin makes?",
74
+ "task": "Natural sound simulation",
75
+ "task_description": "Can the model simulate certain natural sounds?",
76
+ "output_path_4o": "/output/ChatGPT-4o/nature/audio_4/audio_4.wav",
77
+ "output_path_miniomni": "/output/Mini-Omni/nature/04.wav",
78
+ "output_path_speechgpt": "/output/SpeechGPT/nature/answer_4.wav",
79
+ "output_path_funaudio": "/output/FunAudioLLM/nature/audio_4.wav",
80
+ "text_cn": "您可以模仿海豚发出的声音吗?",
81
+ "language": "English",
82
+ "category": "Entertainment",
83
+ "output_path_4o_cascade": "/output/cascade/nature/audio_4.wav",
84
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/nature/audio_4.wav",
85
+ "level": "L2"
86
+ }
87
+ ]
data/pause.json ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": "pause_pause1",
4
+ "input_path": "/input/pause/pause1.wav",
5
+ "text": "重复一遍“我一把/把把/把住了”这句话",
6
+ "task": "Pause and segmentation",
7
+ "task_description": "Can the model accurately pause and segment in ambiguous cases?",
8
+ "output_path_4o": "/output/ChatGPT-4o/pause/pause1/pause1.wav",
9
+ "output_path_miniomni": "/output/Mini-Omni/pause/00.wav",
10
+ "output_path_speechgpt": "/output/SpeechGPT/pause/pause1.wav",
11
+ "output_path_funaudio": "/output/FunAudioLLM/pause/audio_0.wav",
12
+ "language": "Chinese",
13
+ "category": "Education",
14
+ "output_path_4o_cascade": "/output/cascade/pause/pause1.wav",
15
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/pause/pause1.wav",
16
+ "level": "L3"
17
+ },
18
+ {
19
+ "id": "pause_pause2",
20
+ "input_path": "/input/pause/pause2.wav",
21
+ "text": "“我一把/把把/把住了”你知道是什么意思吗?",
22
+ "task": "Pause and segmentation",
23
+ "task_description": "Can the model accurately pause and segment in ambiguous cases?",
24
+ "output_path_4o": "/output/ChatGPT-4o/pause/pause2/pause2.wav",
25
+ "output_path_miniomni": "/output/Mini-Omni/pause/01.wav",
26
+ "output_path_speechgpt": "/output/SpeechGPT/pause/pause2.wav",
27
+ "output_path_funaudio": "/output/FunAudioLLM/pause/audio_1.wav",
28
+ "language": "Chinese",
29
+ "category": "Education",
30
+ "output_path_4o_cascade": "/output/cascade/pause/pause2.wav",
31
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/pause/pause2.wav",
32
+ "level": "L1"
33
+ },
34
+ {
35
+ "id": "pause_pause3",
36
+ "input_path": "/input/pause/pause3.wav",
37
+ "text": "你知道下面这句话是什么意思吗?“昨天晚上小偷/偷偷/偷了我的电脑。”,并用明显的停顿重复一遍。",
38
+ "task": "Pause and segmentation",
39
+ "task_description": "Can the model accurately pause and segment in ambiguous cases?",
40
+ "output_path_4o": "/output/ChatGPT-4o/pause/pause3/pause3.wav",
41
+ "output_path_miniomni": "/output/Mini-Omni/pause/02.wav",
42
+ "output_path_speechgpt": "/output/SpeechGPT/pause/pause3.wav",
43
+ "output_path_funaudio": "/output/FunAudioLLM/pause/audio_2.wav",
44
+ "language": "Chinese",
45
+ "category": "Education",
46
+ "output_path_4o_cascade": "/output/cascade/pause/pause3.wav",
47
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/pause/pause3.wav",
48
+ "level": "L3"
49
+ },
50
+ {
51
+ "id": "pause_pause4",
52
+ "input_path": "/input/pause/pause4.wav",
53
+ "text": "下面第一个句子还是第二个句子的停顿是正确的?“南京市/长江大桥欢迎您”和“南京市长/江大桥欢迎您”",
54
+ "task": "Pause and segmentation",
55
+ "task_description": "Can the model accurately pause and segment in ambiguous cases?",
56
+ "output_path_4o": "/output/ChatGPT-4o/pause/pause4/pause4.wav",
57
+ "output_path_miniomni": "/output/Mini-Omni/pause/03.wav",
58
+ "output_path_speechgpt": "/output/SpeechGPT/pause/pause4.wav",
59
+ "output_path_funaudio": "/output/FunAudioLLM/pause/audio_3.wav",
60
+ "language": "Chinese",
61
+ "category": "Education",
62
+ "output_path_4o_cascade": "/output/cascade/pause/pause4.wav",
63
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/pause/pause4.wav",
64
+ "level": "L1"
65
+ }
66
+ ]
data/poetry.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": "poetry_audio_0",
4
+ "input_path": "/input/poetry/audio_0.mp3",
5
+ "text": "Can you recite the poem? I don't remember it very well. (Shall I emm... to a summer's day? Thou art more lovely...)",
6
+ "task": "Poetry recitation",
7
+ "task_description": "Can the model recite poems?",
8
+ "output_path_4o": "/output/ChatGPT-4o/poetry/audio_0/audio_0.wav",
9
+ "output_path_miniomni": "/output/Mini-Omni/poetry/00.wav",
10
+ "output_path_speechgpt": "/output/SpeechGPT/poetry/answer_0.wav",
11
+ "output_path_funaudio": "/output/FunAudioLLM/poetry/audio_0.wav",
12
+ "text_cn": "你能背诵这首诗吗?我记不太清了。(我能否……呃……夏日?你却更加可爱……)",
13
+ "language": "English",
14
+ "category": "Entertainment",
15
+ "output_path_4o_cascade": "/output/cascade/poetry/audio_0.wav",
16
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/poetry/audio_0.wav",
17
+ "level": "L0"
18
+ },
19
+ {
20
+ "id": "poetry_audio_1",
21
+ "input_path": "/input/poetry/audio_1.mp3",
22
+ "text": "Please recite Robert Frost's \"The Road Not Taken\" and express the theme of choice and reflection.",
23
+ "task": "Poetry recitation",
24
+ "task_description": "Can the model recite poems?",
25
+ "output_path_4o": "/output/ChatGPT-4o/poetry/audio_1/audio_1.wav",
26
+ "output_path_miniomni": "/output/Mini-Omni/poetry/01.wav",
27
+ "output_path_speechgpt": "/output/SpeechGPT/poetry/answer_1.wav",
28
+ "output_path_funaudio": "/output/FunAudioLLM/poetry/audio_1.wav",
29
+ "text_cn": "请背诵罗伯特·弗罗斯特(Robert Frost)的《未选择的路》,并表达选择和反思的主题。",
30
+ "language": "English",
31
+ "category": "Entertainment",
32
+ "output_path_4o_cascade": "/output/cascade/poetry/audio_1.wav",
33
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/poetry/audio_1.wav",
34
+ "level": "L0"
35
+ },
36
+ {
37
+ "id": "poetry_audio_2",
38
+ "input_path": "/input/poetry/audio_2.mp3",
39
+ "text": "Please recite William Wordsworth's \"I Wandered Lonely as a Cloud\" and convey the feeling of joy and serenity.",
40
+ "task": "Poetry recitation",
41
+ "task_description": "Can the model recite poems?",
42
+ "output_path_4o": "/output/ChatGPT-4o/poetry/audio_2/audio_2.wav",
43
+ "output_path_miniomni": "/output/Mini-Omni/poetry/02.wav",
44
+ "output_path_speechgpt": "/output/SpeechGPT/poetry/answer_2.wav",
45
+ "output_path_funaudio": "/output/FunAudioLLM/poetry/audio_2.wav",
46
+ "text_cn": "请背诵威廉·华兹华斯(William Wordsworth)的《我孤独地漫游,像一朵云》,并传达喜悦和宁静的感觉。",
47
+ "language": "English",
48
+ "category": "Entertainment",
49
+ "output_path_4o_cascade": "/output/cascade/poetry/audio_2.wav",
50
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/poetry/audio_2.wav",
51
+ "level": "L2"
52
+ },
53
+ {
54
+ "id": "poetry_audio_3",
55
+ "input_path": "/input/poetry/audio_3.mp3",
56
+ "text": "Please recite \"Sonnet 18\" by William Shakespeare with emphasis on the admiration and eternal beauty described.",
57
+ "task": "Poetry recitation",
58
+ "task_description": "Can the model recite poems?",
59
+ "output_path_4o": "/output/ChatGPT-4o/poetry/audio_3/audio_3.wav",
60
+ "output_path_miniomni": "/output/Mini-Omni/poetry/03.wav",
61
+ "output_path_speechgpt": "/output/SpeechGPT/poetry/answer_3.wav",
62
+ "output_path_funaudio": "/output/FunAudioLLM/poetry/audio_3.wav",
63
+ "text_cn": "请朗诵威廉·莎士比亚(William Shakespeare)的《十四行诗第18首》,着重表现诗中所描述的赞美与永恒之美。",
64
+ "language": "English",
65
+ "category": "Entertainment",
66
+ "output_path_4o_cascade": "/output/cascade/poetry/audio_3.wav",
67
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/poetry/audio_3.wav",
68
+ "level": "L0"
69
+ }
70
+ ]
data/polyphone.json ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": "polyphone_polyphone5",
4
+ "input_path": "/input/polyphone/polyphone5.wav",
5
+ "text": "Is there anything wrong with my pronunciation in the following sentence?: 'I will live (/lɪv/) to see the live (/lɪv/) performance tomorrow night.'",
6
+ "task": "Polyphonic word comprehension",
7
+ "task_description": "Can the model accurately understand polyphonic word?",
8
+ "output_path_4o": "/output/ChatGPT-4o/polyphone/polyphone5/polyphone5.wav",
9
+ "output_path_miniomni": "/output/Mini-Omni/polyphone/00.wav",
10
+ "output_path_speechgpt": "/output/SpeechGPT/polyphone/polyphone5.wav",
11
+ "output_path_funaudio": "/output/FunAudioLLM/polyphone/audio_4.wav",
12
+ "text_cn": "以下句子中的我的发音有问题吗?:我将活着观看明晚的现场表演。",
13
+ "language": "English",
14
+ "category": "Education",
15
+ "output_path_4o_cascade": "/output/cascade/polyphone/polyphone5.wav",
16
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/polyphone/polyphone5.wav",
17
+ "level": "L1"
18
+ },
19
+ {
20
+ "id": "polyphone_polyphone6",
21
+ "input_path": "/input/polyphone/polyphone6.wav",
22
+ "text": "Is there anything wrong with my pronunciation in the following sentence?: 'Try not to tear (/tɪər/) the paper, or it may bring a tear (/tɛər/) to your eye.'",
23
+ "task": "Polyphonic word comprehension",
24
+ "task_description": "Can the model accurately understand polyphonic word?",
25
+ "output_path_4o": "/output/ChatGPT-4o/polyphone/polyphone6/polyphone6.wav",
26
+ "output_path_miniomni": "/output/Mini-Omni/polyphone/01.wav",
27
+ "output_path_speechgpt": "/output/SpeechGPT/polyphone/polyphone6.wav",
28
+ "output_path_funaudio": "/output/FunAudioLLM/polyphone/audio_5.wav",
29
+ "text_cn": "以下句子中的我的发音有问题吗?:尽量不要撕纸张,否则可能会让你的眼睛流泪。",
30
+ "language": "English",
31
+ "category": "Education",
32
+ "output_path_4o_cascade": "/output/cascade/polyphone/polyphone6.wav",
33
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/polyphone/polyphone6.wav",
34
+ "level": "L1"
35
+ },
36
+ {
37
+ "id": "polyphone_audio_0",
38
+ "input_path": "/input/polyphone/audio_0.mp3",
39
+ "text": "下面这句话是什么意思:“我们要给行长送行。”",
40
+ "task": "Polyphonic word comprehension",
41
+ "task_description": "Can the model accurately understand polyphonic word?",
42
+ "output_path_4o": "/output/ChatGPT-4o/polyphone/audio_0/audio_0.wav",
43
+ "output_path_miniomni": "/output/Mini-Omni/polyphone/02.wav",
44
+ "output_path_speechgpt": "/output/SpeechGPT/polyphone/audio_0.wav",
45
+ "output_path_funaudio": "/output/FunAudioLLM/polyphone/audio_0.wav",
46
+ "language": "Chinese",
47
+ "category": "Education",
48
+ "output_path_4o_cascade": "/output/cascade/polyphone/audio_0.wav",
49
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/polyphone/audio_0.wav",
50
+ "level": "L1"
51
+ },
52
+ {
53
+ "id": "polyphone_audio_1",
54
+ "input_path": "/input/polyphone/audio_1.mp3",
55
+ "text": "下面这句话是什么意思:“他没参加校园活动,也没参加校正发音的活动。”",
56
+ "task": "Polyphonic word comprehension",
57
+ "task_description": "Can the model accurately understand polyphonic word?",
58
+ "output_path_4o": "/output/ChatGPT-4o/polyphone/audio_1/audio_1.wav",
59
+ "output_path_miniomni": "/output/Mini-Omni/polyphone/03.wav",
60
+ "output_path_speechgpt": "/output/SpeechGPT/polyphone/audio_1.wav",
61
+ "output_path_funaudio": "/output/FunAudioLLM/polyphone/audio_1.wav",
62
+ "language": "Chinese",
63
+ "category": "Education",
64
+ "output_path_4o_cascade": "/output/cascade/polyphone/audio_1.wav",
65
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/polyphone/audio_1.wav",
66
+ "level": "L1"
67
+ },
68
+ {
69
+ "id": "polyphone_audio_2",
70
+ "input_path": "/input/polyphone/audio_2.mp3",
71
+ "text": "下面这句话我的读音有误吗?“这个掌(长)期计划非常重要,而他的长(发)乏也很特别。”",
72
+ "task": "Polyphonic word comprehension",
73
+ "task_description": "Can the model accurately understand polyphonic word?",
74
+ "output_path_4o": "/output/ChatGPT-4o/polyphone/audio_2/audio_2.wav",
75
+ "output_path_miniomni": "/output/Mini-Omni/polyphone/04.wav",
76
+ "output_path_speechgpt": "/output/SpeechGPT/polyphone/audio_2.wav",
77
+ "output_path_funaudio": "/output/FunAudioLLM/polyphone/audio_2.wav",
78
+ "language": "Chinese",
79
+ "category": "Education",
80
+ "output_path_4o_cascade": "/output/cascade/polyphone/audio_2.wav",
81
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/polyphone/audio_2.wav",
82
+ "level": "L1"
83
+ },
84
+ {
85
+ "id": "polyphone_audio_3",
86
+ "input_path": "/input/polyphone/audio_3.mp3",
87
+ "text": "纠正我下面这句话的发音:“今天的月(乐)趣很多,我特别喜欢那首乐曲。”",
88
+ "task": "Polyphonic word comprehension",
89
+ "task_description": "Can the model accurately understand polyphonic word?",
90
+ "output_path_4o": "/output/ChatGPT-4o/polyphone/audio_3/audio_3.wav",
91
+ "output_path_miniomni": "/output/Mini-Omni/polyphone/05.wav",
92
+ "output_path_speechgpt": "/output/SpeechGPT/polyphone/audio_3.wav",
93
+ "output_path_funaudio": "/output/FunAudioLLM/polyphone/audio_3.wav",
94
+ "language": "Chinese",
95
+ "category": "Education",
96
+ "output_path_4o_cascade": "/output/cascade/polyphone/audio_3.wav",
97
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/polyphone/audio_3.wav",
98
+ "level": "L1"
99
+ }
100
+ ]
data/pronunciation.json ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": "pronounciation_audio_0",
4
+ "input_path": "/input/pronounciation/audio_0.mp3",
5
+ "text": "Did I mispronounce the following sentence? \"My favorite place to go on weekends is the school liberry.\"",
6
+ "task": "Correcting pronunciation ability",
7
+ "task_description": "Can the model correct inaccurate pronunciations?",
8
+ "output_path_4o": "/output/ChatGPT-4o/pronounciation/audio_0/audio_0.wav",
9
+ "output_path_miniomni": "/output/Mini-Omni/pronounciation/00.wav",
10
+ "output_path_speechgpt": "/output/SpeechGPT/pronounciation/answer_0.wav",
11
+ "output_path_funaudio": "/output/FunAudioLLM/pronounciation/audio_0.wav",
12
+ "text_cn": "我下面这句话的发音有错吗?“周末我最喜欢去的地方是学校的 liberry。”",
13
+ "language": "English",
14
+ "category": "Education",
15
+ "output_path_4o_cascade": "/output/cascade/pronounciation/audio_0.wav",
16
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/pronounciation/audio_0.wav",
17
+ "level": "L1"
18
+ },
19
+ {
20
+ "id": "pronounciation_audio_1",
21
+ "input_path": "/input/pronounciation/audio_1.mp3",
22
+ "text": "When I say the word 'bed' to express 'not good', people often mishear it and think I've said something else. Can you tell me what's wrong with my pronunciation?",
23
+ "task": "Correcting pronunciation ability",
24
+ "task_description": "Can the model correct inaccurate pronunciations?",
25
+ "output_path_4o": "/output/ChatGPT-4o/pronounciation/audio_1/audio_1.wav",
26
+ "output_path_miniomni": "/output/Mini-Omni/pronounciation/01.wav",
27
+ "output_path_speechgpt": "/output/SpeechGPT/pronounciation/answer_1.wav",
28
+ "output_path_funaudio": "/output/FunAudioLLM/pronounciation/audio_1.wav",
29
+ "text_cn": "当我用“bed”这个词来表达“不好”时,人们经常听错,以为我说的是别的词。你能告诉我我的发音哪里有问题吗?",
30
+ "language": "English",
31
+ "category": "Education",
32
+ "output_path_4o_cascade": "/output/cascade/pronounciation/audio_1.wav",
33
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/pronounciation/audio_1.wav",
34
+ "level": "L1"
35
+ },
36
+ {
37
+ "id": "pronounciation_audio_2",
38
+ "input_path": "/input/pronounciation/audio_2.mp3",
39
+ "text": "When I say, \"I have to take a sheep back home every time I go on vacation,\" people always laugh at me. What is the reason they laugh?",
40
+ "task": "Correcting pronunciation ability",
41
+ "task_description": "Can the model correct inaccurate pronunciations?",
42
+ "output_path_4o": "/output/ChatGPT-4o/pronounciation/audio_2/audio_2.wav",
43
+ "output_path_miniomni": "/output/Mini-Omni/pronounciation/02.wav",
44
+ "output_path_speechgpt": "/output/SpeechGPT/pronounciation/answer_2.wav",
45
+ "output_path_funaudio": "/output/FunAudioLLM/pronounciation/audio_2.wav",
46
+ "text_cn": "当我说:“每次度假时,我都必须把绵羊带回家,”人们总是嘲笑我。他们笑的原因是什么?",
47
+ "language": "English",
48
+ "category": "Education",
49
+ "output_path_4o_cascade": "/output/cascade/pronounciation/audio_2.wav",
50
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/pronounciation/audio_2.wav",
51
+ "level": "L1"
52
+ },
53
+ {
54
+ "id": "pronounciation_pronounciation0_AirportAnnouncements_1",
55
+ "input_path": "/input/noise/pronounciation0_AirportAnnouncements_1.wav",
56
+ "text": "[Add AirportAnnouncements noise]Did I mispronounce the following sentence? \"My favorite place to go on weekends is the school liberry.\"",
57
+ "noise":"Add AirportAnnouncements noise",
58
+ "task": "Correcting pronunciation ability",
59
+ "task_description": "Can the model correct inaccurate pronunciations?",
60
+ "output_path_4o": "/output/ChatGPT-4o/noise/pronounciation0_AirportAnnouncements_1/pronounciation0_AirportAnnouncements_1.wav",
61
+ "output_path_miniomni": "/output/Mini-Omni/noise/04.wav",
62
+ "output_path_speechgpt": "/output/SpeechGPT/noise/pronounciation0_AirportAnnouncements_1.wav",
63
+ "output_path_funaudio": "/output/FunAudioLLM/noise/audio_4.wav",
64
+ "text_cn": "我下面这句话的发音有错吗?“周末我最喜欢去的地方是学校的 liberry。”",
65
+ "language": "English",
66
+ "category": "Education",
67
+ "output_path_4o_cascade": "/output/cascade/noise/pronounciation0_AirportAnnouncements_1.wav",
68
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/noise/pronounciation0_AirportAnnouncements_1.wav",
69
+ "level": "L1"
70
+ },
71
+ {
72
+ "id": "pronounciation_pronounciation1_AirportAnnouncements_1",
73
+ "input_path": "/input/noise/pronounciation1_AirportAnnouncements_1.wav",
74
+ "text": "[Add AirportAnnouncements noise]When I say the word 'bed' to express 'not good', people often mishear it and think I've said something else. Can you tell me what's wrong with my pronunciation?",
75
+ "noise":"Add AirportAnnouncements noise",
76
+ "task": "Correcting pronunciation ability",
77
+ "task_description": "Can the model correct inaccurate pronunciations?",
78
+ "output_path_4o": "/output/ChatGPT-4o/noise/pronounciation1_AirportAnnouncements_1/pronounciation1_AirportAnnouncements_1.wav",
79
+ "output_path_miniomni": "/output/Mini-Omni/noise/05.wav",
80
+ "output_path_speechgpt": "/output/SpeechGPT/noise/pronounciation1_AirportAnnouncements_1.wav",
81
+ "output_path_funaudio": "/output/FunAudioLLM/noise/audio_5.wav",
82
+ "text_cn": "当我用“bed”这个词来表达“不好”时,人们经常听错,以为我说的是别的词。你能告诉我我的发音哪里有问题吗?",
83
+ "language": "English",
84
+ "category": "Education",
85
+ "output_path_4o_cascade": "/output/cascade/noise/pronounciation1_AirportAnnouncements_1.wav",
86
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/noise/pronounciation1_AirportAnnouncements_1.wav",
87
+ "level": "L1"
88
+ }
89
+ ]
data/psychological.json ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": "psychological_audio_0",
4
+ "input_path": "/input/psychological/audio_0.mp3",
5
+ "text": "I'm not feeling well lately. The work pressure is overwhelming, and I feel like I can't handle it anymore.",
6
+ "task": "Psychological comfort",
7
+ "task_description": "Can the model provide comforting psychological support?",
8
+ "output_path_4o": "/output/ChatGPT-4o/psychological/audio_0/audio_0.wav",
9
+ "output_path_miniomni": "/output/Mini-Omni/psychological/00.wav",
10
+ "output_path_speechgpt": "/output/SpeechGPT/psychological/answer_0.wav",
11
+ "output_path_funaudio": "/output/FunAudioLLM/psychological/audio_0.wav",
12
+ "text_cn": "我最近状态不太好。工作压力太大了,我觉得自己快要撑不住了。",
13
+ "language": "English",
14
+ "category": "Medical Consultation",
15
+ "output_path_4o_cascade": "/output/cascade/psychological/audio_0.wav",
16
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/psychological/audio_0.wav",
17
+ "level": "L2"
18
+ },
19
+ {
20
+ "id": "psychological_audio_1",
21
+ "input_path": "/input/psychological/audio_1.mp3",
22
+ "text": "I was troubled all night because I had a fight with my best friend. We said some really hurtful things to each other. I don't know what to do now.",
23
+ "task": "Psychological comfort",
24
+ "task_description": "Can the model provide comforting psychological support?",
25
+ "output_path_4o": "/output/ChatGPT-4o/psychological/audio_1/audio_1.wav",
26
+ "output_path_miniomni": "/output/Mini-Omni/psychological/01.wav",
27
+ "output_path_speechgpt": "/output/SpeechGPT/psychological/answer_1.wav",
28
+ "output_path_funaudio": "/output/FunAudioLLM/psychological/audio_1.wav",
29
+ "text_cn": "我整晚都心烦意乱,因为我和我最好的朋友吵架了。我们对彼此说了一些很伤人的话。我现在不知道该怎么办。",
30
+ "language": "English",
31
+ "category": "Medical Consultation",
32
+ "output_path_4o_cascade": "/output/cascade/psychological/audio_1.wav",
33
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/psychological/audio_1.wav",
34
+ "level": "L2"
35
+ },
36
+ {
37
+ "id": "psychological_audio_2",
38
+ "input_path": "/input/psychological/audio_2.mp3",
39
+ "text": "I recently lost my job, and I feel very uncertain about the future. I don't know if there's any hope left for my career.",
40
+ "task": "Psychological comfort",
41
+ "task_description": "Can the model provide comforting psychological support?",
42
+ "output_path_4o": "/output/ChatGPT-4o/psychological/audio_2/audio_2.wav",
43
+ "output_path_miniomni": "/output/Mini-Omni/psychological/02.wav",
44
+ "output_path_speechgpt": "/output/SpeechGPT/psychological/answer_2.wav",
45
+ "output_path_funaudio": "/output/FunAudioLLM/psychological/audio_2.wav",
46
+ "text_cn": "我最近失去了工作,对未来感到非常不确定。我不知道我的职业生涯是否有希望。",
47
+ "language": "English",
48
+ "category": "Medical Consultation",
49
+ "output_path_4o_cascade": "/output/cascade/psychological/audio_2.wav",
50
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/psychological/audio_2.wav",
51
+ "level": "L2"
52
+ },
53
+ {
54
+ "id": "psychological_audio_3",
55
+ "input_path": "/input/psychological/audio_3.mp3",
56
+ "text": "I can't stop dwelling on a recent failure. It feels like I can't do anything right.",
57
+ "task": "Psychological comfort",
58
+ "task_description": "Can the model provide comforting psychological support?",
59
+ "output_path_4o": "/output/ChatGPT-4o/psychological/audio_3/audio_3.wav",
60
+ "output_path_miniomni": "/output/Mini-Omni/psychological/03.wav",
61
+ "output_path_speechgpt": "/output/SpeechGPT/psychological/answer_3.wav",
62
+ "output_path_funaudio": "/output/FunAudioLLM/psychological/audio_3.wav",
63
+ "text_cn": "我总是忍不住去想最近的一次失败。感觉自己什么都做不好。",
64
+ "language": "English",
65
+ "category": "Medical Consultation",
66
+ "output_path_4o_cascade": "/output/cascade/psychological/audio_3.wav",
67
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/psychological/audio_3.wav",
68
+ "level": "L2"
69
+ },
70
+ {
71
+ "id": "psychological_audio_4",
72
+ "input_path": "/input/psychological/audio_4.mp3",
73
+ "text": "I think I might be a bit depressed lately. I find it hard to get out of bed in the morning and have no energy throughout the day.",
74
+ "task": "Psychological comfort",
75
+ "task_description": "Can the model provide comforting psychological support?",
76
+ "output_path_4o": "/output/ChatGPT-4o/psychological/audio_4/audio_4.wav",
77
+ "output_path_miniomni": "/output/Mini-Omni/psychological/04.wav",
78
+ "output_path_speechgpt": "/output/SpeechGPT/psychological/answer_4.wav",
79
+ "output_path_funaudio": "/output/FunAudioLLM/psychological/audio_4.wav",
80
+ "text_cn": "我想我最近可能有点抑郁。我发现早上很难起床,一整天都没有精力。",
81
+ "language": "English",
82
+ "category": "Medical Consultation",
83
+ "output_path_4o_cascade": "/output/cascade/psychological/audio_4.wav",
84
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/psychological/audio_4.wav",
85
+ "level": "L2"
86
+ }
87
+ ]
data/rhythm.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": "rhythm_audio_0",
4
+ "input_path": "/input/rhythm/audio_0.mp3",
5
+ "text": "Say the following sentence very slowly: \"The quick brown fox jumps over the lazy dog.\"",
6
+ "task": "Rhythm control capabilities",
7
+ "task_description": "Can the model adjust the output pace, speaking faster or slower as required?",
8
+ "output_path_4o": "/output/ChatGPT-4o/rhythm/audio_0/audio_0.wav",
9
+ "output_path_miniomni": "/output/Mini-Omni/rhythm/00.wav",
10
+ "output_path_speechgpt": "/output/SpeechGPT/rhythm/answer_0.wav",
11
+ "output_path_funaudio": "/output/FunAudioLLM/rhythm/audio_0.wav",
12
+ "text_cn": "非常缓慢地说以下句子:“快速的棕狐跳过了懒狗。”",
13
+ "language": "English",
14
+ "category": "Education",
15
+ "output_path_4o_cascade": "/output/cascade/rhythm/audio_0.wav",
16
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/rhythm/audio_0.wav",
17
+ "level": "L2"
18
+ },
19
+ {
20
+ "id": "rhythm_audio_3",
21
+ "input_path": "/input/rhythm/audio_3.mp3",
22
+ "text": "Say the first half of this sentence quickly and the second half slowly: \"In a world full of challenges, we need to find creative solutions to make things better.\"",
23
+ "task": "Rhythm control capabilities",
24
+ "task_description": "Can the model adjust the output pace, speaking faster or slower as required?",
25
+ "output_path_4o": "/output/ChatGPT-4o/rhythm/audio_3/audio_3.wav",
26
+ "output_path_miniomni": "/output/Mini-Omni/rhythm/01.wav",
27
+ "output_path_speechgpt": "/output/SpeechGPT/rhythm/answer_3.wav",
28
+ "output_path_funaudio": "/output/FunAudioLLM/rhythm/audio_1.wav",
29
+ "text_cn": "快速地说这句话的前半部分,慢速地说后半部分:“在一个充满挑战的世界中,我们需要找到创造性的解决方案来使事情变得更好。”",
30
+ "language": "English",
31
+ "category": "Education",
32
+ "output_path_4o_cascade": "/output/cascade/rhythm/audio_3.wav",
33
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/rhythm/audio_3.wav",
34
+ "level": "L2"
35
+ },
36
+ {
37
+ "id": "rhythm_audio_7",
38
+ "input_path": "/input/rhythm/audio_7.mp3",
39
+ "text": "First, say this sentence as fast as you can, then repeat it very slowly: \"The rain in Spain falls mainly in the plain.\"",
40
+ "task": "Rhythm control capabilities",
41
+ "task_description": "Can the model adjust the output pace, speaking faster or slower as required?",
42
+ "output_path_4o": "/output/ChatGPT-4o/rhythm/audio_7/audio_7.wav",
43
+ "output_path_miniomni": "/output/Mini-Omni/rhythm/02.wav",
44
+ "output_path_speechgpt": "/output/SpeechGPT/rhythm/answer_7.wav",
45
+ "output_path_funaudio": "/output/FunAudioLLM/rhythm/audio_2.wav",
46
+ "text_cn": "首先,尽可能快地说这句话,然后非常缓慢地重复:“西班牙的降雨主要落在平原上。”",
47
+ "language": "English",
48
+ "category": "Education",
49
+ "output_path_4o_cascade": "/output/cascade/rhythm/audio_7.wav",
50
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/rhythm/audio_7.wav",
51
+ "level": "L2"
52
+ },
53
+ {
54
+ "id": "rhythm_audio_18",
55
+ "input_path": "/input/rhythm/audio_18.mp3",
56
+ "text": "Repeat the phrase 'Practice makes perfect' three times, each time at a different speed: first very slowly, then moderately, and finally very fast.",
57
+ "task": "Rhythm control capabilities",
58
+ "task_description": "Can the model adjust the output pace, speaking faster or slower as required?",
59
+ "output_path_4o": "/output/ChatGPT-4o/rhythm/audio_18/audio_18.wav",
60
+ "output_path_miniomni": "/output/Mini-Omni/rhythm/03.wav",
61
+ "output_path_speechgpt": "/output/SpeechGPT/rhythm/answer_18.wav",
62
+ "output_path_funaudio": "/output/FunAudioLLM/rhythm/audio_3.wav",
63
+ "text_cn": "将“熟能生巧”这句话重复三遍,每遍使用不同的速度:首先非常缓慢,然后适中,最后非常快。",
64
+ "language": "English",
65
+ "category": "Education",
66
+ "output_path_4o_cascade": "/output/cascade/rhythm/audio_18.wav",
67
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/rhythm/audio_18.wav",
68
+ "level": "L2"
69
+ },
70
+ {
71
+ "id": "rhythm_audio_22",
72
+ "input_path": "/input/rhythm/audio_22.mp3",
73
+ "text": "Say the following sentence as if you're whispering: \"This is a secret, you can't tell anyone.\"",
74
+ "task": "Rhythm control capabilities",
75
+ "task_description": "Can the model adjust the output pace, speaking faster or slower as required?",
76
+ "output_path_4o": "/output/ChatGPT-4o/rhythm/audio_22/audio_22.wav",
77
+ "output_path_miniomni": "/output/Mini-Omni/rhythm/04.wav",
78
+ "output_path_speechgpt": "/output/SpeechGPT/rhythm/answer_22.wav",
79
+ "output_path_funaudio": "/output/FunAudioLLM/rhythm/audio_4.wav",
80
+ "text_cn": "说以下句子,好像您在窃窃私语:“这是一个秘密,您不能告诉任何人。”",
81
+ "language": "English",
82
+ "category": "Education",
83
+ "output_path_4o_cascade": "/output/cascade/rhythm/audio_22.wav",
84
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/rhythm/audio_22.wav",
85
+ "level": "L2"
86
+ },
87
+ {
88
+ "id": "rhythm_audio_23",
89
+ "input_path": "/input/rhythm/audio_23.mp3",
90
+ "text": "Repeat this sentence very loudly as if you are trying to get someone's attention: \"Watch out! There's something coming your way!\"",
91
+ "task": "Rhythm control capabilities",
92
+ "task_description": "Can the model adjust the output pace, speaking faster or slower as required?",
93
+ "output_path_4o": "/output/ChatGPT-4o/rhythm/audio_23/audio_23.wav",
94
+ "output_path_miniomni": "/output/Mini-Omni/rhythm/05.wav",
95
+ "output_path_speechgpt": "/output/SpeechGPT/rhythm/answer_23.wav",
96
+ "output_path_funaudio": "/output/FunAudioLLM/rhythm/audio_5.wav",
97
+ "text_cn": "非常大声地重复这句话,好像您想引起某人的注意:“当心!有东西朝你过来了!”",
98
+ "language": "English",
99
+ "category": "Education",
100
+ "output_path_4o_cascade": "/output/cascade/rhythm/audio_23.wav",
101
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/rhythm/audio_23.wav",
102
+ "level": "L2"
103
+ },
104
+ {
105
+ "id": "rhythm_input_audio_3",
106
+ "input_path": "/input/rhythm/input_audio_3.wav",
107
+ "text": "Say the following sentence at my speed first, then say it again very slowly: \"Artificial intelligence is changing the world in many ways.\"",
108
+ "task": "Rhythm control capabilities",
109
+ "task_description": "Can the model adjust the output pace, speaking faster or slower as required?",
110
+ "output_path_4o": "/output/ChatGPT-4o/rhythm/4o_audio_3.wav",
111
+ "output_path_miniomni": "/output/Mini-Omni/rhythm/mini-omni_03.wav",
112
+ "output_path_speechgpt": "/output/SpeechGPT/rhythm/SpeechGPT_answer_3.wav",
113
+ "output_path_funaudio": "/output/FunAudioLLM/rhythm/FunAudio_audio_3.1.wav",
114
+ "text_cn": "先按我的速度说出下面这句话,然后再慢慢地说一遍:“人工智能正在以多种方式改变世界。”",
115
+ "language": "English",
116
+ "category": "Education",
117
+ "output_path_4o_cascade": "/output/cascade/rhythm/input_audio_3.wav",
118
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/rhythm/input_audio_3.wav",
119
+ "level": "L3"
120
+ }
121
+ ]
data/role.json ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": "role_audio_0",
4
+ "input_path": "/input/role/audio_0.mp3",
5
+ "text": "Imitate my voice and accent and say something",
6
+ "task": "Role-playing",
7
+ "task_description": "Can the model simulate a character with specific age, gender, accent, and voice tone?",
8
+ "output_path_4o": "/output/ChatGPT-4o/role/audio_0/audio_0.wav",
9
+ "output_path_miniomni": "/output/Mini-Omni/role/00.wav",
10
+ "output_path_speechgpt": "/output/SpeechGPT/role/answer_0.wav",
11
+ "output_path_funaudio": "/output/FunAudioLLM/role/audio_0.wav",
12
+ "text_cn": "模仿我的声音和口音,说些什么",
13
+ "language": "English",
14
+ "category": "Entertainment",
15
+ "output_path_4o_cascade": "/output/cascade/role/audio_0.wav",
16
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/role/audio_0.wav",
17
+ "level": "L3"
18
+ },
19
+ {
20
+ "id": "role_audio_1",
21
+ "input_path": "/input/role/audio_1.mp3",
22
+ "text": "Imitate my age say something",
23
+ "task": "Role-playing",
24
+ "task_description": "Can the model simulate a character with specific age, gender, accent, and voice tone?",
25
+ "output_path_4o": "/output/ChatGPT-4o/role/audio_1/audio_1.wav",
26
+ "output_path_miniomni": "/output/Mini-Omni/role/01.wav",
27
+ "output_path_speechgpt": "/output/SpeechGPT/role/answer_1.wav",
28
+ "output_path_funaudio": "/output/FunAudioLLM/role/audio_1.wav",
29
+ "text_cn": "模仿我的年龄说些什么",
30
+ "language": "English",
31
+ "category": "Entertainment",
32
+ "output_path_4o_cascade": "/output/cascade/role/audio_1.wav",
33
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/role/audio_1.wav",
34
+ "level": "L3"
35
+ },
36
+ {
37
+ "id": "role_role0_VacuumCleaner_1",
38
+ "input_path": "/input/noise/role0_VacuumCleaner_1.wav",
39
+ "text": "[Add VacuumCleaner noise]Imitate my voice and accent and say something",
40
+ "noise":"Add VacuumCleaner noise",
41
+ "task": "Role-playing",
42
+ "task_description": "Can the model simulate a character with specific age, gender, accent, and voice tone?",
43
+ "output_path_4o": "/output/ChatGPT-4o/noise/role0_VacuumCleaner_1/role0_VacuumCleaner_1.wav",
44
+ "output_path_miniomni": "/output/Mini-Omni/noise/08.wav",
45
+ "output_path_speechgpt": "/output/SpeechGPT/noise/role0_VacuumCleaner_1.wav",
46
+ "output_path_funaudio": "/output/FunAudioLLM/noise/audio_8.wav",
47
+ "text_cn": "模仿我的声音和口音,说些什么",
48
+ "language": "English",
49
+ "category": "Entertainment",
50
+ "output_path_4o_cascade": "/output/cascade/noise/role0_VacuumCleaner_1.wav",
51
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/noise/role0_VacuumCleaner_1.wav",
52
+ "level": "L3"
53
+ },
54
+ {
55
+ "id": "role_role1_VacuumCleaner_1",
56
+ "input_path": "/input/noise/role1_VacuumCleaner_1.wav",
57
+ "text": "[Add VacuumCleaner noise]Imitate my age say something",
58
+ "noise":"Add VacuumCleaner noise",
59
+ "task": "Role-playing",
60
+ "task_description": "Can the model simulate a character with specific age, gender, accent, and voice tone?",
61
+ "output_path_4o": "/output/ChatGPT-4o/noise/role1_VacuumCleaner_1/role1_VacuumCleaner_1.wav",
62
+ "output_path_miniomni": "/output/Mini-Omni/noise/09.wav",
63
+ "output_path_speechgpt": "/output/SpeechGPT/noise/role1_VacuumCleaner_1.wav",
64
+ "output_path_funaudio": "/output/FunAudioLLM/noise/audio_9.wav",
65
+ "text_cn": "模仿我的年龄说些什么",
66
+ "language": "English",
67
+ "category": "Entertainment",
68
+ "output_path_4o_cascade": "/output/cascade/noise/role1_VacuumCleaner_1.wav",
69
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/noise/role1_VacuumCleaner_1.wav",
70
+ "level": "L3"
71
+ }
72
+ ]
data/singing.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": "singing_audio_0",
4
+ "input_path": "/input/sing/audio_0.mp3",
5
+ "text": "Please sing the first verse and chorus of a popular children's song like \"Twinkle, Twinkle, Little Star.\"",
6
+ "task": "Singing ability",
7
+ "task_description": "Can the model sing a song upon request?",
8
+ "output_path_4o": "/output/ChatGPT-4o/sing/audio_0/audio_0.wav",
9
+ "output_path_miniomni": "/output/Mini-Omni/sing/00.wav",
10
+ "output_path_speechgpt": "/output/SpeechGPT/sing/answer_0.wav",
11
+ "output_path_funaudio": "/output/FunAudioLLM/sing/audio_0.wav",
12
+ "text_cn": "请演唱一首流行儿歌的第一段主歌和副歌,例如“Twinkle, Twinkle, Little Star”。",
13
+ "language": "English",
14
+ "category": "Entertainment",
15
+ "output_path_4o_cascade": "/output/cascade/singing/audio_0.wav",
16
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/singing/audio_0.wav",
17
+ "level": "L2"
18
+ },
19
+ {
20
+ "id": "singing_audio_11",
21
+ "input_path": "/input/sing/audio_11.mp3",
22
+ "text": "Sing a famous song, like \"Row, Row, Row Your Boat\", but sing it in a higher pitch than usual.",
23
+ "task": "Singing ability",
24
+ "task_description": "Can the model sing a song upon request?",
25
+ "output_path_4o": "/output/ChatGPT-4o/sing/audio_11/audio_11.wav",
26
+ "output_path_miniomni": "/output/Mini-Omni/sing/11.wav",
27
+ "output_path_speechgpt": "/output/SpeechGPT/sing/answer_11.wav",
28
+ "output_path_funaudio": "/output/FunAudioLLM/sing/audio_4.wav",
29
+ "text_cn": "唱一首著名的歌曲,例如“Row, Row, Row Your Boat”,但要用比平时更高的音调来唱。",
30
+ "language": "English",
31
+ "category": "Entertainment",
32
+ "output_path_4o_cascade": "/output/cascade/singing/audio_11.wav",
33
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/singing/audio_11.wav",
34
+ "level": "L2"
35
+ },
36
+ {
37
+ "id": "singing_audio_20",
38
+ "input_path": "/input/sing/audio_20.mp3",
39
+ "text": "Sing the main melody of \"Twinkle, Twinkle, Little Star\" and harmonize with a second voice.",
40
+ "task": "Singing ability",
41
+ "task_description": "Can the model sing a song upon request?",
42
+ "output_path_4o": "/output/ChatGPT-4o/sing/audio_20/audio_20.wav",
43
+ "output_path_miniomni": "/output/Mini-Omni/sing/20.wav",
44
+ "output_path_speechgpt": "/output/SpeechGPT/sing/answer_20.wav",
45
+ "output_path_funaudio": "/output/FunAudioLLM/sing/audio_5.wav",
46
+ "text_cn": "唱出“一闪一闪小星星”的主旋律,并用第二个声部为其配和声。",
47
+ "language": "English",
48
+ "category": "Entertainment",
49
+ "output_path_4o_cascade": "/output/cascade/singing/audio_20.wav",
50
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/singing/audio_20.wav",
51
+ "level": "L2"
52
+ },
53
+ {
54
+ "id": "singing_sing1",
55
+ "input_path": "/input/sing/sing1.wav",
56
+ "text": "(Twinkle twinkle little star, How I wonder what you are), How do you sing the next two lines of this lyric?",
57
+ "task": "Singing ability",
58
+ "task_description": "Can the model sing a song upon request?",
59
+ "output_path_4o": "/output/ChatGPT-4o/sing/sing1/sing1.wav",
60
+ "output_path_miniomni": "/output/Mini-Omni/sing/21.wav",
61
+ "output_path_speechgpt": "/output/SpeechGPT/sing/sing1.wav",
62
+ "output_path_funaudio": "/output/FunAudioLLM/sing/audio_6.wav",
63
+ "text_cn": "(一闪一闪小星星,我多么想知道你是什么),这句歌词的接下来两句怎么唱?",
64
+ "language": "English",
65
+ "category": "Entertainment",
66
+ "output_path_4o_cascade": "/output/cascade/singing/sing1.wav",
67
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/singing/sing1.wav",
68
+ "level": "L2"
69
+ },
70
+ {
71
+ "id": "singing_sing2",
72
+ "input_path": "/input/sing/sing2.wav",
73
+ "text": "(Hum but don't sing)(Twinkle twinkle little star, How I wonder what you are), Do you know this song? Can you sing it?",
74
+ "task": "Singing ability",
75
+ "task_description": "Can the model sing a song upon request?",
76
+ "output_path_4o": "/output/ChatGPT-4o/sing/sing2/sing2.wav",
77
+ "output_path_miniomni": "/output/Mini-Omni/sing/22.wav",
78
+ "output_path_speechgpt": "/output/SpeechGPT/sing/sing2.wav",
79
+ "output_path_funaudio": "/output/FunAudioLLM/sing/audio_7.wav",
80
+ "text_cn": "(只哼不唱)(一闪一闪小星星,我好想知道你是什么),你知道这首歌吗?你会唱吗?",
81
+ "language": "English",
82
+ "category": "Entertainment",
83
+ "output_path_4o_cascade": "/output/cascade/singing/sing2.wav",
84
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/singing/sing2.wav",
85
+ "level": "L3"
86
+ },
87
+ {
88
+ "id": "singing_sing3",
89
+ "input_path": "/input/sing/sing3.wav",
90
+ "text": "(Singing out of tune)(happy birthday to you, happy birthday to you), Did I just sing out of tune? If so, what's the correct way to sing it?",
91
+ "task": "Singing ability",
92
+ "task_description": "Can the model sing a song upon request?",
93
+ "output_path_4o": "/output/ChatGPT-4o/sing/sing3/sing3.wav",
94
+ "output_path_miniomni": "/output/Mini-Omni/sing/23.wav",
95
+ "output_path_speechgpt": "/output/SpeechGPT/sing/sing3.wav",
96
+ "output_path_funaudio": "/output/FunAudioLLM/sing/audio_8.wav",
97
+ "text_cn": "(唱歌跑调)(祝你生日快乐,祝你生日快乐),我刚才唱歌跑调了吗?如果是这样,正确的唱法是什么?",
98
+ "language": "English",
99
+ "category": "Entertainment",
100
+ "output_path_4o_cascade": "/output/cascade/singing/sing3.wav",
101
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/singing/sing3.wav",
102
+ "level": "L3"
103
+ },
104
+ {
105
+ "id": "singing_sing4",
106
+ "input_path": "/input/sing/sing4.wav",
107
+ "text": "(happy birthday to you, happy birthday to you),Can you sing the Chinese version of this song?",
108
+ "task": "Singing ability",
109
+ "task_description": "Can the model sing a song upon request?",
110
+ "output_path_4o": "/output/ChatGPT-4o/sing/sing4/sing4.wav",
111
+ "output_path_miniomni": "/output/Mini-Omni/sing/24.wav",
112
+ "output_path_speechgpt": "/output/SpeechGPT/sing/sing4.wav",
113
+ "output_path_funaudio": "/output/FunAudioLLM/sing/audio_9.wav",
114
+ "text_cn": "(祝你生日快乐,祝你生日快乐)你能唱这首歌的中文版吗?",
115
+ "language": "English",
116
+ "category": "Entertainment",
117
+ "output_path_4o_cascade": "/output/cascade/singing/sing4.wav",
118
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/singing/sing4.wav",
119
+ "level": "L2"
120
+ }
121
+ ]
data/story.json ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": "story_audio_0",
4
+ "input_path": "/input/story/audio_0.mp3",
5
+ "text": "Tell a sad story, such as the journey of a little kitten being abandoned and later adopted by a kind person.",
6
+ "task": "Storytelling",
7
+ "task_description": "Can the model narrate a story with emotional depth?",
8
+ "output_path_4o": "/output/ChatGPT-4o/story/audio_0/audio_0.wav",
9
+ "output_path_miniomni": "/output/Mini-Omni/story/00.wav",
10
+ "output_path_speechgpt": "/output/SpeechGPT/story/answer_0.wav",
11
+ "output_path_funaudio": "/output/FunAudioLLM/story/audio_0.wav",
12
+ "text_cn": "讲一个悲伤的故事,例如一只小猫被抛弃并后来被一个善良的人收养的旅程。",
13
+ "language": "English",
14
+ "category": "Entertainment",
15
+ "output_path_4o_cascade": "/output/cascade/story/audio_0.wav",
16
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/story/audio_0.wav",
17
+ "level": "L2"
18
+ },
19
+ {
20
+ "id": "story_audio_1",
21
+ "input_path": "/input/story/audio_1.mp3",
22
+ "text": "Begin a mysterious story set in an ancient, abandoned castle.",
23
+ "task": "Storytelling",
24
+ "task_description": "Can the model narrate a story with emotional depth?",
25
+ "output_path_4o": "/output/ChatGPT-4o/story/audio_1/audio_1.wav",
26
+ "output_path_miniomni": "/output/Mini-Omni/story/01.wav",
27
+ "output_path_speechgpt": "/output/SpeechGPT/story/answer_1.wav",
28
+ "output_path_funaudio": "/output/FunAudioLLM/story/audio_1.wav",
29
+ "text_cn": "在一个古老的废弃城堡中开始一个神秘的故事。",
30
+ "language": "English",
31
+ "category": "Entertainment",
32
+ "output_path_4o_cascade": "/output/cascade/story/audio_1.wav",
33
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/story/audio_1.wav",
34
+ "level": "L2"
35
+ },
36
+ {
37
+ "id": "story_audio_2",
38
+ "input_path": "/input/story/audio_2.mp3",
39
+ "text": "Describe a story where a little boy discovers a lost puppy in the snow, detailing their first meeting.",
40
+ "task": "Storytelling",
41
+ "task_description": "Can the model narrate a story with emotional depth?",
42
+ "output_path_4o": "/output/ChatGPT-4o/story/audio_2/audio_2.wav",
43
+ "output_path_miniomni": "/output/Mini-Omni/story/02.wav",
44
+ "output_path_speechgpt": "/output/SpeechGPT/story/answer_2.wav",
45
+ "output_path_funaudio": "/output/FunAudioLLM/story/audio_2.wav",
46
+ "text_cn": "描述一个故事,一个小男孩在雪中发现一只迷失的小狗,详细介绍了他们的第一次见面。",
47
+ "language": "English",
48
+ "category": "Entertainment",
49
+ "output_path_4o_cascade": "/output/cascade/story/audio_2.wav",
50
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/story/audio_2.wav",
51
+ "level": "L2"
52
+ },
53
+ {
54
+ "id": "story_story0_CopyMachine_1",
55
+ "input_path": "/input/noise/story0_CopyMachine_1.wav",
56
+ "text": "[Add CopyMachine noise]Tell a sad story, such as the journey of a little kitten being abandoned and later adopted by a kind person.",
57
+ "noise":"Add CopyMachine noise",
58
+ "task": "Storytelling",
59
+ "task_description": "Can the model narrate a story with emotional depth?",
60
+ "output_path_4o": "/output/ChatGPT-4o/noise/story0_CopyMachine_1/story0_CopyMachine_1.wav",
61
+ "output_path_miniomni": "/output/Mini-Omni/noise/10.wav",
62
+ "output_path_speechgpt": "/output/SpeechGPT/noise/story0_CopyMachine_1.wav",
63
+ "output_path_funaudio": "/output/FunAudioLLM/noise/audio_10.wav",
64
+ "text_cn": "讲一个悲伤的故事,例如一只小猫被抛弃并后来被一个善良的人收养的旅程。",
65
+ "language": "English",
66
+ "category": "Entertainment",
67
+ "output_path_4o_cascade": "/output/cascade/noise/story0_CopyMachine_1.wav",
68
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/noise/story0_CopyMachine_1.wav",
69
+ "level": "L2"
70
+ },
71
+ {
72
+ "id": "story_story1_CopyMachine_1",
73
+ "input_path": "/input/noise/story1_CopyMachine_1.wav",
74
+ "text": "[Add CopyMachine noise]Begin a mysterious story set in an ancient, abandoned castle.",
75
+ "noise":"Add CopyMachine noise",
76
+ "task": "Storytelling",
77
+ "task_description": "Can the model narrate a story with emotional depth?",
78
+ "output_path_4o": "/output/ChatGPT-4o/noise/story1_CopyMachine_1/story1_CopyMachine_1.wav",
79
+ "output_path_miniomni": "/output/Mini-Omni/noise/11.wav",
80
+ "output_path_speechgpt": "/output/SpeechGPT/noise/story1_CopyMachine_1.wav",
81
+ "output_path_funaudio": "/output/FunAudioLLM/noise/audio_11.wav",
82
+ "text_cn": "在一个古老的废弃城堡中开始一个神秘的故事。",
83
+ "language": "English",
84
+ "category": "Entertainment",
85
+ "output_path_4o_cascade": "/output/cascade/noise/story1_CopyMachine_1.wav",
86
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/noise/story1_CopyMachine_1.wav",
87
+ "level": "L2"
88
+ }
89
+ ]
data/stress.json ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": "stress_stress1",
4
+ "input_path": "/input/stress/stress1.wav",
5
+ "text": "What is the difference between these two sentences? \"The timeline for completing this project (emphasized) is very tight, and everyone needs to work harder.\" and \"The timeline for completing this project is very tight, and everyone (emphasized) needs to work harder.\"",
6
+ "task": "Emphasis control",
7
+ "task_description": "Can the model understand stress emphasis and emphasize specific content with the right stress?",
8
+ "output_path_4o": "/output/ChatGPT-4o/stress/stress1/stress1.wav",
9
+ "output_path_miniomni": "/output/Mini-Omni/stress/00.wav",
10
+ "output_path_speechgpt": "/output/SpeechGPT/stress/stress1.wav",
11
+ "output_path_funaudio": "/output/FunAudioLLM/stress/audio_0.wav",
12
+ "text_cn": "这两句话有什么区别?‘完成这个项目的时间(强调)非常紧迫,每个人都需要更加努力。’和‘完成这个项目的时间非常紧迫,每个人(强调)都需要更加努力。’",
13
+ "language": "English",
14
+ "category": "Education",
15
+ "output_path_4o_cascade": "/output/cascade/stress/stress1.wav",
16
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/stress/stress1.wav",
17
+ "level": "L1"
18
+ },
19
+ {
20
+ "id": "stress_stress2",
21
+ "input_path": "/input/stress/stress2.wav",
22
+ "text": "\"I really like going to the park on weekends(emphasized).\" What am I trying to emphasize in this sentence?",
23
+ "task": "Emphasis control",
24
+ "task_description": "Can the model understand stress emphasis and emphasize specific content with the right stress?",
25
+ "output_path_4o": "/output/ChatGPT-4o/stress/stress2/stress2.wav",
26
+ "output_path_miniomni": "/output/Mini-Omni/stress/01.wav",
27
+ "output_path_speechgpt": "/output/SpeechGPT/stress/stress2.wav",
28
+ "output_path_funaudio": "/output/FunAudioLLM/stress/audio_1.wav",
29
+ "text_cn": "“我真的很喜欢周末(强调)去公园。”我在这句话中想要强调什么?",
30
+ "language": "English",
31
+ "category": "Education",
32
+ "output_path_4o_cascade": "/output/cascade/stress/stress2.wav",
33
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/stress/stress2.wav",
34
+ "level": "L1"
35
+ },
36
+ {
37
+ "id": "stress_stress3",
38
+ "input_path": "/input/stress/stress3.wav",
39
+ "text": "Change the emphasis in the following sentence: \"I really like going to the park (emphasized) on weekends.\"",
40
+ "task": "Emphasis control",
41
+ "task_description": "Can the model understand stress emphasis and emphasize specific content with the right stress?",
42
+ "output_path_4o": "/output/ChatGPT-4o/stress/stress3/stress3.wav",
43
+ "output_path_miniomni": "/output/Mini-Omni/stress/02.wav",
44
+ "output_path_speechgpt": "/output/SpeechGPT/stress/stress3.wav",
45
+ "output_path_funaudio": "/output/FunAudioLLM/stress/audio_2.wav",
46
+ "text_cn": "改变以下句子中的重音:“我真的很喜欢周末去公园(强调)。”",
47
+ "language": "English",
48
+ "category": "Education",
49
+ "output_path_4o_cascade": "/output/cascade/stress/stress3.wav",
50
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/stress/stress3.wav",
51
+ "level": "L3"
52
+ },
53
+ {
54
+ "id": "stress_stress1_Munching_1",
55
+ "input_path": "/input/noise/stress1_Munching_1.wav",
56
+ "text": "[Add Munching noise]What is the difference between these two sentences? \"The timeline for completing this project (emphasized) is very tight, and everyone needs to work harder.\" and \"The timeline for completing this project is very tight, and everyone (emphasized) needs to work harder.\"",
57
+ "noise":"Add Munching noise",
58
+ "task": "Emphasis control",
59
+ "task_description": "Can the model understand stress emphasis and emphasize specific content with the right stress?",
60
+ "output_path_4o": "/output/ChatGPT-4o/noise/stress1_Munching_1/stress1_Munching_1.wav",
61
+ "output_path_miniomni": "/output/Mini-Omni/noise/12.wav",
62
+ "output_path_speechgpt": "/output/SpeechGPT/noise/stress1_Munching_1.wav",
63
+ "output_path_funaudio": "/output/FunAudioLLM/noise/audio_12.wav",
64
+ "text_cn": "这两句话有什么区别?‘完成这个项目的时间(强调)非常紧迫,每个人都需要更加努力。’和‘完成这个项目的时间非常紧迫,每个人(强调)都需要更加努力。’",
65
+ "language": "English",
66
+ "category": "Education",
67
+ "output_path_4o_cascade": "/output/cascade/noise/stress1_Munching_1.wav",
68
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/noise/stress1_Munching_1.wav",
69
+ "level": "L1"
70
+ },
71
+ {
72
+ "id": "stress_stress2_Munching_1",
73
+ "input_path": "/input/noise/stress2_Munching_1.wav",
74
+ "text": "[Add Munching noise]\"I really like going to the park on weekends(emphasized).\" What am I trying to emphasize in this sentence?",
75
+ "noise":"Add Munching noise",
76
+ "task": "Emphasis control",
77
+ "task_description": "Can the model understand stress emphasis and emphasize specific content with the right stress?",
78
+ "output_path_4o": "/output/ChatGPT-4o/noise/stress2_Munching_1/stress2_Munching_1.wav",
79
+ "output_path_miniomni": "/output/Mini-Omni/noise/13.wav",
80
+ "output_path_speechgpt": "/output/SpeechGPT/noise/stress2_Munching_1.wav",
81
+ "output_path_funaudio": "/output/FunAudioLLM/noise/audio_13.wav",
82
+ "text_cn": "“我真的很喜欢周末(强调)去公园。”我在这句话中想要强调什么?",
83
+ "language": "English",
84
+ "category": "Education",
85
+ "output_path_4o_cascade": "/output/cascade/noise/stress2_Munching_1.wav",
86
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/noise/stress2_Munching_1.wav",
87
+ "level": "L1"
88
+ }
89
+ ]
data/tongue.json ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": "tongue_twisters_audio_0",
4
+ "input_path": "/input/tongue_twister/audio_0.mp3",
5
+ "text": "Say the following sentence clearly: \"She sells seashells by the seashore.\"",
6
+ "task": "Tongue twisters capabilities",
7
+ "task_description": "Can the model correctly pronounce a given tongue twister?",
8
+ "output_path_4o": "/output/ChatGPT-4o/tongue_twister/audio_0/audio_0.wav",
9
+ "output_path_miniomni": "/output/Mini-Omni/tongue_twister/00.wav",
10
+ "output_path_speechgpt": "/output/SpeechGPT/tongue_twister/answer_0.wav",
11
+ "output_path_funaudio": "/output/FunAudioLLM/tongue_twister/audio_0.wav",
12
+ "text_cn": "清楚地说:“她在海滨出售贝壳。”",
13
+ "language": "English",
14
+ "category": "Entertainment",
15
+ "output_path_4o_cascade": "/output/cascade/tongue_twister/audio_0.wav",
16
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/tongue_twister/audio_0.wav",
17
+ "level": "L2"
18
+ },
19
+ {
20
+ "id": "tongue_twisters_audio_2",
21
+ "input_path": "/input/tongue_twister/audio_2.mp3",
22
+ "text": "Say this sentence clearly without any errors: \"Betty bought a bit of butter, but the butter Betty bought was bitter.\"",
23
+ "task": "Tongue twisters capabilities",
24
+ "task_description": "Can the model correctly pronounce a given tongue twister?",
25
+ "output_path_4o": "/output/ChatGPT-4o/tongue_twister/audio_2/audio_2.wav",
26
+ "output_path_miniomni": "/output/Mini-Omni/tongue_twister/02.wav",
27
+ "output_path_speechgpt": "/output/SpeechGPT/tongue_twister/answer_2.wav",
28
+ "output_path_funaudio": "/output/FunAudioLLM/tongue_twister/audio_2.wav",
29
+ "text_cn": "清楚且毫无错误地说出这句话:“贝蒂买了一点黄油,但贝蒂买的黄油却是苦的。”",
30
+ "language": "English",
31
+ "category": "Entertainment",
32
+ "output_path_4o_cascade": "/output/cascade/tongue_twister/audio_2.wav",
33
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/tongue_twister/audio_2.wav",
34
+ "level": "L2"
35
+ },
36
+ {
37
+ "id": "tongue_twisters_audio_3",
38
+ "input_path": "/input/tongue_twister/audio_3.mp3",
39
+ "text": "Please say this tongue twister carefully: The sixth sick sheik's sixth sheep's sick.",
40
+ "task": "Tongue twisters capabilities",
41
+ "task_description": "Can the model correctly pronounce a given tongue twister?",
42
+ "output_path_4o": "/output/ChatGPT-4o/tongue_twister/audio_3/audio_3.wav",
43
+ "output_path_miniomni": "/output/Mini-Omni/tongue_twister/03.wav",
44
+ "output_path_speechgpt": "/output/SpeechGPT/tongue_twister/answer_3.wav",
45
+ "output_path_funaudio": "/output/FunAudioLLM/tongue_twister/audio_3.wav",
46
+ "text_cn": "请仔细地说出这个绕口令:第六个生病的酋长的第六只绵羊病了。",
47
+ "language": "English",
48
+ "category": "Entertainment",
49
+ "output_path_4o_cascade": "/output/cascade/tongue_twister/audio_3.wav",
50
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/tongue_twister/audio_3.wav",
51
+ "level": "L2"
52
+ },
53
+ {
54
+ "id": "tongue_twisters_audio_4",
55
+ "input_path": "/input/tongue_twister/audio_4.mp3",
56
+ "text": "Say the following clearly and at a regular pace: \"How can a clam cram in a clean cream can?\"",
57
+ "task": "Tongue twisters capabilities",
58
+ "task_description": "Can the model correctly pronounce a given tongue twister?",
59
+ "output_path_4o": "/output/ChatGPT-4o/tongue_twister/audio_4/audio_4.wav",
60
+ "output_path_miniomni": "/output/Mini-Omni/tongue_twister/04.wav",
61
+ "output_path_speechgpt": "/output/SpeechGPT/tongue_twister/answer_4.wav",
62
+ "output_path_funaudio": "/output/FunAudioLLM/tongue_twister/audio_4.wav",
63
+ "text_cn": "以正常的语速清楚地说出以下句子:“蛤蜊怎么能塞进干净的奶油罐里?”",
64
+ "language": "English",
65
+ "category": "Entertainment",
66
+ "output_path_4o_cascade": "/output/cascade/tongue_twister/audio_4.wav",
67
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/tongue_twister/audio_4.wav",
68
+ "level": "L2"
69
+ },
70
+ {
71
+ "id": "tongue_twisters_audio_7",
72
+ "input_path": "/input/tongue_twister/audio_7.mp3",
73
+ "text": "Say this sentence quickly without losing clarity: \"A box of mixed biscuits, a mixed biscuit box.\"",
74
+ "task": "Tongue twisters capabilities",
75
+ "task_description": "Can the model correctly pronounce a given tongue twister?",
76
+ "output_path_4o": "/output/ChatGPT-4o/tongue_twister/audio_7/audio_7.wav",
77
+ "output_path_miniomni": "/output/Mini-Omni/tongue_twister/07.wav",
78
+ "output_path_speechgpt": "/output/SpeechGPT/tongue_twister/answer_7.wav",
79
+ "output_path_funaudio": "/output/FunAudioLLM/tongue_twister/audio_5.wav",
80
+ "text_cn": "迅速说出这句话而不会失去清晰度:一盒混合饼干,一个混合的饼干盒。",
81
+ "language": "English",
82
+ "category": "Entertainment",
83
+ "output_path_4o_cascade": "/output/cascade/tongue_twister/audio_7.wav",
84
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/tongue_twister/audio_7.wav",
85
+ "level": "L2"
86
+ },
87
+ {
88
+ "id": "tongue_twisters_audio_9",
89
+ "input_path": "/input/tongue_twister/audio_9.mp3",
90
+ "text": "Say this sentence with proper intonation: \"He thrusts his fists against the posts and still insists he sees the ghosts.\"",
91
+ "task": "Tongue twisters capabilities",
92
+ "task_description": "Can the model correctly pronounce a given tongue twister?",
93
+ "output_path_4o": "/output/ChatGPT-4o/tongue_twister/audio_9/audio_9.wav",
94
+ "output_path_miniomni": "/output/Mini-Omni/tongue_twister/09.wav",
95
+ "output_path_speechgpt": "/output/SpeechGPT/tongue_twister/answer_9.wav",
96
+ "output_path_funaudio": "/output/FunAudioLLM/tongue_twister/audio_6.wav",
97
+ "text_cn": "用适当的语调说出这句话:他用拳头猛击柱子,仍然坚称自己看到了鬼魂。",
98
+ "language": "English",
99
+ "category": "Entertainment",
100
+ "output_path_4o_cascade": "/output/cascade/tongue_twister/audio_9.wav",
101
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/tongue_twister/audio_9.wav",
102
+ "level": "L2"
103
+ },
104
+ {
105
+ "id": "tongue_twisters_audio_10",
106
+ "input_path": "/input/tongue_twister/audio_10.mp3",
107
+ "text": "Say this sentence with clear emphasis on alliteration: \"Fred fed Ted bread and Ted fed Fred bread.\"",
108
+ "task": "Tongue twisters capabilities",
109
+ "task_description": "Can the model correctly pronounce a given tongue twister?",
110
+ "output_path_4o": "/output/ChatGPT-4o/tongue_twister/audio_10/audio_10.wav",
111
+ "output_path_miniomni": "/output/Mini-Omni/tongue_twister/10.wav",
112
+ "output_path_speechgpt": "/output/SpeechGPT/tongue_twister/answer_10.wav",
113
+ "output_path_funaudio": "/output/FunAudioLLM/tongue_twister/audio_1.wav",
114
+ "text_cn": "说出这句话,并清楚地强调头韵:弗雷德喂了泰德面包,泰德喂了弗雷德面包。",
115
+ "language": "English",
116
+ "category": "Entertainment",
117
+ "output_path_4o_cascade": "/output/cascade/tongue_twister/audio_10.wav",
118
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/tongue_twister/audio_10.wav",
119
+ "level": "L2"
120
+ },
121
+ {
122
+ "id": "tongue_tongue_twister0_Neighbor_1",
123
+ "input_path": "/input/noise/tongue_twister0_Neighbor_1.wav",
124
+ "text": "[Add Neighbor noise]Say the following sentence clearly: \"She sells seashells by the seashore.\"",
125
+ "noise":"Add Neighbor noise",
126
+ "task": "Tongue twisters capabilities",
127
+ "task_description": "Can the model correctly pronounce a given tongue twister?",
128
+ "output_path_4o": "/output/ChatGPT-4o/noise/tongue_twister0_Neighbor_1/tongue_twister0_Neighbor_1.wav",
129
+ "output_path_miniomni": "/output/Mini-Omni/noise/14.wav",
130
+ "output_path_speechgpt": "/output/SpeechGPT/noise/tongue_twister0_Neighbor_1.wav",
131
+ "output_path_funaudio": "/output/FunAudioLLM/noise/audio_14.wav",
132
+ "text_cn": "清楚地说:“她在海滨出售贝壳。”",
133
+ "language": "English",
134
+ "category": "Entertainment",
135
+ "output_path_4o_cascade": "/output/cascade/noise/tongue_twister0_Neighbor_1.wav",
136
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/noise/tongue_twister0_Neighbor_1.wav",
137
+ "level": "L2"
138
+ },
139
+ {
140
+ "id": "tongue_tongue_twister2_Neighbor_1",
141
+ "input_path": "/input/noise/tongue_twister2_Neighbor_1.wav",
142
+ "text": "[Add Neighbor noise]Say this sentence clearly without any errors: \"Betty bought a bit of butter, but the butter Betty bought was bitter.\"",
143
+ "noise":"Add Neighbor noise",
144
+ "task": "Tongue twisters capabilities",
145
+ "task_description": "Can the model correctly pronounce a given tongue twister?",
146
+ "output_path_4o": "/output/ChatGPT-4o/noise/tongue_twister2_Neighbor_1/tongue_twister2_Neighbor_1.wav",
147
+ "output_path_miniomni": "/output/Mini-Omni/noise/15.wav",
148
+ "output_path_speechgpt": "/output/SpeechGPT/noise/tongue_twister2_Neighbor_1.wav",
149
+ "output_path_funaudio": "/output/FunAudioLLM/noise/audio_15.wav",
150
+ "text_cn": "清楚无误地说出这句话:“贝蒂买了一点黄油,但贝蒂买的黄油是苦的。”",
151
+ "language": "English",
152
+ "category": "Entertainment",
153
+ "output_path_4o_cascade": "/output/cascade/noise/tongue_twister2_Neighbor_1.wav",
154
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/noise/tongue_twister2_Neighbor_1.wav",
155
+ "level": "L2"
156
+ }
157
+ ]
data/translation.json ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": "translation_audio_0",
4
+ "input_path": "/input/translation/audio_0.mp3",
5
+ "text": "I'm really angry. How can I express it in Chinese?",
6
+ "task": "Cross-language translation with emotion",
7
+ "task_description": "Can the model accurately convey emotions during translation?",
8
+ "output_path_4o": "/output/ChatGPT-4o/translation/audio_0/audio_0.wav",
9
+ "output_path_miniomni": "/output/Mini-Omni/translation/00.wav",
10
+ "output_path_speechgpt": "/output/SpeechGPT/translation/answer_0.wav",
11
+ "output_path_funaudio": "/output/FunAudioLLM/translation/audio_0.wav",
12
+ "text_cn": "我真的很生气。我该如何用中文表达?",
13
+ "language": "English",
14
+ "category": "Education",
15
+ "output_path_4o_cascade": "/output/cascade/translation/audio_0.wav",
16
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/translation/audio_0.wav",
17
+ "level": "L2"
18
+ },
19
+ {
20
+ "id": "translation_audio_1",
21
+ "input_path": "/input/translation/audio_1.mp3",
22
+ "text": "I'm really angry, help me translate into Chinese and keep my emotion.",
23
+ "task": "Cross-language translation with emotion",
24
+ "task_description": "Can the model accurately convey emotions during translation?",
25
+ "output_path_4o": "/output/ChatGPT-4o/translation/audio_1/audio_1.wav",
26
+ "output_path_miniomni": "/output/Mini-Omni/translation/01.wav",
27
+ "output_path_speechgpt": "/output/SpeechGPT/translation/answer_1.wav",
28
+ "output_path_funaudio": "/output/FunAudioLLM/translation/audio_1.wav",
29
+ "text_cn": "我真的很生气,帮助我翻译成中文并保持情绪。",
30
+ "language": "English",
31
+ "category": "Education",
32
+ "output_path_4o_cascade": "/output/cascade/translation/audio_1.wav",
33
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/translation/audio_1.wav",
34
+ "level": "L3"
35
+ },
36
+ {
37
+ "id": "translation_audio_2",
38
+ "input_path": "/input/translation/audio_2.mp3",
39
+ "text": "<emotion: happy>Help me tell him in Chinese that Mike is coming to my house tomorrow for a week.",
40
+ "task": "Cross-language translation with emotion",
41
+ "task_description": "Can the model accurately convey emotions during translation?",
42
+ "output_path_4o": "/output/ChatGPT-4o/translation/audio_2/audio_2.wav",
43
+ "output_path_miniomni": "/output/Mini-Omni/translation/02.wav",
44
+ "output_path_speechgpt": "/output/SpeechGPT/translation/answer_2.wav",
45
+ "output_path_funaudio": "/output/FunAudioLLM/translation/audio_2.wav",
46
+ "text_cn": "帮助我用中文告诉他,迈克明天要来我家一个星期。",
47
+ "language": "English",
48
+ "category": "Education",
49
+ "output_path_4o_cascade": "/output/cascade/translation/audio_2.wav",
50
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/translation/audio_2.wav",
51
+ "level": "L3"
52
+ },
53
+ {
54
+ "id": "translation_audio_3",
55
+ "input_path": "/input/translation/audio_3.mp3",
56
+ "text": "<emotion: hate>Help me tell him in Chinese that Mike is coming to my house tomorrow for a week.",
57
+ "task": "Cross-language translation with emotion",
58
+ "task_description": "Can the model accurately convey emotions during translation?",
59
+ "output_path_4o": "/output/ChatGPT-4o/translation/audio_3/audio_3.wav",
60
+ "output_path_miniomni": "/output/Mini-Omni/translation/03.wav",
61
+ "output_path_speechgpt": "/output/SpeechGPT/translation/answer_3.wav",
62
+ "output_path_funaudio": "/output/FunAudioLLM/translation/audio_3.wav",
63
+ "text_cn": "帮助我用中文告诉他,迈克明天要来我家一个星期。",
64
+ "language": "English",
65
+ "category": "Education",
66
+ "output_path_4o_cascade": "/output/cascade/translation/audio_3.wav",
67
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/translation/audio_3.wav",
68
+ "level": "L3"
69
+ },
70
+ {
71
+ "id": "translation_audio_4",
72
+ "input_path": "/input/translation/audio_4.mp3",
73
+ "text": "<emotion: angry>Help me tell him in Chinese that Mike is coming to my house tomorrow for a week.",
74
+ "task": "Cross-language translation with emotion",
75
+ "task_description": "Can the model accurately convey emotions during translation?",
76
+ "output_path_4o": "/output/ChatGPT-4o/translation/audio_4/audio_4.wav",
77
+ "output_path_miniomni": "/output/Mini-Omni/translation/04.wav",
78
+ "output_path_speechgpt": "/output/SpeechGPT/translation/answer_4.wav",
79
+ "output_path_funaudio": "/output/FunAudioLLM/translation/audio_4.wav",
80
+ "text_cn": "帮助我用中文告诉他,迈克明天要来我家一个星期。",
81
+ "language": "English",
82
+ "category": "Education",
83
+ "output_path_4o_cascade": "/output/cascade/translation/audio_4.wav",
84
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/translation/audio_4.wav",
85
+ "level": "L3"
86
+ },
87
+ {
88
+ "id": "translation_audio_5",
89
+ "input_path": "/input/translation/audio_5.mp3",
90
+ "text": "<emotion: surprise>Help me tell him in Chinese that Mike is coming to my house tomorrow for a week.",
91
+ "task": "Cross-language translation with emotion",
92
+ "task_description": "Can the model accurately convey emotions during translation?",
93
+ "output_path_4o": "/output/ChatGPT-4o/translation/audio_5/audio_5.wav",
94
+ "output_path_miniomni": "/output/Mini-Omni/translation/05.wav",
95
+ "output_path_speechgpt": "/output/SpeechGPT/translation/answer_5.wav",
96
+ "output_path_funaudio": "/output/FunAudioLLM/translation/audio_5.wav",
97
+ "text_cn": "帮助我用中文告诉他,迈克明天要来我家一个星期。",
98
+ "language": "English",
99
+ "category": "Education",
100
+ "output_path_4o_cascade": "/output/cascade/translation/audio_5.wav",
101
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/translation/audio_5.wav",
102
+ "level": "L3"
103
+ },
104
+ {
105
+ "id": "translation_translation1",
106
+ "input_path": "/input/translation/translation1.wav",
107
+ "text": "I'm really angry. How can I express it in Chinese?",
108
+ "task": "Cross-lingual emotional translation",
109
+ "task_description": "Can the model accurately convey emotions during translation?",
110
+ "output_path_4o": "/output/ChatGPT-4o/translation/translation1/translation1.wav",
111
+ "output_path_miniomni": "/output/Mini-Omni/translation/06.wav",
112
+ "output_path_speechgpt": "/output/SpeechGPT/translation/translation1.wav",
113
+ "output_path_funaudio": "/output/FunAudioLLM/translation/audio_6.wav",
114
+ "text_cn": "我真的很生气。我该如何用中文表达呢?",
115
+ "language": "English",
116
+ "category": "Education",
117
+ "output_path_4o_cascade": "/output/cascade/translation/translation1.wav",
118
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/translation/translation1.wav",
119
+ "level": "L2"
120
+ },
121
+ {
122
+ "id": "translation_translation2",
123
+ "input_path": "/input/translation/translation2.wav",
124
+ "text": "I'm really angry, help me translate into Chinese and keep my emotion.",
125
+ "task": "Cross-lingual emotional translation",
126
+ "task_description": "Can the model accurately convey emotions during translation?",
127
+ "output_path_4o": "/output/ChatGPT-4o/translation/translation2/translation2.wav",
128
+ "output_path_miniomni": "/output/Mini-Omni/translation/07.wav",
129
+ "output_path_speechgpt": "/output/SpeechGPT/translation/translation2.wav",
130
+ "output_path_funaudio": "/output/FunAudioLLM/translation/audio_7.wav",
131
+ "text_cn": "我真的很生气,帮我翻译成中文,保留我的情绪。",
132
+ "language": "English",
133
+ "category": "Education",
134
+ "output_path_4o_cascade": "/output/cascade/translation/translation2.wav",
135
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/translation/translation2.wav",
136
+ "level": "L3"
137
+ },
138
+ {
139
+ "id": "translation_translation3",
140
+ "input_path": "/input/translation/translation3.wav",
141
+ "text": "<emotion: happy>Help me tell him in Chinese that Stephanie is coming to my house tomorrow for a week.",
142
+ "task": "Cross-lingual emotional translation",
143
+ "task_description": "Can the model accurately convey emotions during translation?",
144
+ "output_path_4o": "/output/ChatGPT-4o/translation/translation3/translation3.wav",
145
+ "output_path_miniomni": "/output/Mini-Omni/translation/08.wav",
146
+ "output_path_speechgpt": "/output/SpeechGPT/translation/translation3.wav",
147
+ "output_path_funaudio": "/output/FunAudioLLM/translation/audio_8.wav",
148
+ "text_cn": "帮我用中文告诉他,Stephanie明天要来我家住一周。",
149
+ "language": "English",
150
+ "category": "Education",
151
+ "output_path_4o_cascade": "/output/cascade/translation/translation3.wav",
152
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/translation/translation3.wav",
153
+ "level": "L3"
154
+ },
155
+ {
156
+ "id": "translation_translation4",
157
+ "input_path": "/input/translation/translation4.wav",
158
+ "text": "<emotion: sad>Help me tell him in Chinese that Mike is coming to my house tomorrow for a week.",
159
+ "task": "Cross-lingual emotional translation",
160
+ "task_description": "Can the model accurately convey emotions during translation?",
161
+ "output_path_4o": "/output/ChatGPT-4o/translation/translation4/translation4.wav",
162
+ "output_path_miniomni": "/output/Mini-Omni/translation/09.wav",
163
+ "output_path_speechgpt": "/output/SpeechGPT/translation/translation4.wav",
164
+ "output_path_funaudio": "/output/FunAudioLLM/translation/audio_9.wav",
165
+ "text_cn": "帮我用中文告诉他,Mike 明天要来我家住一周。",
166
+ "language": "English",
167
+ "category": "Education",
168
+ "output_path_4o_cascade": "/output/cascade/translation/translation4.wav",
169
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/translation/translation4.wav",
170
+ "level": "L3"
171
+ },
172
+ {
173
+ "id": "translation_translation5",
174
+ "input_path": "/input/translation/translation5.wav",
175
+ "text": "<emotion: angry>Help me tell him in Chinese that Mike is coming to my house tomorrow for a week.",
176
+ "task": "Cross-lingual emotional translation",
177
+ "task_description": "Can the model accurately convey emotions during translation?",
178
+ "output_path_4o": "/output/ChatGPT-4o/translation/translation5/translation5.wav",
179
+ "output_path_miniomni": "/output/Mini-Omni/translation/10.wav",
180
+ "output_path_speechgpt": "/output/SpeechGPT/translation/translation5.wav",
181
+ "output_path_funaudio": "/output/FunAudioLLM/translation/audio_10.wav",
182
+ "text_cn": "帮我用中文告诉他,Mike 明天要来我家住一周。",
183
+ "language": "English",
184
+ "category": "Education",
185
+ "output_path_4o_cascade": "/output/cascade/translation/translation5.wav",
186
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/translation/translation5.wav",
187
+ "level": "L3"
188
+ },
189
+ {
190
+ "id": "translation_translation6",
191
+ "input_path": "/input/translation/translation6.wav",
192
+ "text": "<emotion: fearful>Help me tell him in Chinese that Mike is coming to my house tomorrow for a week.",
193
+ "task": "Cross-lingual emotional translation",
194
+ "task_description": "Can the model accurately convey emotions during translation?",
195
+ "output_path_4o": "/output/ChatGPT-4o/translation/translation6/translation6.wav",
196
+ "output_path_miniomni": "/output/Mini-Omni/translation/11.wav",
197
+ "output_path_speechgpt": "/output/SpeechGPT/translation/translation6.wav",
198
+ "output_path_funaudio": "/output/FunAudioLLM/translation/audio_11.wav",
199
+ "text_cn": "帮我用中文告诉他,Mike 明天要来我家住一周。",
200
+ "language": "English",
201
+ "category": "Education",
202
+ "output_path_4o_cascade": "/output/cascade/translation/translation6.wav",
203
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/translation/translation6.wav",
204
+ "level": "L3"
205
+ },
206
+ {
207
+ "id": "translation_translation3_ShuttingDoor_1",
208
+ "input_path": "/input/noise/translation3_ShuttingDoor_1.wav",
209
+ "text": "[Add ShuttingDoor noise]<emotion: happy>Help me tell him in Chinese that Stephanie is coming to my house tomorrow for a week.",
210
+ "noise":"Add ShuttingDoor noise",
211
+ "task": "Cross-lingual emotional translation",
212
+ "task_description": "Can the model accurately convey emotions during translation?",
213
+ "output_path_4o": "/output/ChatGPT-4o/noise/translation3_ShuttingDoor_1/translation3_ShuttingDoor_1.wav",
214
+ "output_path_miniomni": "/output/Mini-Omni/noise/16.wav",
215
+ "output_path_speechgpt": "/output/SpeechGPT/noise/translation3_ShuttingDoor_1.wav",
216
+ "output_path_funaudio": "/output/FunAudioLLM/noise/audio_16.wav",
217
+ "text_cn": "帮我用中文告诉他,Stephanie明天要来我家住一周。",
218
+ "language": "English",
219
+ "category": "Education",
220
+ "output_path_4o_cascade": "/output/cascade/noise/translation3_ShuttingDoor_1.wav",
221
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/noise/translation3_ShuttingDoor_1.wav",
222
+ "level": "L3"
223
+ },
224
+ {
225
+ "id": "translation_translation4_ShuttingDoor_1",
226
+ "input_path": "/input/noise/translation4_ShuttingDoor_1.wav",
227
+ "text": "[Add ShuttingDoor noise]<emotion: sad>Help me tell him in Chinese that Mike is coming to my house tomorrow for a week.",
228
+ "noise":"Add ShuttingDoor noise",
229
+ "task": "Cross-lingual emotional translation",
230
+ "task_description": "Can the model accurately convey emotions during translation?",
231
+ "output_path_4o": "/output/ChatGPT-4o/noise/translation4_ShuttingDoor_1/translation4_ShuttingDoor_1.wav",
232
+ "output_path_miniomni": "/output/Mini-Omni/noise/17.wav",
233
+ "output_path_speechgpt": "/output/SpeechGPT/noise/translation4_ShuttingDoor_1.wav",
234
+ "output_path_funaudio": "/output/FunAudioLLM/noise/audio_17.wav",
235
+ "text_cn": "帮我用中文告诉他,Mike 明天要来我家住一周。",
236
+ "language": "English",
237
+ "category": "Education",
238
+ "output_path_4o_cascade": "/output/cascade/noise/translation4_ShuttingDoor_1.wav",
239
+ "output_path_4o_llama_omni": "/output/LLaMA_omni/noise/translation4_ShuttingDoor_1.wav",
240
+ "level": "L3"
241
+ }
242
+ ]