implicatures and tricky questions
app.py CHANGED
@@ -7,6 +7,7 @@ import plotly.express as px
 from st_social_media_links import SocialMediaIcons
 
 
+PARAMS_COLUMN_NAME = "Params"
 RESULTS_COLUMN_NAME = "Results"
 AVERAGE_COLUMN_NAME = "Average"
 SENTIMENT_COLUMN_NAME = "Sentiment"
@@ -16,76 +17,60 @@ TRICKY_QUESTIONS_COLUMN_NAME = "Tricky questions"
 IMPLICATURES_AVERAGE_COLUMN_NAME = "Implicatures average"
 
 # Function to load data from JSON file
+
+
 @st.cache_data
 def load_data(file_path):
     with open(file_path, 'r', encoding='utf-8') as file:
         data = json.load(file)
+    df = pd.DataFrame(data)
+    df[AVERAGE_COLUMN_NAME] = df[['Sentiment',
+                                  'Language understanding', 'Phraseology', 'Tricky questions']].mean(axis=1)
+
+    df[IMPLICATURES_AVERAGE_COLUMN_NAME] = df[['Sentiment',
+                                               'Language understanding', 'Phraseology']].mean(axis=1)
+    return df
 
 # Function to style the DataFrame
+
+
 @st.cache_data
 def style_dataframe(df: pd.DataFrame):
-    df[IMPLICATURES_AVERAGE_COLUMN_NAME] = df.apply(
-        lambda row: (row[SENTIMENT_COLUMN_NAME] + row[UNDERSTANDING_COLUMN_NAME] + row[PHRASEOLOGY_COLUMN_NAME]) / 3,
-        axis=1
-    )
-
-    # Calculate Average from all four columns
-    df[AVERAGE_COLUMN_NAME] = df.apply(
-        lambda row: (row[SENTIMENT_COLUMN_NAME] + row[UNDERSTANDING_COLUMN_NAME] +
-                     row[PHRASEOLOGY_COLUMN_NAME] + row[TRICKY_QUESTIONS_COLUMN_NAME]) / 4,
-        axis=1
-    )
-
-    df[RESULTS_COLUMN_NAME] = df.apply(
-        lambda row: [row[SENTIMENT_COLUMN_NAME], row[UNDERSTANDING_COLUMN_NAME],
-                     row[PHRASEOLOGY_COLUMN_NAME], row[TRICKY_QUESTIONS_COLUMN_NAME]],
-        axis=1
-    )
-
-    # Insert the new column after the 'Average' column
+    df[RESULTS_COLUMN_NAME] = df.apply(lambda row: [
+        row[SENTIMENT_COLUMN_NAME], row[UNDERSTANDING_COLUMN_NAME], row[PHRASEOLOGY_COLUMN_NAME], row[TRICKY_QUESTIONS_COLUMN_NAME]], axis=1)
     cols = list(df.columns)
-    cols.insert(
+
+    # move average column
+    cols.insert(cols.index(PARAMS_COLUMN_NAME) + 1,
+                cols.pop(cols.index(AVERAGE_COLUMN_NAME)))
+
+    # move implicatures average column
+    cols.insert(cols.index(AVERAGE_COLUMN_NAME) + 1,
+                cols.pop(cols.index(IMPLICATURES_AVERAGE_COLUMN_NAME)))
+
+    # move results column
+    cols.insert(cols.index(IMPLICATURES_AVERAGE_COLUMN_NAME) + 1,
+                cols.pop(cols.index(RESULTS_COLUMN_NAME)))
+    # Insert the new column after the 'Average' column
     df = df[cols]
+    # Create a color ramp using Seaborn
     return df
 
+
 def styler(df: pd.DataFrame):
     palette = sns.color_palette("RdYlGn", as_cmap=True)
     # Apply reverse color gradient to the "Params" column
-    params_palette = sns.color_palette(
-    ).set_properties(
-        **{'text-align': 'center'},
-        subset=[AVERAGE_COLUMN_NAME, IMPLICATURES_AVERAGE_COLUMN_NAME, SENTIMENT_COLUMN_NAME,
-                PHRASEOLOGY_COLUMN_NAME, UNDERSTANDING_COLUMN_NAME, TRICKY_QUESTIONS_COLUMN_NAME]
-    ).format(
-        "{:.2f}".center(10),
-        subset=[AVERAGE_COLUMN_NAME, IMPLICATURES_AVERAGE_COLUMN_NAME, SENTIMENT_COLUMN_NAME,
-                PHRASEOLOGY_COLUMN_NAME, UNDERSTANDING_COLUMN_NAME, TRICKY_QUESTIONS_COLUMN_NAME]
-    ).format(
-        "{:.1f}".center(10), subset=["Params"]
-    )
+    params_palette = sns.color_palette(
+        "RdYlGn_r", as_cmap=True)  # Reversed RdYlGn palette
+    styled_df = df.style.background_gradient(cmap=palette, subset=[AVERAGE_COLUMN_NAME, SENTIMENT_COLUMN_NAME, PHRASEOLOGY_COLUMN_NAME, UNDERSTANDING_COLUMN_NAME, TRICKY_QUESTIONS_COLUMN_NAME, IMPLICATURES_AVERAGE_COLUMN_NAME]
+                                             ).background_gradient(cmap=params_palette, subset=["Params"]
+                                             ).set_properties(**{'text-align': 'center'}, subset=[AVERAGE_COLUMN_NAME, SENTIMENT_COLUMN_NAME, PHRASEOLOGY_COLUMN_NAME, UNDERSTANDING_COLUMN_NAME, TRICKY_QUESTIONS_COLUMN_NAME, IMPLICATURES_AVERAGE_COLUMN_NAME]
+                                             ).format("{:.2f}".center(10), subset=[AVERAGE_COLUMN_NAME, SENTIMENT_COLUMN_NAME, PHRASEOLOGY_COLUMN_NAME, UNDERSTANDING_COLUMN_NAME, TRICKY_QUESTIONS_COLUMN_NAME, IMPLICATURES_AVERAGE_COLUMN_NAME]
+                                             ).format("{:.1f}".center(10), subset=["Params"])
     return styled_df
 
 
+# Streamlit app
 st.set_page_config(layout="wide")
 
 st.markdown("""
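Editor's note (not part of the commit): the rewritten load_data above replaces the old per-row apply/lambda arithmetic with column-wise means, which is also why the precomputed "Average" field disappears from data.json later in this commit. A minimal sketch of the same computation on a single made-up row; the scores below are illustrative only:

import pandas as pd

row = pd.DataFrame([{
    "Sentiment": 4.0,
    "Language understanding": 3.5,
    "Phraseology": 3.0,
    "Tricky questions": 3.9,
}])

# Average over all four task scores, as in the new load_data
average = row[["Sentiment", "Language understanding",
               "Phraseology", "Tricky questions"]].mean(axis=1)
print(float(average.iloc[0]))       # (4.0 + 3.5 + 3.0 + 3.9) / 4 = 3.6

# "Implicatures average" deliberately leaves out "Tricky questions"
implicatures = row[["Sentiment", "Language understanding",
                    "Phraseology"]].mean(axis=1)
print(float(implicatures.iloc[0]))  # (4.0 + 3.5 + 3.0) / 3 = 3.5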
@@ -100,7 +85,7 @@ st.markdown("""
 </style>
 """, unsafe_allow_html=True)
 
-
+# Prepare layout
 
 st.markdown("""
 <style>
@@ -113,12 +98,10 @@ st.markdown("""
 .center-text {
     text-align: center;
 }
-
 a:link {color:#FDA428;}    /* unvisited link */
 a:hover {color:#FDA428;}   /* Mouse over link */
 a:visited {color:#FDA428;} /* visited link */
 a:active {color:#FDA428;}  /* selected link */
-
 </style>
 """, unsafe_allow_html=True)
 
@@ -131,7 +114,7 @@ st.markdown("""
 # ----------------------------------------------------------
 st.markdown("""<br>""", unsafe_allow_html=True)
 
-
+# Row: 1 --> Title + links to SpeakLeash.org website / GitHub / X (Twitter)
 social_media_links = [
     "https://discord.com/invite/ZJwCMrxwT7",
     "https://github.com/speakleash",
@@ -154,7 +137,8 @@ social_media_links_colors = [
     links_color
 ]
 
-social_media_icons = SocialMediaIcons(
+social_media_icons = SocialMediaIcons(
+    social_media_links, social_media_links_colors)
 social_media_icons.render(justify_content='right')
 
 st.markdown("""
@@ -184,46 +168,86 @@ with tab1:
 
     # Prepare data
     data = load_data('data.json')
-
-    data['Params'] =
+
+    data['Params'] = pd.to_numeric(
+        data['Params'].str.replace('B', ''),
+        errors='coerce'
+    )
     data = data.sort_values(by=AVERAGE_COLUMN_NAME, ascending=False)
 
     # Closing filters in an expander
     with st.expander("Filtering benchmark data", icon='🔍'):
         # Filtering data, e.g. slider for params, average score, etc.
-        col_filter_params, col_filter_average, col_filter_sentiment, col_filter_understanding, col_filter_phraseology,
+        col_filter_params, col_filter_average, col_filter_implicatures_average, col_filter_sentiment, col_filter_understanding, col_filter_phraseology, col_filter_tricky_questions = st.columns(
+            7, gap='medium')
+
        with col_filter_params:
+            max_params = data['Params'].max(skipna=True)
+            if pd.isna(max_params):
+                max_params = 0.0
+
+            params_slider = st.slider(
+                "Models Size [B]",
+                min_value=0.0,
+                max_value=float(max_params),
+                value=(0.0, float(max_params)),
+                step=0.1,
+                format="%.1f"
+            )
+            data = data[
+                data['Params'].isna() |
+                (
+                    (data['Params'] >= params_slider[0]) &
+                    (data['Params'] <= params_slider[1])
+                )
+            ]
 
         with col_filter_average:
-            average_slider = st.slider(
+            average_slider = st.slider(
+                "Average score", step=0.1, min_value=0.0, max_value=5.0, value=(0.0, 5.0))
+            data = data[(data[AVERAGE_COLUMN_NAME] >= average_slider[0]) & (
+                data[AVERAGE_COLUMN_NAME] <= average_slider[1])]
+
+        with col_filter_implicatures_average:
+            implicatures_average_slider = st.slider(
+                "Implicatures average", step=0.1, min_value=0.0, max_value=5.0, value=(0.0, 5.0))
+            data = data[(data[IMPLICATURES_AVERAGE_COLUMN_NAME] >= implicatures_average_slider[0]) & (
+                data[IMPLICATURES_AVERAGE_COLUMN_NAME] <= implicatures_average_slider[1])]
 
         with col_filter_sentiment:
-            sentiment_slider = st.slider(
+            sentiment_slider = st.slider(
+                "Sentiment score", step=0.1, min_value=0.0, max_value=5.0, value=(0.0, 5.0))
+            data = data[(data[SENTIMENT_COLUMN_NAME] >= sentiment_slider[0]) & (
+                data[SENTIMENT_COLUMN_NAME] <= sentiment_slider[1])]
 
         with col_filter_understanding:
-            understanding_slider = st.slider(
+            understanding_slider = st.slider(
+                "Understanding score", step=0.1, min_value=0.0, max_value=5.0, value=(0.0, 5.0))
+            data = data[(data[UNDERSTANDING_COLUMN_NAME] >= understanding_slider[0]) & (
+                data[UNDERSTANDING_COLUMN_NAME] <= understanding_slider[1])]
 
         with col_filter_phraseology:
-            phraseology_slider = st.slider(
+            phraseology_slider = st.slider(
+                "Phraseology score", step=0.1, min_value=0.0, max_value=5.0, value=(0.0, 5.0))
+            data = data[(data[PHRASEOLOGY_COLUMN_NAME] >= phraseology_slider[0]) & (
+                data[PHRASEOLOGY_COLUMN_NAME] <= phraseology_slider[1])]
+
+        with col_filter_tricky_questions:
+            tricky_questions_slider = st.slider(
+                "Tricky questions score", step=0.1, min_value=0.0, max_value=5.0, value=(0.0, 5.0))
+            data = data[(data[TRICKY_QUESTIONS_COLUMN_NAME] >= tricky_questions_slider[0]) & (
+                data[TRICKY_QUESTIONS_COLUMN_NAME] <= tricky_questions_slider[1])]
 
         # Extract unique provider names from the "Model" column
-        providers = data["Model"].apply(
+        providers = data["Model"].apply(
+            lambda x: x.split('/')[0].lower()).unique()
+        selected_providers = st.multiselect(
+            "Model providers", providers, default=providers)
         # Filter data based on selected providers
-        data = data[data["Model"].apply(lambda x: x.split(
+        data = data[data["Model"].apply(lambda x: x.split(
+            '/')[0].lower()).isin(selected_providers)]
 
+    # Display data
     styled_df_show = style_dataframe(data)
     styled_df_show = styler(styled_df_show)
 
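Editor's note (not part of the commit): the reworked Params handling parses sizes such as "70.6B" into floats, and the isna() branch keeps rows whose size cannot be parsed. That matters because the API-hosted entries added to data.json in this commit carry an empty "Params" string, which errors='coerce' turns into NaN. A minimal sketch of that NaN-preserving range filter, with hypothetical model names:

import pandas as pd

df = pd.DataFrame({"Model": ["org/tiny-model", "org/huge-model", "api/unknown-size"],
                   "Params": ["1.2B", "70.6B", ""]})

# "70.6B" -> 70.6; an empty or malformed string becomes NaN instead of raising
df["Params"] = pd.to_numeric(df["Params"].str.replace("B", ""), errors="coerce")

lo, hi = 0.0, 12.0  # hypothetical slider range
mask = df["Params"].isna() | ((df["Params"] >= lo) & (df["Params"] <= hi))
print(df[mask]["Model"].tolist())  # ['org/tiny-model', 'api/unknown-size']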
@@ -233,23 +257,26 @@ with tab1:
         AVERAGE_COLUMN_NAME: st.column_config.NumberColumn(AVERAGE_COLUMN_NAME),
         RESULTS_COLUMN_NAME: st.column_config.BarChartColumn(
             "Bar chart of results", help="Summary of the results of each task",
-            y_min=0,y_max=5,),
+            y_min=0, y_max=5,),
         SENTIMENT_COLUMN_NAME: st.column_config.NumberColumn(SENTIMENT_COLUMN_NAME, help='Ability to analyze sentiment'),
         UNDERSTANDING_COLUMN_NAME: st.column_config.NumberColumn(UNDERSTANDING_COLUMN_NAME, help='Ability to understand language'),
         PHRASEOLOGY_COLUMN_NAME: st.column_config.NumberColumn(PHRASEOLOGY_COLUMN_NAME, help='Ability to understand phraseological compounds'),
         TRICKY_QUESTIONS_COLUMN_NAME: st.column_config.NumberColumn(TRICKY_QUESTIONS_COLUMN_NAME, help='Ability to understand tricky questions'),
-    }, hide_index=True, disabled=True, height=500)
+    }, hide_index=True, disabled=True, height=500)
 
     # Add selection for models and create a bar chart for selected models using the AVERAGE_COLUMN_NAME, SENTIMENT_COLUMN_NAME, PHRASEOLOGY_COLUMN_NAME, UNDERSTANDING_COLUMN_NAME
     # Add default selection of 3 best models from AVERAGE_COLUMN_NAME and 1 best model with "Bielik" in Model column
-    default_models = list(data.sort_values(
+    default_models = list(data.sort_values(
+        AVERAGE_COLUMN_NAME, ascending=False)['Model'].head(3))
+    bielik_model = data[data['Model'].str.contains('Bielik')].sort_values(
+        AVERAGE_COLUMN_NAME, ascending=False)['Model'].iloc[0]
     if bielik_model not in default_models:
         default_models.append(bielik_model)
-    selected_models = st.multiselect(
+    selected_models = st.multiselect(
+        "Select models to compare", data["Model"].unique(), default=default_models)
     selected_data = data[data["Model"].isin(selected_models)]
-    categories = [AVERAGE_COLUMN_NAME,
+    categories = [AVERAGE_COLUMN_NAME, SENTIMENT_COLUMN_NAME,
+                  PHRASEOLOGY_COLUMN_NAME, UNDERSTANDING_COLUMN_NAME, TRICKY_QUESTIONS_COLUMN_NAME]
 
     if selected_models:
         # Colors to choose from:
@@ -258,7 +285,8 @@ with tab1:
 
         fig_bars = go.Figure()
         for model, color in zip(selected_models, colors):
-            values = selected_data[selected_data['Model'] ==
+            values = selected_data[selected_data['Model'] ==
+                                   model][categories].values.flatten().tolist()
             fig_bars.add_trace(go.Bar(
                 x=categories,
                 y=values,
@@ -269,7 +297,8 @@ with tab1:
         # Update layout to use a custom color scale
         fig_bars.update_layout(
             showlegend=True,
-            legend=dict(orientation="h", yanchor="top",
+            legend=dict(orientation="h", yanchor="top",
+                        y=-0.3, xanchor="center", x=0.5),
             title="Comparison of Selected Models",
             yaxis_title="Score",
             template="plotly_dark"
@@ -278,29 +307,25 @@ with tab1:
         st.plotly_chart(fig_bars)
 
 
-
+# Tab 2 --> Description
 with tab2:
     st.markdown("""
 ### <span style='text-decoration: #FDA428 wavy underline;'>**Cause of Creation**</span>
 1. **Need**: Models face significant challenges when dealing with understanding complex, context-reliant texts that involve meanings implied beyond the literal content of a statement. Such cases include sarcasm, implicatures, and phraseological compounds.
-
 Traditional sentiment classifiers typically rely on word-based features (e.g., identifying positive or negative words) to assess sentiment. However, with sarcasm, the literal meaning of words often contradicts the intended sentiment, making it difficult for models to accurately gauge tone. Sarcasm's context-dependence further complicates matters, as these classifiers typically lack the ability to grasp nuanced cues in context, especially when sarcasm is subtle.
 Similarly, classifiers struggle with implicatures, where the underlying intent is implied rather than explicitly stated. Here, models fail to capture the full sentiment because they rely heavily on surface-level words, missing the non-literal meaning that often drives the sentiment.
 Phraseological compounds add another layer of difficulty. These are fixed or semi-fixed expressions whose meanings cannot be directly inferred from the individual words. Language models, trained on word-level patterns, often misinterpret these expressions because they fail to recognize the idiomatic or non-literal meaning, leading to inaccurate sentiment analysis.
 In addition to sentiment analysis, we decided to include the understanding of more complex texts in the benchmark, which was measured by the ability to uncover the intended meaning.
-
 ### <span style='text-decoration: #FDA428 wavy underline;'>**Dataset Information**</span>
 The dataset contains 200 examples, all written in Polish. Each example consists of the following:
 - **Main Text**: This is a statement (often an opinion) on any topic that includes a certain type of implicature, often several simultaneously, such as sarcasm or phraseological compounds.
 - **Reference Sentiment**: The sentiment associated with the main text. We use three categories: negative, neutral, and positive. Ambiguous examples were labeled as "neutral" to exclude them from sentiment classification testing.
 - **Reference phraseological compounds**: A list of phraseological compounds found in the main text.
 - **Reference Explanation**: An explanation of the underlying intentions that the author of the main text might have had.
-
 ### <span style='text-decoration: #FDA428 wavy underline;'>**Evaluation Procedure**</span>
 We distinguish between two models in the evaluation process:
 - **Evaluated Model**: The model that performs specific tasks, is then assessed based on its performance, and added to a ranking.
 - **Judge Metamodel**: One of the currently strongest, most versatile LLMs.
-
 ### <span style='text-decoration: #FDA428 wavy underline;'>**GENERATING RESPONSES FROM THE EVALUATED MODEL**</span>
 1. For each text in the dataset, the evaluated model was required to list the following in three points:
     - The sentiment (only positive/negative).
@@ -316,7 +341,6 @@ We distinguish between two models in the evaluation process:
 - **Assistant Prompt**: A human-written example answer for the second example text.
 - **User Prompt**: The target text, based on which the evaluated model will be assessed.
 3. The decision to split the examples into user prompts and assistant prompts was made due to the better results achieved by the vast majority of models. The two examples were selected based on diversity: one has a negative sentiment and several phraseological compounds, while the other is positive and lacks phraseological compounds.
-
 ### <span style='text-decoration: #FDA428 wavy underline;'>**GENERATING METAMODEL EVALUATIONS**</span>
 1. The purpose of the metamodel is to return the following evaluations:
 - **Understanding of the Text**: A comparison of the evaluated model's response description to the reference explanation.
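Editor's illustration (not part of the commit): the prompting scheme described above, a task description plus two human-written examples split into user and assistant turns, followed by the target text, maps onto a chat-style message list along these lines; every string below is a placeholder:

messages = [
    {"role": "system", "content": "Task description, e.g. list sentiment, phraseological compounds and intended meaning."},
    {"role": "user", "content": "Example text 1 (negative sentiment, several phraseological compounds)."},
    {"role": "assistant", "content": "Human-written example answer for example text 1."},
    {"role": "user", "content": "Example text 2 (positive sentiment, no phraseological compounds)."},
    {"role": "assistant", "content": "Human-written example answer for example text 2."},
    {"role": "user", "content": "Target text on which the evaluated model is assessed."},
]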
@@ -356,9 +380,10 @@ st.markdown("""
 - [Remigiusz Kinas](https://www.linkedin.com/in/remigiusz-kinas/) - methodological support
 - [Krzysztof Wróbel](https://www.linkedin.com/in/wrobelkrzysztof/) - engineering, methodological support
 - [Szymon Baczyński](https://www.linkedin.com/in/szymon-baczynski/) - front-end / streamlit assistant
+- [Artur Słomowski](https://www.linkedin.com/in/arturslomowski/) - front-end / streamlit assistant
 - [Maria Filipkowska](https://www.linkedin.com/in/maria-filipkowska/) - writing text, linguistic support
 """)
 
 st.divider()
 
-# Run the app with `streamlit run your_script.py`
+# Run the app with `streamlit run your_script.py`
data.json CHANGED
@@ -2,523 +2,497 @@
   {
     "Model": "mistralai/Mistral-Large-Instruct-2407",
     "Params": "123B",
-    "Average": 4.03025641025641,
     "Sentiment": 4.230769230769231,
     "Language understanding": 4.0,
     "Phraseology": 3.86,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "alpindale/WizardLM-2-8x22B",
     "Params": "141B",
-    "Average": 3.9133760683760683,
     "Sentiment": 3.7051282051282053,
     "Language understanding": 3.815,
     "Phraseology": 4.22,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
     "Params": "70.6B",
-    "Average": 3.828974358974359,
     "Sentiment": 4.326923076923077,
     "Language understanding": 3.91,
     "Phraseology": 3.25,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "meta-llama/Meta-Llama-3-70B-Instruct",
     "Params": "70.6B",
-    "Average": 3.806538461538462,
     "Sentiment": 4.134615384615385,
     "Language understanding": 3.82,
     "Phraseology": 3.465,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "speakleash/Bielik-11B-v2.3-Instruct",
     "Params": "11.2B",
-    "Average": 3.7697863247863252,
     "Sentiment": 3.9743589743589745,
     "Language understanding": 3.785,
     "Phraseology": 3.55,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "mistralai/Mixtral-8x22B-Instruct-v0.1",
     "Params": "141B",
-    "Average": 3.6690170940170943,
     "Sentiment": 3.782051282051282,
     "Language understanding": 3.675,
     "Phraseology": 3.55,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "speakleash/Bielik-11B-v2.1-Instruct",
     "Params": "11.2B",
-    "Average": 3.6583760683760684,
     "Sentiment": 3.9551282051282053,
     "Language understanding": 3.915,
     "Phraseology": 3.105,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "Qwen/Qwen2-72B-Instruct",
     "Params": "72.7B",
-    "Average": 3.6442735042735044,
     "Sentiment": 3.7628205128205128,
     "Language understanding": 3.89,
     "Phraseology": 3.28,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "speakleash/Bielik-11B-v2.0-Instruct",
     "Params": "11.2B",
-    "Average": 3.614786324786325,
     "Sentiment": 3.9743589743589745,
     "Language understanding": 3.745,
     "Phraseology": 3.125,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "speakleash/Bielik-11B-v2.2-Instruct",
     "Params": "11.2B",
-    "Average": 3.565982905982906,
     "Sentiment": 3.717948717948718,
     "Language understanding": 3.73,
     "Phraseology": 3.25,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "Qwen/Qwen1.5-72B-Chat",
     "Params": "72.3B",
-    "Average": 3.3214529914529916,
     "Sentiment": 3.4743589743589745,
     "Language understanding": 3.515,
     "Phraseology": 2.975,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
     "Params": "8.03B",
-    "Average": 3.3114529914529918,
     "Sentiment": 3.9743589743589745,
     "Language understanding": 3.38,
     "Phraseology": 2.58,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "THUDM/glm-4-9b-chat",
     "Params": "9.4B",
-    "Average": 3.2749145299145295,
     "Sentiment": 3.58974358974359,
     "Language understanding": 3.455,
     "Phraseology": 2.78,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "mistralai/Mistral-Nemo-Instruct-2407",
     "Params": "12.2B",
-    "Average": 3.223675213675214,
     "Sentiment": 3.641025641025641,
     "Language understanding": 3.29,
     "Phraseology": 2.74,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "meta-llama/Meta-Llama-3-8B-Instruct",
     "Params": "8.03B",
-    "Average": 3.172777777777778,
     "Sentiment": 3.3333333333333335,
     "Language understanding": 3.15,
     "Phraseology": 3.035,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "upstage/SOLAR-10.7B-Instruct-v1.0",
     "Params": "10.7B",
-    "Average": 3.1343162393162394,
     "Sentiment": 2.967948717948718,
     "Language understanding": 3.18,
     "Phraseology": 3.255,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "speakleash/Bielik-7B-Instruct-v0.1",
     "Params": "7.24B",
-    "Average": 3.126581196581197,
     "Sentiment": 3.58974358974359,
     "Language understanding": 3.475,
     "Phraseology": 2.315,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "openchat/openchat-3.5-0106-gemma",
     "Params": "8.54B",
-    "Average": 3.08525641025641,
     "Sentiment": 3.730769230769231,
     "Language understanding": 3.08,
     "Phraseology": 2.445,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
     "Params": "46.7B",
-    "Average": 3.039230769230769,
     "Sentiment": 3.0576923076923075,
     "Language understanding": 3.175,
     "Phraseology": 2.885,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "mistralai/Mistral-7B-Instruct-v0.3",
     "Params": "7.25B",
-    "Average": 3.022307692307692,
     "Sentiment": 3.326923076923077,
     "Language understanding": 3.06,
     "Phraseology": 2.68,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "berkeley-nest/Starling-LM-7B-alpha",
     "Params": "7.24B",
-    "Average": 2.945897435897436,
     "Sentiment": 3.0576923076923075,
     "Language understanding": 2.925,
     "Phraseology": 2.855,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "openchat/openchat-3.5-0106",
     "Params": "7.24B",
-    "Average": 2.8500854700854696,
     "Sentiment": 3.16025641025641,
     "Language understanding": 2.835,
     "Phraseology": 2.555,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "internlm/internlm2-chat-20b",
     "Params": "19.9B",
-    "Average": 2.8237606837606837,
     "Sentiment": 3.301282051282051,
     "Language understanding": 2.785,
     "Phraseology": 2.385,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "01-ai/Yi-1.5-34B-Chat",
     "Params": "34.4B",
-    "Average": 2.7756410256410255,
     "Sentiment": 3.076923076923077,
     "Language understanding": 2.87,
     "Phraseology": 2.38,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "Voicelab/trurl-2-13b-academic",
     "Params": "13B",
-    "Average": 2.74042735042735,
     "Sentiment": 3.301282051282051,
     "Language understanding": 2.755,
     "Phraseology": 2.165,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "google/gemma-2-2b-it",
     "Params": "2.61B",
-    "Average": 2.7974786324786325,
     "Sentiment": 3.3974358974359,
     "Language understanding": 2.9,
     "Phraseology": 2.095,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "Qwen/Qwen2.5-3B-Instruct",
     "Params": "3.09B",
-    "Average": 2.734572649572649,
     "Sentiment": 2.948717948717949,
     "Language understanding": 2.455,
     "Phraseology": 2.8,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "NousResearch/Hermes-3-Llama-3.2-3B",
     "Params": "3.21B",
-    "Average": 2.695128205128205,
     "Sentiment": 2.6153846153846154,
     "Language understanding": 2.705,
     "Phraseology": 2.765,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "ibm-granite/granite-3.1-2b-instruct",
     "Params": "2.53B",
-    "Average": 2.397307692307692,
     "Sentiment": 3.076923076923077,
     "Language understanding": 2.235,
     "Phraseology": 1.88,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "meta-llama/Llama-3.2-1B-Instruct",
     "Params": "1.24B",
-    "Average": 2.383974358974359,
     "Sentiment": 3.076923076923077,
     "Language understanding": 1.735,
     "Phraseology": 2.34,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "microsoft/Phi-3.5-mini-instruct",
     "Params": "3.82B",
-    "Average": 2.331965811965812,
     "Sentiment": 2.435897435897436,
     "Language understanding": 2.135,
     "Phraseology": 2.425,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "meta-llama/Llama-3.2-3B-Instruct",
     "Params": "3.21B",
-    "Average": 2.257136752136752,
     "Sentiment": 2.7564102564102564,
     "Language understanding": 2.295,
     "Phraseology": 1.72,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "h2oai/h2o-danube2-1.8b-chat",
     "Params": "1.83B",
-    "Average": 2.1455982905982904,
     "Sentiment": 2.371794871794872,
     "Language understanding": 1.595,
     "Phraseology": 2.47,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "Qwen/Qwen2.5-1.5B-Instruct",
     "Params": "1.54B",
-    "Average": 2.1232905982905983,
     "Sentiment": 2.7948717948717947,
     "Language understanding": 1.35,
     "Phraseology": 2.225,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "utter-project/EuroLLM-1.7B-Instruct",
     "Params": "1.66B",
-    "Average": 2.097863247863248,
     "Sentiment": 2.243589743589744,
     "Language understanding": 1.79,
     "Phraseology": 2.26,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct",
     "Params": "2.41B",
-    "Average": 2.062846282695529,
     "Sentiment": 1.9423076923076923,
     "Language understanding": 2.1155778894472363,
     "Phraseology": 2.130653266331658,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
     "Params": "1.71B",
-    "Average": 1.9102136752136751,
     "Sentiment": 2.275641025641025,
     "Language understanding": 1.1,
     "Phraseology": 2.355,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "Qwen/Qwen2.5-0.5B-Instruct",
     "Params": "0.49B",
-    "Average": 1.7950427350427354,
     "Sentiment": 1.955128205128205,
     "Language understanding": 0.835,
     "Phraseology": 2.595,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "CYFRAGOVPL/Llama-PLLuM-70B-chat",
     "Params": "70.6B",
-    "Average": 3.63,
     "Sentiment": 3.94,
     "Language understanding": 3.61,
     "Phraseology": 3.35,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "CYFRAGOVPL/PLLuM-8x7B-nc-instruct",
     "Params": "46.7B",
-    "Average": 3.56,
     "Sentiment": 3.88,
     "Language understanding": 3.59,
     "Phraseology": 3.22,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "CYFRAGOVPL/Llama-PLLuM-70B-instruct",
     "Params": "70.6B",
-    "Average": 3.56,
     "Sentiment": 3.78,
     "Language understanding": 3.63,
     "Phraseology": 3.26,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "CYFRAGOVPL/PLLuM-8x7B-instruct",
     "Params": "46.7B",
-    "Average": 3.5,
     "Sentiment": 3.59,
     "Language understanding": 3.47,
     "Phraseology": 3.46,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "CYFRAGOVPL/PLLuM-12B-instruct",
     "Params": "12.2B",
-    "Average": 3.49,
     "Sentiment": 3.71,
     "Language understanding": 3.17,
     "Phraseology": 3.59,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "CYFRAGOVPL/PLLuM-8x7B-nc-chat",
     "Params": "46.7B",
-    "Average": 3.44,
     "Sentiment": 3.76,
     "Language understanding": 3.48,
     "Phraseology": 3.08,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "CYFRAGOVPL/PLLuM-8x7B-chat",
     "Params": "46.7B",
-    "Average": 3.41,
     "Sentiment": 3.44,
     "Language understanding": 3.45,
     "Phraseology": 3.35,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "CYFRAGOVPL/PLLuM-12B-chat",
     "Params": "12.2B",
-    "Average": 3.32,
     "Sentiment": 3.32,
     "Language understanding": 3.21,
     "Phraseology": 3.43,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "CYFRAGOVPL/PLLuM-12B-nc-instruct",
     "Params": "12.2B",
-    "Average": 3.29,
     "Sentiment": 3.24,
     "Language understanding": 3.31,
     "Phraseology": 3.32,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "CYFRAGOVPL/Llama-PLLuM-8B-instruct",
     "Params": "8.03B",
-    "Average": 3.2,
     "Sentiment": 3.24,
-    "Language understanding": 2.
+    "Language understanding": 2.90,
     "Phraseology": 3.46,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "CYFRAGOVPL/Llama-PLLuM-8B-chat",
     "Params": "8.03B",
-    "Average": 3.14,
     "Sentiment": 3.13,
     "Language understanding": 2.93,
     "Phraseology": 3.36,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "CYFRAGOVPL/PLLuM-12B-nc-chat",
     "Params": "12.2B",
-    "Average": 3.33,
     "Sentiment": 3.22,
     "Language understanding": 3.23,
     "Phraseology": 3.54,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "Qwen/Qwen2.5-72B-Instruct",
     "Params": "72.7B",
-    "Average": 3.9923076923076923,
     "Sentiment": 4.076923076923077,
     "Language understanding": 3.97,
     "Phraseology": 3.93,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "Qwen/Qwen2.5-32B-Instruct",
     "Params": "32.8B",
-    "Average": 3.8047008547008545,
     "Sentiment": 3.8141025641025643,
     "Language understanding": 3.565,
     "Phraseology": 4.035,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "mistralai/Mistral-Small-24B-Instruct-2501",
     "Params": "23.6B",
-    "Average": 3.79508547008547,
     "Sentiment": 3.91025641025641,
     "Language understanding": 3.6,
     "Phraseology": 3.875,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "meta-llama/Llama-3.3-70B-Instruct",
     "Params": "70.6B",
-    "Average": 3.7332905982905977,
     "Sentiment": 4.294871794871795,
     "Language understanding": 3.865,
     "Phraseology": 3.04,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "Qwen/Qwen2.5-14B-Instruct",
     "Params": "14.8B",
-    "Average": 3.61508547008547,
     "Sentiment": 3.91025641025641,
     "Language understanding": 3.565,
     "Phraseology": 3.37,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "microsoft/phi-4",
     "Params": "14.7B",
-    "Average": 3.4976495726495727,
     "Sentiment": 3.717948717948718,
     "Language understanding": 3.54,
     "Phraseology": 3.235,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "Qwen/Qwen2.5-7B-Instruct",
     "Params": "7.62B",
-    "Average": 3.2258974358974357,
     "Sentiment": 3.5576923076923075,
     "Language understanding": 3.025,
     "Phraseology": 3.095,
-    "Tricky questions":
+    "Tricky questions": 3.9
   },
   {
     "Model": "microsoft/Phi-4-mini-instruct",
     "Params": "3.84B",
-    "Average": 2.455769230769231,
     "Sentiment": 2.6923076923076925,
     "Language understanding": 2.43,
     "Phraseology": 2.245,
-    "Tricky questions":
+    "Tricky questions": 3.9
+  },
+  {
+    "Model": "gemini-2.0-flash-001",
+    "Params": "",
+    "Sentiment": 4.519230769230769,
+    "Language understanding": 4.32,
+    "Phraseology": 4.34,
+    "Tricky questions": 3.9
+  },
+  {
+    "Model": "gemini-2.0-flash-lite-001",
+    "Params": "",
+    "Sentiment": 4.230769230769231,
+    "Language understanding": 4.055,
+    "Phraseology": 4.235,
+    "Tricky questions": 3.9
+  },
+  {
+    "Model": "deepseek-ai/DeepSeek-V3 (API)",
+    "Params": "685B",
+    "Sentiment": 4.358974358974359,
+    "Language understanding": 4.22,
+    "Phraseology": 3.525,
+    "Tricky questions": 3.9
+  },
+  {
+    "Model": "google/gemma-3-27b-it (API)",
+    "Params": "27.4B",
+    "Sentiment": 3.878205128205128,
+    "Language understanding": 3.785,
+    "Phraseology": 4.025,
+    "Tricky questions": 3.9
   }
-]
+]
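Editor's note (not part of the commit): with the precomputed "Average" field dropped from data.json, any consumer other than app.py has to derive it from the four task scores, mirroring load_data above. A minimal sketch:

import json
import pandas as pd

with open("data.json", encoding="utf-8") as f:
    df = pd.DataFrame(json.load(f))

# Recompute the ranking averages exactly as app.py's load_data does
df["Average"] = df[["Sentiment", "Language understanding",
                    "Phraseology", "Tricky questions"]].mean(axis=1)
df["Implicatures average"] = df[["Sentiment", "Language understanding",
                                 "Phraseology"]].mean(axis=1)

# For the first entry above (Mistral-Large-Instruct-2407):
# (4.230769... + 4.0 + 3.86 + 3.9) / 4 is roughly 3.998
print(df.loc[0, ["Model", "Average"]])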