koderfpv committed on
Commit 95c0f54 · 1 Parent(s): 48ab5dd

implicatures and tricky questions

Files changed (2):
  1. app.py +120 -95
  2. data.json +92 -118
app.py CHANGED
@@ -7,6 +7,7 @@ import plotly.express as px
7
  from st_social_media_links import SocialMediaIcons
8
 
9
 
 
10
  RESULTS_COLUMN_NAME = "Results"
11
  AVERAGE_COLUMN_NAME = "Average"
12
  SENTIMENT_COLUMN_NAME = "Sentiment"
@@ -16,76 +17,60 @@ TRICKY_QUESTIONS_COLUMN_NAME = "Tricky questions"
16
  IMPLICATURES_AVERAGE_COLUMN_NAME = "Implicatures average"
17
 
18
  # Function to load data from JSON file
 
 
19
  @st.cache_data
20
  def load_data(file_path):
21
  with open(file_path, 'r', encoding='utf-8') as file:
22
  data = json.load(file)
23
- return pd.DataFrame(data)
24
 
25
  # Function to style the DataFrame
 
 
26
  @st.cache_data
27
  def style_dataframe(df: pd.DataFrame):
28
- # Calculate Implicatures average from the three columns
29
- df[IMPLICATURES_AVERAGE_COLUMN_NAME] = df.apply(
30
- lambda row: (row[SENTIMENT_COLUMN_NAME] + row[UNDERSTANDING_COLUMN_NAME] + row[PHRASEOLOGY_COLUMN_NAME]) / 3,
31
- axis=1
32
- )
33
-
34
- # Calculate Average from all four columns
35
- df[AVERAGE_COLUMN_NAME] = df.apply(
36
- lambda row: (row[SENTIMENT_COLUMN_NAME] + row[UNDERSTANDING_COLUMN_NAME] +
37
- row[PHRASEOLOGY_COLUMN_NAME] + row[TRICKY_QUESTIONS_COLUMN_NAME]) / 4,
38
- axis=1
39
- )
40
-
41
- df[RESULTS_COLUMN_NAME] = df.apply(
42
- lambda row: [row[SENTIMENT_COLUMN_NAME], row[UNDERSTANDING_COLUMN_NAME],
43
- row[PHRASEOLOGY_COLUMN_NAME], row[TRICKY_QUESTIONS_COLUMN_NAME]],
44
- axis=1
45
- )
46
-
47
- # Insert the new column after the 'Average' column
48
  cols = list(df.columns)
49
- avg_index = cols.index(AVERAGE_COLUMN_NAME)
50
-
51
- # Remove columns from their current positions if they exist
52
- if IMPLICATURES_AVERAGE_COLUMN_NAME in cols:
53
- cols.pop(cols.index(IMPLICATURES_AVERAGE_COLUMN_NAME))
54
- if RESULTS_COLUMN_NAME in cols:
55
- cols.pop(cols.index(RESULTS_COLUMN_NAME))
56
-
57
- # Insert columns in the desired order
58
- cols.insert(avg_index + 1, IMPLICATURES_AVERAGE_COLUMN_NAME)
59
- cols.insert(avg_index + 2, RESULTS_COLUMN_NAME)
60
-
 
61
  df = df[cols]
 
62
  return df
63
 
 
64
  def styler(df: pd.DataFrame):
65
  palette = sns.color_palette("RdYlGn", as_cmap=True)
66
  # Apply reverse color gradient to the "Params" column
67
- params_palette = sns.color_palette("RdYlGn_r", as_cmap=True) # Reversed RdYlGn palette
68
- styled_df = df.style.background_gradient(
69
- cmap=palette,
70
- subset=[AVERAGE_COLUMN_NAME, IMPLICATURES_AVERAGE_COLUMN_NAME, SENTIMENT_COLUMN_NAME,
71
- PHRASEOLOGY_COLUMN_NAME, UNDERSTANDING_COLUMN_NAME, TRICKY_QUESTIONS_COLUMN_NAME]
72
- ).background_gradient(
73
- cmap=params_palette, subset=["Params"]
74
- ).set_properties(
75
- **{'text-align': 'center'},
76
- subset=[AVERAGE_COLUMN_NAME, IMPLICATURES_AVERAGE_COLUMN_NAME, SENTIMENT_COLUMN_NAME,
77
- PHRASEOLOGY_COLUMN_NAME, UNDERSTANDING_COLUMN_NAME, TRICKY_QUESTIONS_COLUMN_NAME]
78
- ).format(
79
- "{:.2f}".center(10),
80
- subset=[AVERAGE_COLUMN_NAME, IMPLICATURES_AVERAGE_COLUMN_NAME, SENTIMENT_COLUMN_NAME,
81
- PHRASEOLOGY_COLUMN_NAME, UNDERSTANDING_COLUMN_NAME, TRICKY_QUESTIONS_COLUMN_NAME]
82
- ).format(
83
- "{:.1f}".center(10), subset=["Params"]
84
- )
85
  return styled_df
86
 
87
 
88
- ### Streamlit app
89
  st.set_page_config(layout="wide")
90
 
91
  st.markdown("""
@@ -100,7 +85,7 @@ st.markdown("""
100
  </style>
101
  """, unsafe_allow_html=True)
102
 
103
- ### Prepare layout
104
 
105
  st.markdown("""
106
  <style>
@@ -113,12 +98,10 @@ st.markdown("""
113
  .center-text {
114
  text-align: center;
115
  }
116
-
117
  a:link {color:#FDA428;} /* unvisited link */
118
  a:hover {color:#FDA428;} /* Mouse over link */
119
  a:visited {color:#FDA428;} /* visited link */
120
  a:active {color:#FDA428;} /* selected link */
121
-
122
  </style>
123
  """, unsafe_allow_html=True)
124
 
@@ -131,7 +114,7 @@ st.markdown("""
131
  # ----------------------------------------------------------
132
  st.markdown("""<br>""", unsafe_allow_html=True)
133
 
134
- ### Row: 1 --> Title + links to SpeakLeash.org website / GitHub / X (Twitter)
135
  social_media_links = [
136
  "https://discord.com/invite/ZJwCMrxwT7",
137
  "https://github.com/speakleash",
@@ -154,7 +137,8 @@ social_media_links_colors = [
154
  links_color
155
  ]
156
 
157
- social_media_icons = SocialMediaIcons(social_media_links, social_media_links_colors)
 
158
  social_media_icons.render(justify_content='right')
159
 
160
  st.markdown("""
@@ -184,46 +168,86 @@ with tab1:
184
 
185
  # Prepare data
186
  data = load_data('data.json')
187
-
188
- data['Params'] = data['Params'].str.replace('B', '').astype(float)
189
  data = data.sort_values(by=AVERAGE_COLUMN_NAME, ascending=False)
190
 
191
  # Closing filters in an expander
192
  with st.expander("Filtering benchmark data", icon='🔍'):
193
  # Filtering data, e.g. slider for params, average score, etc.
194
- col_filter_params, col_filter_average, col_filter_sentiment, col_filter_understanding, col_filter_phraseology, col_filter_tricky = st.columns(6, gap='medium')
195
-
 
196
  with col_filter_params:
197
- params_slider = st.slider("Models Size [B]", min_value=0.0, max_value=float(data['Params'].max()), value=(0.0, float(data['Params'].max())), step=0.1, format="%.1f")
198
- data = data[(data['Params'] >= params_slider[0]) & (data['Params'] <= params_slider[1])]
199
 
200
  with col_filter_average:
201
- average_slider = st.slider("Average score", step=0.1, min_value=0.0, max_value=5.0, value=(0.0, 5.0))
202
- data = data[(data[AVERAGE_COLUMN_NAME] >= average_slider[0]) & (data[AVERAGE_COLUMN_NAME] <= average_slider[1])]
203
 
204
  with col_filter_sentiment:
205
- sentiment_slider = st.slider("Sentiment score", step=0.1, min_value=0.0, max_value=5.0, value=(0.0, 5.0))
206
- data = data[(data[SENTIMENT_COLUMN_NAME] >= sentiment_slider[0]) & (data[SENTIMENT_COLUMN_NAME] <= sentiment_slider[1])]
 
 
207
 
208
  with col_filter_understanding:
209
- understanding_slider = st.slider("Understanding score", step=0.1, min_value=0.0, max_value=5.0, value=(0.0, 5.0))
210
- data = data[(data[UNDERSTANDING_COLUMN_NAME] >= understanding_slider[0]) & (data[UNDERSTANDING_COLUMN_NAME] <= understanding_slider[1])]
 
 
211
 
212
  with col_filter_phraseology:
213
- phraseology_slider = st.slider("Phraseology score", step=0.1, min_value=0.0, max_value=5.0, value=(0.0, 5.0))
214
- data = data[(data[PHRASEOLOGY_COLUMN_NAME] >= phraseology_slider[0]) & (data[PHRASEOLOGY_COLUMN_NAME] <= phraseology_slider[1])]
215
-
216
- with col_filter_tricky:
217
- tricky_slider = st.slider("Tricky questions score", step=0.1, min_value=0.0, max_value=5.0, value=(0.0, 5.0))
218
- data = data[(data[TRICKY_QUESTIONS_COLUMN_NAME] >= tricky_slider[0]) & (data[TRICKY_QUESTIONS_COLUMN_NAME] <= tricky_slider[1])]
219
 
220
  # Extract unique provider names from the "Model" column
221
- providers = data["Model"].apply(lambda x: x.split('/')[0].lower()).unique()
222
- selected_providers = st.multiselect("Model providers", providers, default=providers)
 
 
223
  # Filter data based on selected providers
224
- data = data[data["Model"].apply(lambda x: x.split('/')[0].lower()).isin(selected_providers)]
 
225
 
226
- ### Display data
227
  styled_df_show = style_dataframe(data)
228
  styled_df_show = styler(styled_df_show)
229
 
@@ -233,23 +257,26 @@ with tab1:
233
  AVERAGE_COLUMN_NAME: st.column_config.NumberColumn(AVERAGE_COLUMN_NAME),
234
  RESULTS_COLUMN_NAME: st.column_config.BarChartColumn(
235
  "Bar chart of results", help="Summary of the results of each task",
236
- y_min=0,y_max=5,),
237
  SENTIMENT_COLUMN_NAME: st.column_config.NumberColumn(SENTIMENT_COLUMN_NAME, help='Ability to analyze sentiment'),
238
  UNDERSTANDING_COLUMN_NAME: st.column_config.NumberColumn(UNDERSTANDING_COLUMN_NAME, help='Ability to understand language'),
239
  PHRASEOLOGY_COLUMN_NAME: st.column_config.NumberColumn(PHRASEOLOGY_COLUMN_NAME, help='Ability to understand phraseological compounds'),
240
  TRICKY_QUESTIONS_COLUMN_NAME: st.column_config.NumberColumn(TRICKY_QUESTIONS_COLUMN_NAME, help='Ability to understand tricky questions'),
241
- IMPLICATURES_AVERAGE_COLUMN_NAME: st.column_config.NumberColumn(IMPLICATURES_AVERAGE_COLUMN_NAME, help='Average of sentiment, understanding, and phraseology'),
242
- }, hide_index=True, disabled=True, height=500)
243
 
244
  # Add selection for models and create a bar chart for selected models using the AVERAGE_COLUMN_NAME, SENTIMENT_COLUMN_NAME, PHRASEOLOGY_COLUMN_NAME, UNDERSTANDING_COLUMN_NAME
245
  # Add default selection of 3 best models from AVERAGE_COLUMN_NAME and 1 best model with "Bielik" in Model column
246
- default_models = list(data.sort_values(AVERAGE_COLUMN_NAME, ascending=False)['Model'].head(3))
247
- bielik_model = data[data['Model'].str.contains('Bielik')].sort_values(AVERAGE_COLUMN_NAME, ascending=False)['Model'].iloc[0]
 
 
248
  if bielik_model not in default_models:
249
  default_models.append(bielik_model)
250
- selected_models = st.multiselect("Select models to compare", data["Model"].unique(), default=default_models)
 
251
  selected_data = data[data["Model"].isin(selected_models)]
252
- categories = [AVERAGE_COLUMN_NAME, IMPLICATURES_AVERAGE_COLUMN_NAME, SENTIMENT_COLUMN_NAME, PHRASEOLOGY_COLUMN_NAME, UNDERSTANDING_COLUMN_NAME, TRICKY_QUESTIONS_COLUMN_NAME]
 
253
 
254
  if selected_models:
255
  # Colors to choose from:
@@ -258,7 +285,8 @@ with tab1:
258
 
259
  fig_bars = go.Figure()
260
  for model, color in zip(selected_models, colors):
261
- values = selected_data[selected_data['Model'] == model][categories].values.flatten().tolist()
 
262
  fig_bars.add_trace(go.Bar(
263
  x=categories,
264
  y=values,
@@ -269,7 +297,8 @@ with tab1:
269
  # Update layout to use a custom color scale
270
  fig_bars.update_layout(
271
  showlegend=True,
272
- legend=dict(orientation="h", yanchor="top", y=-0.3, xanchor="center", x=0.5),
 
273
  title="Comparison of Selected Models",
274
  yaxis_title="Score",
275
  template="plotly_dark"
@@ -278,29 +307,25 @@ with tab1:
278
  st.plotly_chart(fig_bars)
279
 
280
 
281
- ### Tab 2 --> Description
282
  with tab2:
283
  st.markdown("""
284
  ### <span style='text-decoration: #FDA428 wavy underline;'>**Cause of Creation**</span>
285
  1. **Need**: Models face significant challenges in understanding complex, context-reliant texts that involve meanings implied beyond the literal content of a statement. Such cases include sarcasm, implicatures, and phraseological compounds.
286
-
287
  Traditional sentiment classifiers typically rely on word-based features (e.g., identifying positive or negative words) to assess sentiment. However, with sarcasm, the literal meaning of words often contradicts the intended sentiment, making it difficult for models to accurately gauge tone. Sarcasm's context-dependence further complicates matters, as these classifiers typically lack the ability to grasp nuanced cues in context, especially when sarcasm is subtle.
288
  Similarly, classifiers struggle with implicatures, where the underlying intent is implied rather than explicitly stated. Here, models fail to capture the full sentiment because they rely heavily on surface-level words, missing the non-literal meaning that often drives the sentiment.
289
  Phraseological compounds add another layer of difficulty. These are fixed or semi-fixed expressions whose meanings cannot be directly inferred from the individual words. Language models, trained on word-level patterns, often misinterpret these expressions because they fail to recognize the idiomatic or non-literal meaning, leading to inaccurate sentiment analysis.
290
  In addition to sentiment analysis, we decided to include the understanding of more complex texts in the benchmark, which was measured by the ability to uncover the intended meaning.
291
-
292
  ### <span style='text-decoration: #FDA428 wavy underline;'>**Dataset Information**</span>
293
  The dataset contains 200 examples, all written in Polish. Each example consists of the following:
294
  - **Main Text**: This is a statement (often an opinion) on any topic that includes a certain type of implicature, often several simultaneously, such as sarcasm or phraseological compounds.
295
  - **Reference Sentiment**: The sentiment associated with the main text. We use three categories: negative, neutral, and positive. Ambiguous examples were labeled as "neutral" to exclude them from sentiment classification testing.
296
  - **Reference phraseological compounds**: A list of phraseological compounds found in the main text.
297
  - **Reference Explanation**: An explanation of the underlying intentions that the author of the main text might have had.
298
-
299
  ### <span style='text-decoration: #FDA428 wavy underline;'>**Evaluation Procedure**</span>
300
  We distinguish between two models in the evaluation process:
301
  - **Evaluated Model**: The model that performs the specified tasks; it is then assessed on its performance and added to the ranking.
302
  - **Judge Metamodel**: One of the currently strongest, most versatile LLMs.
303
-
304
  ### <span style='text-decoration: #FDA428 wavy underline;'>**GENERATING RESPONSES FROM THE EVALUATED MODEL**</span>
305
  1. For each text in the dataset, the evaluated model was required to list the following in three points:
306
  - The sentiment (only positive/negative).
@@ -316,7 +341,6 @@ We distinguish between two models in the evaluation process:
316
  - **Assistant Prompt**: A human-written example answer for the second example text.
317
  - **User Prompt**: The target text, based on which the evaluated model will be assessed.
318
  3. The examples were split into user prompts and assistant prompts because the vast majority of models achieved better results with this format. The two examples were selected based on diversity: one has a negative sentiment and several phraseological compounds, while the other is positive and lacks phraseological compounds.
319
-
320
  ### <span style='text-decoration: #FDA428 wavy underline;'>**GENERATING METAMODEL EVALUATIONS**</span>
321
  1. The purpose of the metamodel is to return the following evaluations:
322
  - **Understanding of the Text**: A comparison of the evaluated model's response description to the reference explanation.
@@ -356,9 +380,10 @@ st.markdown("""
356
  - [Remigiusz Kinas](https://www.linkedin.com/in/remigiusz-kinas/) - methodological support
357
  - [Krzysztof Wróbel](https://www.linkedin.com/in/wrobelkrzysztof/) - engineering, methodological support
358
  - [Szymon Baczyński](https://www.linkedin.com/in/szymon-baczynski/) - front-end / streamlit assistant
 
359
  - [Maria Filipkowska](https://www.linkedin.com/in/maria-filipkowska/) - writing text, linguistic support
360
  """)
361
 
362
  st.divider()
363
 
364
- # Run the app with `streamlit run your_script.py`
 
7
  from st_social_media_links import SocialMediaIcons
8
 
9
 
10
+ PARAMS_COLUMN_NAME = "Params"
11
  RESULTS_COLUMN_NAME = "Results"
12
  AVERAGE_COLUMN_NAME = "Average"
13
  SENTIMENT_COLUMN_NAME = "Sentiment"
 
17
  IMPLICATURES_AVERAGE_COLUMN_NAME = "Implicatures average"
18
 
19
  # Function to load data from JSON file
20
+
21
+
22
  @st.cache_data
23
  def load_data(file_path):
24
  with open(file_path, 'r', encoding='utf-8') as file:
25
  data = json.load(file)
26
+ df = pd.DataFrame(data)
27
+ df[AVERAGE_COLUMN_NAME] = df[['Sentiment',
28
+ 'Language understanding', 'Phraseology', 'Tricky questions']].mean(axis=1)
29
+
30
+ df[IMPLICATURES_AVERAGE_COLUMN_NAME] = df[['Sentiment',
31
+ 'Language understanding', 'Phraseology']].mean(axis=1)
32
+ return df
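Note: with this change, `Average` and `Implicatures average` are no longer stored in `data.json` but are derived in `load_data` from the per-task scores (four and three columns respectively). A minimal sketch of the arithmetic, using the scores of the first `data.json` entry in this commit:

```python
# Sketch only: reproduces the two derived columns for a single row.
import pandas as pd

row = pd.DataFrame([{
    "Sentiment": 4.230769230769231,
    "Language understanding": 4.0,
    "Phraseology": 3.86,
    "Tricky questions": 3.9,
}])

average = row[["Sentiment", "Language understanding",
               "Phraseology", "Tricky questions"]].mean(axis=1)
implicatures_average = row[["Sentiment", "Language understanding",
                            "Phraseology"]].mean(axis=1)

print(round(average.iloc[0], 4))               # ~3.9977
print(round(implicatures_average.iloc[0], 4))  # ~4.0303
```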
33
 
34
  # Function to style the DataFrame
35
+
36
+
37
  @st.cache_data
38
  def style_dataframe(df: pd.DataFrame):
39
+ df[RESULTS_COLUMN_NAME] = df.apply(lambda row: [
40
+ row[SENTIMENT_COLUMN_NAME], row[UNDERSTANDING_COLUMN_NAME], row[PHRASEOLOGY_COLUMN_NAME], row[TRICKY_QUESTIONS_COLUMN_NAME]], axis=1)
41
  cols = list(df.columns)
42
+
43
+ # move average column
44
+ cols.insert(cols.index(PARAMS_COLUMN_NAME) + 1,
45
+ cols.pop(cols.index(AVERAGE_COLUMN_NAME)))
46
+
47
+ # move implicatures average column
48
+ cols.insert(cols.index(AVERAGE_COLUMN_NAME) + 1,
49
+ cols.pop(cols.index(IMPLICATURES_AVERAGE_COLUMN_NAME)))
50
+
51
+ # move results column
52
+ cols.insert(cols.index(IMPLICATURES_AVERAGE_COLUMN_NAME) + 1,
53
+ cols.pop(cols.index(RESULTS_COLUMN_NAME)))
54
+ # Apply the new column order
55
  df = df[cols]
56
+ # Return the DataFrame with reordered columns
57
  return df
58
 
59
+
60
  def styler(df: pd.DataFrame):
61
  palette = sns.color_palette("RdYlGn", as_cmap=True)
62
  # Apply reverse color gradient to the "Params" column
63
+ params_palette = sns.color_palette(
64
+ "RdYlGn_r", as_cmap=True) # Reversed RdYlGn palette
65
+ styled_df = df.style.background_gradient(cmap=palette, subset=[AVERAGE_COLUMN_NAME, SENTIMENT_COLUMN_NAME, PHRASEOLOGY_COLUMN_NAME, UNDERSTANDING_COLUMN_NAME, TRICKY_QUESTIONS_COLUMN_NAME, IMPLICATURES_AVERAGE_COLUMN_NAME]
66
+ ).background_gradient(cmap=params_palette, subset=["Params"]
67
+ ).set_properties(**{'text-align': 'center'}, subset=[AVERAGE_COLUMN_NAME, SENTIMENT_COLUMN_NAME, PHRASEOLOGY_COLUMN_NAME, UNDERSTANDING_COLUMN_NAME, TRICKY_QUESTIONS_COLUMN_NAME, IMPLICATURES_AVERAGE_COLUMN_NAME]
68
+ ).format("{:.2f}".center(10), subset=[AVERAGE_COLUMN_NAME, SENTIMENT_COLUMN_NAME, PHRASEOLOGY_COLUMN_NAME, UNDERSTANDING_COLUMN_NAME, TRICKY_QUESTIONS_COLUMN_NAME, IMPLICATURES_AVERAGE_COLUMN_NAME]
69
+ ).format("{:.1f}".center(10), subset=["Params"])
 
 
 
 
 
 
 
 
 
 
 
70
  return styled_df
71
 
72
 
73
+ # Streamlit app
74
  st.set_page_config(layout="wide")
75
 
76
  st.markdown("""
 
85
  </style>
86
  """, unsafe_allow_html=True)
87
 
88
+ # Prepare layout
89
 
90
  st.markdown("""
91
  <style>
 
98
  .center-text {
99
  text-align: center;
100
  }
 
101
  a:link {color:#FDA428;} /* unvisited link */
102
  a:hover {color:#FDA428;} /* Mouse over link */
103
  a:visited {color:#FDA428;} /* visited link */
104
  a:active {color:#FDA428;} /* selected link */
 
105
  </style>
106
  """, unsafe_allow_html=True)
107
 
 
114
  # ----------------------------------------------------------
115
  st.markdown("""<br>""", unsafe_allow_html=True)
116
 
117
+ # Row: 1 --> Title + links to SpeakLeash.org website / GitHub / X (Twitter)
118
  social_media_links = [
119
  "https://discord.com/invite/ZJwCMrxwT7",
120
  "https://github.com/speakleash",
 
137
  links_color
138
  ]
139
 
140
+ social_media_icons = SocialMediaIcons(
141
+ social_media_links, social_media_links_colors)
142
  social_media_icons.render(justify_content='right')
143
 
144
  st.markdown("""
 
168
 
169
  # Prepare data
170
  data = load_data('data.json')
171
+
172
+ data['Params'] = pd.to_numeric(
173
+ data['Params'].str.replace('B', ''),
174
+ errors='coerce'
175
+ )
176
  data = data.sort_values(by=AVERAGE_COLUMN_NAME, ascending=False)
177
 
178
  # Closing filters in an expander
179
  with st.expander("Filtering benchmark data", icon='🔍'):
180
  # Filtering data, e.g. slider for params, average score, etc.
181
+ col_filter_params, col_filter_average, col_filter_implicatures_average, col_filter_sentiment, col_filter_understanding, col_filter_phraseology, col_filter_tricky_questions = st.columns(
182
+ 7, gap='medium')
183
+
184
  with col_filter_params:
185
+ max_params = data['Params'].max(skipna=True)
186
+ if pd.isna(max_params):
187
+ max_params = 0.0
188
+
189
+ params_slider = st.slider(
190
+ "Models Size [B]",
191
+ min_value=0.0,
192
+ max_value=float(max_params),
193
+ value=(0.0, float(max_params)),
194
+ step=0.1,
195
+ format="%.1f"
196
+ )
197
+ data = data[
198
+ data['Params'].isna() |
199
+ (
200
+ (data['Params'] >= params_slider[0]) &
201
+ (data['Params'] <= params_slider[1])
202
+ )
203
+ ]
204
 
205
  with col_filter_average:
206
+ average_slider = st.slider(
207
+ "Average score", step=0.1, min_value=0.0, max_value=5.0, value=(0.0, 5.0))
208
+ data = data[(data[AVERAGE_COLUMN_NAME] >= average_slider[0]) & (
209
+ data[AVERAGE_COLUMN_NAME] <= average_slider[1])]
210
+
211
+ with col_filter_implicatures_average:
212
+ implicatures_average_slider = st.slider(
213
+ "Implicatures average", step=0.1, min_value=0.0, max_value=5.0, value=(0.0, 5.0))
214
+ data = data[(data[IMPLICATURES_AVERAGE_COLUMN_NAME] >= implicatures_average_slider[0]) & (
215
+ data[IMPLICATURES_AVERAGE_COLUMN_NAME] <= implicatures_average_slider[1])]
216
 
217
  with col_filter_sentiment:
218
+ sentiment_slider = st.slider(
219
+ "Sentiment score", step=0.1, min_value=0.0, max_value=5.0, value=(0.0, 5.0))
220
+ data = data[(data[SENTIMENT_COLUMN_NAME] >= sentiment_slider[0]) & (
221
+ data[SENTIMENT_COLUMN_NAME] <= sentiment_slider[1])]
222
 
223
  with col_filter_understanding:
224
+ understanding_slider = st.slider(
225
+ "Understanding score", step=0.1, min_value=0.0, max_value=5.0, value=(0.0, 5.0))
226
+ data = data[(data[UNDERSTANDING_COLUMN_NAME] >= understanding_slider[0]) & (
227
+ data[UNDERSTANDING_COLUMN_NAME] <= understanding_slider[1])]
228
 
229
  with col_filter_phraseology:
230
+ phraseology_slider = st.slider(
231
+ "Phraseology score", step=0.1, min_value=0.0, max_value=5.0, value=(0.0, 5.0))
232
+ data = data[(data[PHRASEOLOGY_COLUMN_NAME] >= phraseology_slider[0]) & (
233
+ data[PHRASEOLOGY_COLUMN_NAME] <= phraseology_slider[1])]
234
+
235
+ with col_filter_tricky_questions:
236
+ tricky_questions_slider = st.slider(
237
+ "Tricky questions score", step=0.1, min_value=0.0, max_value=5.0, value=(0.0, 5.0))
238
+ data = data[(data[TRICKY_QUESTIONS_COLUMN_NAME] >= tricky_questions_slider[0]) & (
239
+ data[TRICKY_QUESTIONS_COLUMN_NAME] <= tricky_questions_slider[1])]
240
 
241
  # Extract unique provider names from the "Model" column
242
+ providers = data["Model"].apply(
243
+ lambda x: x.split('/')[0].lower()).unique()
244
+ selected_providers = st.multiselect(
245
+ "Model providers", providers, default=providers)
246
  # Filter data based on selected providers
247
+ data = data[data["Model"].apply(lambda x: x.split(
248
+ '/')[0].lower()).isin(selected_providers)]
249
 
250
+ # Display data
251
  styled_df_show = style_dataframe(data)
252
  styled_df_show = styler(styled_df_show)
253
 
 
257
  AVERAGE_COLUMN_NAME: st.column_config.NumberColumn(AVERAGE_COLUMN_NAME),
258
  RESULTS_COLUMN_NAME: st.column_config.BarChartColumn(
259
  "Bar chart of results", help="Summary of the results of each task",
260
+ y_min=0, y_max=5,),
261
  SENTIMENT_COLUMN_NAME: st.column_config.NumberColumn(SENTIMENT_COLUMN_NAME, help='Ability to analyze sentiment'),
262
  UNDERSTANDING_COLUMN_NAME: st.column_config.NumberColumn(UNDERSTANDING_COLUMN_NAME, help='Ability to understand language'),
263
  PHRASEOLOGY_COLUMN_NAME: st.column_config.NumberColumn(PHRASEOLOGY_COLUMN_NAME, help='Ability to understand phraseological compounds'),
264
  TRICKY_QUESTIONS_COLUMN_NAME: st.column_config.NumberColumn(TRICKY_QUESTIONS_COLUMN_NAME, help='Ability to understand tricky questions'),
265
+ }, hide_index=True, disabled=True, height=500)
 
266
 
267
  # Add selection for models and create a bar chart for selected models using the AVERAGE_COLUMN_NAME, SENTIMENT_COLUMN_NAME, PHRASEOLOGY_COLUMN_NAME, UNDERSTANDING_COLUMN_NAME
268
  # Add default selection of 3 best models from AVERAGE_COLUMN_NAME and 1 best model with "Bielik" in Model column
269
+ default_models = list(data.sort_values(
270
+ AVERAGE_COLUMN_NAME, ascending=False)['Model'].head(3))
271
+ bielik_model = data[data['Model'].str.contains('Bielik')].sort_values(
272
+ AVERAGE_COLUMN_NAME, ascending=False)['Model'].iloc[0]
273
  if bielik_model not in default_models:
274
  default_models.append(bielik_model)
275
+ selected_models = st.multiselect(
276
+ "Select models to compare", data["Model"].unique(), default=default_models)
277
  selected_data = data[data["Model"].isin(selected_models)]
278
+ categories = [AVERAGE_COLUMN_NAME, SENTIMENT_COLUMN_NAME,
279
+ PHRASEOLOGY_COLUMN_NAME, UNDERSTANDING_COLUMN_NAME, TRICKY_QUESTIONS_COLUMN_NAME]
280
 
281
  if selected_models:
282
  # Colors to choose from:
 
285
 
286
  fig_bars = go.Figure()
287
  for model, color in zip(selected_models, colors):
288
+ values = selected_data[selected_data['Model'] ==
289
+ model][categories].values.flatten().tolist()
290
  fig_bars.add_trace(go.Bar(
291
  x=categories,
292
  y=values,
 
297
  # Update layout to use a custom color scale
298
  fig_bars.update_layout(
299
  showlegend=True,
300
+ legend=dict(orientation="h", yanchor="top",
301
+ y=-0.3, xanchor="center", x=0.5),
302
  title="Comparison of Selected Models",
303
  yaxis_title="Score",
304
  template="plotly_dark"
 
307
  st.plotly_chart(fig_bars)
308
 
309
 
310
+ # Tab 2 --> Description
311
  with tab2:
312
  st.markdown("""
313
  ### <span style='text-decoration: #FDA428 wavy underline;'>**Cause of Creation**</span>
314
  1. **Need**: Models face significant challenges in understanding complex, context-reliant texts that involve meanings implied beyond the literal content of a statement. Such cases include sarcasm, implicatures, and phraseological compounds.
 
315
  Traditional sentiment classifiers typically rely on word-based features (e.g., identifying positive or negative words) to assess sentiment. However, with sarcasm, the literal meaning of words often contradicts the intended sentiment, making it difficult for models to accurately gauge tone. Sarcasm's context-dependence further complicates matters, as these classifiers typically lack the ability to grasp nuanced cues in context, especially when sarcasm is subtle.
316
  Similarly, classifiers struggle with implicatures, where the underlying intent is implied rather than explicitly stated. Here, models fail to capture the full sentiment because they rely heavily on surface-level words, missing the non-literal meaning that often drives the sentiment.
317
  Phraseological compounds add another layer of difficulty. These are fixed or semi-fixed expressions whose meanings cannot be directly inferred from the individual words. Language models, trained on word-level patterns, often misinterpret these expressions because they fail to recognize the idiomatic or non-literal meaning, leading to inaccurate sentiment analysis.
318
  In addition to sentiment analysis, we decided to include the understanding of more complex texts in the benchmark, which was measured by the ability to uncover the intended meaning.
 
319
  ### <span style='text-decoration: #FDA428 wavy underline;'>**Dataset Information**</span>
320
  The dataset contains 200 examples, all written in Polish. Each example consists of the following (a hypothetical sketch of one entry follows the list):
321
  - **Main Text**: This is a statement (often an opinion) on any topic that includes a certain type of implicature, often several simultaneously, such as sarcasm or phraseological compounds.
322
  - **Reference Sentiment**: The sentiment associated with the main text. We use three categories: negative, neutral, and positive. Ambiguous examples were labeled as "neutral" to exclude them from sentiment classification testing.
323
  - **Reference phraseological compounds**: A list of phraseological compounds found in the main text.
324
  - **Reference Explanation**: An explanation of the underlying intentions that the author of the main text might have had.
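As a rough sketch, a single (entirely hypothetical, English-glossed) entry could be structured like this; the field names mirror the description above, and the values are made up for illustration only:

```python
# Hypothetical dataset entry (illustrative only; real entries are written in Polish).
example = {
    "main_text": "Great, another Monday morning meeting. Exactly what I needed.",  # sarcastic opinion
    "reference_sentiment": "negative",           # one of: negative / neutral / positive
    "reference_phraseological_compounds": [],    # fixed expressions found in the main text, if any
    "reference_explanation": "The author is being sarcastic; the meeting is unwelcome.",
}
```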
 
325
  ### <span style='text-decoration: #FDA428 wavy underline;'>**Evaluation Procedure**</span>
326
  We distinguish between two models in the evaluation process:
327
  - **Evaluated Model**: The model that performs the specified tasks; it is then assessed on its performance and added to the ranking.
328
  - **Judge Metamodel**: One of the currently strongest, most versatile LLMs.
 
329
  ### <span style='text-decoration: #FDA428 wavy underline;'>**GENERATING RESPONSES FROM THE EVALUATED MODEL**</span>
330
  1. For each text in the dataset, the evaluated model was required to list the following in three points:
331
  - The sentiment (only positive/negative).
 
341
  - **Assistant Prompt**: A human-written example answer for the second example text.
342
  - **User Prompt**: The target text, based on which the evaluated model will be assessed.
343
  3. The examples were split into user prompts and assistant prompts because the vast majority of models achieved better results with this format. The two examples were selected based on diversity: one has a negative sentiment and several phraseological compounds, while the other is positive and lacks phraseological compounds. A sketch of the resulting message layout follows below.
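A minimal sketch of how the resulting few-shot conversation might be laid out (the exact prompt wording is not shown in this diff; the placeholders and the system message are assumptions):

```python
# Hypothetical message layout for querying the evaluated model (placeholders only).
messages = [
    {"role": "system", "content": "<task instructions>"},           # assumed system prompt
    {"role": "user", "content": "<example text 1>"},                # first example (user prompt)
    {"role": "assistant", "content": "<human-written answer 1>"},   # first example (assistant prompt)
    {"role": "user", "content": "<example text 2>"},                # second example (user prompt)
    {"role": "assistant", "content": "<human-written answer 2>"},   # second example (assistant prompt)
    {"role": "user", "content": "<target text to be assessed>"},    # the actual dataset item
]
```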
 
344
  ### <span style='text-decoration: #FDA428 wavy underline;'>**GENERATING METAMODEL EVALUATIONS**</span>
345
  1. The purpose of the metamodel is to return the following evaluations:
346
  - **Understanding of the Text**: A comparison of the evaluated model's response description to the reference explanation.
 
380
  - [Remigiusz Kinas](https://www.linkedin.com/in/remigiusz-kinas/) - methodological support
381
  - [Krzysztof Wróbel](https://www.linkedin.com/in/wrobelkrzysztof/) - engineering, methodological support
382
  - [Szymon Baczyński](https://www.linkedin.com/in/szymon-baczynski/) - front-end / streamlit assistant
383
+ - [Artur Słomowski](https://www.linkedin.com/in/arturslomowski/) - front-end / streamlit assistant
384
  - [Maria Filipkowska](https://www.linkedin.com/in/maria-filipkowska/) - writing text, linguistic support
385
  """)
386
 
387
  st.divider()
388
 
389
+ # Run the app with `streamlit run your_script.py`
data.json CHANGED
@@ -2,523 +2,497 @@
2
  {
3
  "Model": "mistralai/Mistral-Large-Instruct-2407",
4
  "Params": "123B",
5
- "Average": 4.03025641025641,
6
  "Sentiment": 4.230769230769231,
7
  "Language understanding": 4.0,
8
  "Phraseology": 3.86,
9
- "Tricky questions": 4.0
10
  },
11
  {
12
  "Model": "alpindale/WizardLM-2-8x22B",
13
  "Params": "141B",
14
- "Average": 3.9133760683760683,
15
  "Sentiment": 3.7051282051282053,
16
  "Language understanding": 3.815,
17
  "Phraseology": 4.22,
18
- "Tricky questions": 4.0
19
  },
20
  {
21
  "Model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
22
  "Params": "70.6B",
23
- "Average": 3.828974358974359,
24
  "Sentiment": 4.326923076923077,
25
  "Language understanding": 3.91,
26
  "Phraseology": 3.25,
27
- "Tricky questions": 4.0
28
  },
29
  {
30
  "Model": "meta-llama/Meta-Llama-3-70B-Instruct",
31
  "Params": "70.6B",
32
- "Average": 3.806538461538462,
33
  "Sentiment": 4.134615384615385,
34
  "Language understanding": 3.82,
35
  "Phraseology": 3.465,
36
- "Tricky questions": 4.0
37
  },
38
  {
39
  "Model": "speakleash/Bielik-11B-v2.3-Instruct",
40
  "Params": "11.2B",
41
- "Average": 3.7697863247863252,
42
  "Sentiment": 3.9743589743589745,
43
  "Language understanding": 3.785,
44
  "Phraseology": 3.55,
45
- "Tricky questions": 4.0
46
  },
47
  {
48
  "Model": "mistralai/Mixtral-8x22B-Instruct-v0.1",
49
  "Params": "141B",
50
- "Average": 3.6690170940170943,
51
  "Sentiment": 3.782051282051282,
52
  "Language understanding": 3.675,
53
  "Phraseology": 3.55,
54
- "Tricky questions": 4.0
55
  },
56
  {
57
  "Model": "speakleash/Bielik-11B-v2.1-Instruct",
58
  "Params": "11.2B",
59
- "Average": 3.6583760683760684,
60
  "Sentiment": 3.9551282051282053,
61
  "Language understanding": 3.915,
62
  "Phraseology": 3.105,
63
- "Tricky questions": 4.0
64
  },
65
  {
66
  "Model": "Qwen/Qwen2-72B-Instruct",
67
  "Params": "72.7B",
68
- "Average": 3.6442735042735044,
69
  "Sentiment": 3.7628205128205128,
70
  "Language understanding": 3.89,
71
  "Phraseology": 3.28,
72
- "Tricky questions": 4.0
73
  },
74
  {
75
  "Model": "speakleash/Bielik-11B-v2.0-Instruct",
76
  "Params": "11.2B",
77
- "Average": 3.614786324786325,
78
  "Sentiment": 3.9743589743589745,
79
  "Language understanding": 3.745,
80
  "Phraseology": 3.125,
81
- "Tricky questions": 4.0
82
  },
83
  {
84
  "Model": "speakleash/Bielik-11B-v2.2-Instruct",
85
  "Params": "11.2B",
86
- "Average": 3.565982905982906,
87
  "Sentiment": 3.717948717948718,
88
  "Language understanding": 3.73,
89
  "Phraseology": 3.25,
90
- "Tricky questions": 4.0
91
  },
92
  {
93
  "Model": "Qwen/Qwen1.5-72B-Chat",
94
  "Params": "72.3B",
95
- "Average": 3.3214529914529916,
96
  "Sentiment": 3.4743589743589745,
97
  "Language understanding": 3.515,
98
  "Phraseology": 2.975,
99
- "Tricky questions": 4.0
100
  },
101
  {
102
  "Model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
103
  "Params": "8.03B",
104
- "Average": 3.3114529914529918,
105
  "Sentiment": 3.9743589743589745,
106
  "Language understanding": 3.38,
107
  "Phraseology": 2.58,
108
- "Tricky questions": 4.0
109
  },
110
  {
111
  "Model": "THUDM/glm-4-9b-chat",
112
  "Params": "9.4B",
113
- "Average": 3.2749145299145295,
114
  "Sentiment": 3.58974358974359,
115
  "Language understanding": 3.455,
116
  "Phraseology": 2.78,
117
- "Tricky questions": 4.0
118
  },
119
  {
120
  "Model": "mistralai/Mistral-Nemo-Instruct-2407",
121
  "Params": "12.2B",
122
- "Average": 3.223675213675214,
123
  "Sentiment": 3.641025641025641,
124
  "Language understanding": 3.29,
125
  "Phraseology": 2.74,
126
- "Tricky questions": 4.0
127
  },
128
  {
129
  "Model": "meta-llama/Meta-Llama-3-8B-Instruct",
130
  "Params": "8.03B",
131
- "Average": 3.172777777777778,
132
  "Sentiment": 3.3333333333333335,
133
  "Language understanding": 3.15,
134
  "Phraseology": 3.035,
135
- "Tricky questions": 4.0
136
  },
137
  {
138
  "Model": "upstage/SOLAR-10.7B-Instruct-v1.0",
139
  "Params": "10.7B",
140
- "Average": 3.1343162393162394,
141
  "Sentiment": 2.967948717948718,
142
  "Language understanding": 3.18,
143
  "Phraseology": 3.255,
144
- "Tricky questions": 4.0
145
  },
146
  {
147
  "Model": "speakleash/Bielik-7B-Instruct-v0.1",
148
  "Params": "7.24B",
149
- "Average": 3.126581196581197,
150
  "Sentiment": 3.58974358974359,
151
  "Language understanding": 3.475,
152
  "Phraseology": 2.315,
153
- "Tricky questions": 4.0
154
  },
155
  {
156
  "Model": "openchat/openchat-3.5-0106-gemma",
157
  "Params": "8.54B",
158
- "Average": 3.08525641025641,
159
  "Sentiment": 3.730769230769231,
160
  "Language understanding": 3.08,
161
  "Phraseology": 2.445,
162
- "Tricky questions": 4.0
163
  },
164
  {
165
  "Model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
166
  "Params": "46.7B",
167
- "Average": 3.039230769230769,
168
  "Sentiment": 3.0576923076923075,
169
  "Language understanding": 3.175,
170
  "Phraseology": 2.885,
171
- "Tricky questions": 4.0
172
  },
173
  {
174
  "Model": "mistralai/Mistral-7B-Instruct-v0.3",
175
  "Params": "7.25B",
176
- "Average": 3.022307692307692,
177
  "Sentiment": 3.326923076923077,
178
  "Language understanding": 3.06,
179
  "Phraseology": 2.68,
180
- "Tricky questions": 4.0
181
  },
182
  {
183
  "Model": "berkeley-nest/Starling-LM-7B-alpha",
184
  "Params": "7.24B",
185
- "Average": 2.945897435897436,
186
  "Sentiment": 3.0576923076923075,
187
  "Language understanding": 2.925,
188
  "Phraseology": 2.855,
189
- "Tricky questions": 4.0
190
  },
191
  {
192
  "Model": "openchat/openchat-3.5-0106",
193
  "Params": "7.24B",
194
- "Average": 2.8500854700854696,
195
  "Sentiment": 3.16025641025641,
196
  "Language understanding": 2.835,
197
  "Phraseology": 2.555,
198
- "Tricky questions": 4.0
199
  },
200
  {
201
  "Model": "internlm/internlm2-chat-20b",
202
  "Params": "19.9B",
203
- "Average": 2.8237606837606837,
204
  "Sentiment": 3.301282051282051,
205
  "Language understanding": 2.785,
206
  "Phraseology": 2.385,
207
- "Tricky questions": 4.0
208
  },
209
  {
210
  "Model": "01-ai/Yi-1.5-34B-Chat",
211
  "Params": "34.4B",
212
- "Average": 2.7756410256410255,
213
  "Sentiment": 3.076923076923077,
214
  "Language understanding": 2.87,
215
  "Phraseology": 2.38,
216
- "Tricky questions": 4.0
217
  },
218
  {
219
  "Model": "Voicelab/trurl-2-13b-academic",
220
  "Params": "13B",
221
- "Average": 2.74042735042735,
222
  "Sentiment": 3.301282051282051,
223
  "Language understanding": 2.755,
224
  "Phraseology": 2.165,
225
- "Tricky questions": 4.0
226
  },
227
  {
228
  "Model": "google/gemma-2-2b-it",
229
  "Params": "2.61B",
230
- "Average": 2.7974786324786325,
231
  "Sentiment": 3.3974358974359,
232
  "Language understanding": 2.9,
233
  "Phraseology": 2.095,
234
- "Tricky questions": 4.0
235
  },
236
  {
237
  "Model": "Qwen/Qwen2.5-3B-Instruct",
238
  "Params": "3.09B",
239
- "Average": 2.734572649572649,
240
  "Sentiment": 2.948717948717949,
241
  "Language understanding": 2.455,
242
  "Phraseology": 2.8,
243
- "Tricky questions": 4.0
244
  },
245
  {
246
  "Model": "NousResearch/Hermes-3-Llama-3.2-3B",
247
  "Params": "3.21B",
248
- "Average": 2.695128205128205,
249
  "Sentiment": 2.6153846153846154,
250
  "Language understanding": 2.705,
251
  "Phraseology": 2.765,
252
- "Tricky questions": 4.0
253
  },
254
  {
255
  "Model": "ibm-granite/granite-3.1-2b-instruct",
256
  "Params": "2.53B",
257
- "Average": 2.397307692307692,
258
  "Sentiment": 3.076923076923077,
259
  "Language understanding": 2.235,
260
  "Phraseology": 1.88,
261
- "Tricky questions": 4.0
262
  },
263
  {
264
  "Model": "meta-llama/Llama-3.2-1B-Instruct",
265
  "Params": "1.24B",
266
- "Average": 2.383974358974359,
267
  "Sentiment": 3.076923076923077,
268
  "Language understanding": 1.735,
269
  "Phraseology": 2.34,
270
- "Tricky questions": 4.0
271
  },
272
  {
273
  "Model": "microsoft/Phi-3.5-mini-instruct",
274
  "Params": "3.82B",
275
- "Average": 2.331965811965812,
276
  "Sentiment": 2.435897435897436,
277
  "Language understanding": 2.135,
278
  "Phraseology": 2.425,
279
- "Tricky questions": 4.0
280
  },
281
  {
282
  "Model": "meta-llama/Llama-3.2-3B-Instruct",
283
  "Params": "3.21B",
284
- "Average": 2.257136752136752,
285
  "Sentiment": 2.7564102564102564,
286
  "Language understanding": 2.295,
287
  "Phraseology": 1.72,
288
- "Tricky questions": 4.0
289
  },
290
  {
291
  "Model": "h2oai/h2o-danube2-1.8b-chat",
292
  "Params": "1.83B",
293
- "Average": 2.1455982905982904,
294
  "Sentiment": 2.371794871794872,
295
  "Language understanding": 1.595,
296
  "Phraseology": 2.47,
297
- "Tricky questions": 4.0
298
  },
299
  {
300
  "Model": "Qwen/Qwen2.5-1.5B-Instruct",
301
  "Params": "1.54B",
302
- "Average": 2.1232905982905983,
303
  "Sentiment": 2.7948717948717947,
304
  "Language understanding": 1.35,
305
  "Phraseology": 2.225,
306
- "Tricky questions": 4.0
307
  },
308
  {
309
  "Model": "utter-project/EuroLLM-1.7B-Instruct",
310
  "Params": "1.66B",
311
- "Average": 2.097863247863248,
312
  "Sentiment": 2.243589743589744,
313
  "Language understanding": 1.79,
314
  "Phraseology": 2.26,
315
- "Tricky questions": 4.0
316
  },
317
  {
318
  "Model": "LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct",
319
  "Params": "2.41B",
320
- "Average": 2.062846282695529,
321
  "Sentiment": 1.9423076923076923,
322
  "Language understanding": 2.1155778894472363,
323
  "Phraseology": 2.130653266331658,
324
- "Tricky questions": 4.0
325
  },
326
  {
327
  "Model": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
328
  "Params": "1.71B",
329
- "Average": 1.9102136752136751,
330
  "Sentiment": 2.275641025641025,
331
  "Language understanding": 1.1,
332
  "Phraseology": 2.355,
333
- "Tricky questions": 4.0
334
  },
335
  {
336
  "Model": "Qwen/Qwen2.5-0.5B-Instruct",
337
  "Params": "0.49B",
338
- "Average": 1.7950427350427354,
339
  "Sentiment": 1.955128205128205,
340
  "Language understanding": 0.835,
341
  "Phraseology": 2.595,
342
- "Tricky questions": 4.0
343
  },
344
  {
345
  "Model": "CYFRAGOVPL/Llama-PLLuM-70B-chat",
346
  "Params": "70.6B",
347
- "Average": 3.63,
348
  "Sentiment": 3.94,
349
  "Language understanding": 3.61,
350
  "Phraseology": 3.35,
351
- "Tricky questions": 4.0
352
  },
353
  {
354
  "Model": "CYFRAGOVPL/PLLuM-8x7B-nc-instruct",
355
  "Params": "46.7B",
356
- "Average": 3.56,
357
  "Sentiment": 3.88,
358
  "Language understanding": 3.59,
359
  "Phraseology": 3.22,
360
- "Tricky questions": 4.0
361
  },
362
  {
363
  "Model": "CYFRAGOVPL/Llama-PLLuM-70B-instruct",
364
  "Params": "70.6B",
365
- "Average": 3.56,
366
  "Sentiment": 3.78,
367
  "Language understanding": 3.63,
368
  "Phraseology": 3.26,
369
- "Tricky questions": 4.0
370
  },
371
  {
372
  "Model": "CYFRAGOVPL/PLLuM-8x7B-instruct",
373
  "Params": "46.7B",
374
- "Average": 3.5,
375
  "Sentiment": 3.59,
376
  "Language understanding": 3.47,
377
  "Phraseology": 3.46,
378
- "Tricky questions": 4.0
379
  },
380
  {
381
  "Model": "CYFRAGOVPL/PLLuM-12B-instruct",
382
  "Params": "12.2B",
383
- "Average": 3.49,
384
  "Sentiment": 3.71,
385
  "Language understanding": 3.17,
386
  "Phraseology": 3.59,
387
- "Tricky questions": 4.0
388
  },
389
  {
390
  "Model": "CYFRAGOVPL/PLLuM-8x7B-nc-chat",
391
  "Params": "46.7B",
392
- "Average": 3.44,
393
  "Sentiment": 3.76,
394
  "Language understanding": 3.48,
395
  "Phraseology": 3.08,
396
- "Tricky questions": 4.0
397
  },
398
  {
399
  "Model": "CYFRAGOVPL/PLLuM-8x7B-chat",
400
  "Params": "46.7B",
401
- "Average": 3.41,
402
  "Sentiment": 3.44,
403
  "Language understanding": 3.45,
404
  "Phraseology": 3.35,
405
- "Tricky questions": 4.0
406
  },
407
  {
408
  "Model": "CYFRAGOVPL/PLLuM-12B-chat",
409
  "Params": "12.2B",
410
- "Average": 3.32,
411
  "Sentiment": 3.32,
412
  "Language understanding": 3.21,
413
  "Phraseology": 3.43,
414
- "Tricky questions": 4.0
415
  },
416
  {
417
  "Model": "CYFRAGOVPL/PLLuM-12B-nc-instruct",
418
  "Params": "12.2B",
419
- "Average": 3.29,
420
  "Sentiment": 3.24,
421
  "Language understanding": 3.31,
422
  "Phraseology": 3.32,
423
- "Tricky questions": 4.0
424
  },
425
  {
426
  "Model": "CYFRAGOVPL/Llama-PLLuM-8B-instruct",
427
  "Params": "8.03B",
428
- "Average": 3.2,
429
  "Sentiment": 3.24,
430
- "Language understanding": 2.9,
431
  "Phraseology": 3.46,
432
- "Tricky questions": 4.0
433
  },
434
  {
435
  "Model": "CYFRAGOVPL/Llama-PLLuM-8B-chat",
436
  "Params": "8.03B",
437
- "Average": 3.14,
438
  "Sentiment": 3.13,
439
  "Language understanding": 2.93,
440
  "Phraseology": 3.36,
441
- "Tricky questions": 4.0
442
  },
443
  {
444
  "Model": "CYFRAGOVPL/PLLuM-12B-nc-chat",
445
  "Params": "12.2B",
446
- "Average": 3.33,
447
  "Sentiment": 3.22,
448
  "Language understanding": 3.23,
449
  "Phraseology": 3.54,
450
- "Tricky questions": 4.0
451
  },
452
  {
453
  "Model": "Qwen/Qwen2.5-72B-Instruct",
454
  "Params": "72.7B",
455
- "Average": 3.9923076923076923,
456
  "Sentiment": 4.076923076923077,
457
  "Language understanding": 3.97,
458
  "Phraseology": 3.93,
459
- "Tricky questions": 4.0
460
  },
461
  {
462
  "Model": "Qwen/Qwen2.5-32B-Instruct",
463
  "Params": "32.8B",
464
- "Average": 3.8047008547008545,
465
  "Sentiment": 3.8141025641025643,
466
  "Language understanding": 3.565,
467
  "Phraseology": 4.035,
468
- "Tricky questions": 4.0
469
  },
470
  {
471
  "Model": "mistralai/Mistral-Small-24B-Instruct-2501",
472
  "Params": "23.6B",
473
- "Average": 3.79508547008547,
474
  "Sentiment": 3.91025641025641,
475
  "Language understanding": 3.6,
476
  "Phraseology": 3.875,
477
- "Tricky questions": 4.0
478
  },
479
  {
480
  "Model": "meta-llama/Llama-3.3-70B-Instruct",
481
  "Params": "70.6B",
482
- "Average": 3.7332905982905977,
483
  "Sentiment": 4.294871794871795,
484
  "Language understanding": 3.865,
485
  "Phraseology": 3.04,
486
- "Tricky questions": 4.0
487
  },
488
  {
489
  "Model": "Qwen/Qwen2.5-14B-Instruct",
490
  "Params": "14.8B",
491
- "Average": 3.61508547008547,
492
  "Sentiment": 3.91025641025641,
493
  "Language understanding": 3.565,
494
  "Phraseology": 3.37,
495
- "Tricky questions": 4.0
496
  },
497
  {
498
  "Model": "microsoft/phi-4",
499
  "Params": "14.7B",
500
- "Average": 3.4976495726495727,
501
  "Sentiment": 3.717948717948718,
502
  "Language understanding": 3.54,
503
  "Phraseology": 3.235,
504
- "Tricky questions": 4.0
505
  },
506
  {
507
  "Model": "Qwen/Qwen2.5-7B-Instruct",
508
  "Params": "7.62B",
509
- "Average": 3.2258974358974357,
510
  "Sentiment": 3.5576923076923075,
511
  "Language understanding": 3.025,
512
  "Phraseology": 3.095,
513
- "Tricky questions": 4.0
514
  },
515
  {
516
  "Model": "microsoft/Phi-4-mini-instruct",
517
  "Params": "3.84B",
518
- "Average": 2.455769230769231,
519
  "Sentiment": 2.6923076923076925,
520
  "Language understanding": 2.43,
521
  "Phraseology": 2.245,
522
- "Tricky questions": 4.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
523
  }
524
- ]
 
2
  {
3
  "Model": "mistralai/Mistral-Large-Instruct-2407",
4
  "Params": "123B",
 
5
  "Sentiment": 4.230769230769231,
6
  "Language understanding": 4.0,
7
  "Phraseology": 3.86,
8
+ "Tricky questions": 3.9
9
  },
10
  {
11
  "Model": "alpindale/WizardLM-2-8x22B",
12
  "Params": "141B",
 
13
  "Sentiment": 3.7051282051282053,
14
  "Language understanding": 3.815,
15
  "Phraseology": 4.22,
16
+ "Tricky questions": 3.9
17
  },
18
  {
19
  "Model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
20
  "Params": "70.6B",
 
21
  "Sentiment": 4.326923076923077,
22
  "Language understanding": 3.91,
23
  "Phraseology": 3.25,
24
+ "Tricky questions": 3.9
25
  },
26
  {
27
  "Model": "meta-llama/Meta-Llama-3-70B-Instruct",
28
  "Params": "70.6B",
 
29
  "Sentiment": 4.134615384615385,
30
  "Language understanding": 3.82,
31
  "Phraseology": 3.465,
32
+ "Tricky questions": 3.9
33
  },
34
  {
35
  "Model": "speakleash/Bielik-11B-v2.3-Instruct",
36
  "Params": "11.2B",
 
37
  "Sentiment": 3.9743589743589745,
38
  "Language understanding": 3.785,
39
  "Phraseology": 3.55,
40
+ "Tricky questions": 3.9
41
  },
42
  {
43
  "Model": "mistralai/Mixtral-8x22B-Instruct-v0.1",
44
  "Params": "141B",
 
45
  "Sentiment": 3.782051282051282,
46
  "Language understanding": 3.675,
47
  "Phraseology": 3.55,
48
+ "Tricky questions": 3.9
49
  },
50
  {
51
  "Model": "speakleash/Bielik-11B-v2.1-Instruct",
52
  "Params": "11.2B",
 
53
  "Sentiment": 3.9551282051282053,
54
  "Language understanding": 3.915,
55
  "Phraseology": 3.105,
56
+ "Tricky questions": 3.9
57
  },
58
  {
59
  "Model": "Qwen/Qwen2-72B-Instruct",
60
  "Params": "72.7B",
 
61
  "Sentiment": 3.7628205128205128,
62
  "Language understanding": 3.89,
63
  "Phraseology": 3.28,
64
+ "Tricky questions": 3.9
65
  },
66
  {
67
  "Model": "speakleash/Bielik-11B-v2.0-Instruct",
68
  "Params": "11.2B",
 
69
  "Sentiment": 3.9743589743589745,
70
  "Language understanding": 3.745,
71
  "Phraseology": 3.125,
72
+ "Tricky questions": 3.9
73
  },
74
  {
75
  "Model": "speakleash/Bielik-11B-v2.2-Instruct",
76
  "Params": "11.2B",
 
77
  "Sentiment": 3.717948717948718,
78
  "Language understanding": 3.73,
79
  "Phraseology": 3.25,
80
+ "Tricky questions": 3.9
81
  },
82
  {
83
  "Model": "Qwen/Qwen1.5-72B-Chat",
84
  "Params": "72.3B",
 
85
  "Sentiment": 3.4743589743589745,
86
  "Language understanding": 3.515,
87
  "Phraseology": 2.975,
88
+ "Tricky questions": 3.9
89
  },
90
  {
91
  "Model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
92
  "Params": "8.03B",
 
93
  "Sentiment": 3.9743589743589745,
94
  "Language understanding": 3.38,
95
  "Phraseology": 2.58,
96
+ "Tricky questions": 3.9
97
  },
98
  {
99
  "Model": "THUDM/glm-4-9b-chat",
100
  "Params": "9.4B",
 
101
  "Sentiment": 3.58974358974359,
102
  "Language understanding": 3.455,
103
  "Phraseology": 2.78,
104
+ "Tricky questions": 3.9
105
  },
106
  {
107
  "Model": "mistralai/Mistral-Nemo-Instruct-2407",
108
  "Params": "12.2B",
 
109
  "Sentiment": 3.641025641025641,
110
  "Language understanding": 3.29,
111
  "Phraseology": 2.74,
112
+ "Tricky questions": 3.9
113
  },
114
  {
115
  "Model": "meta-llama/Meta-Llama-3-8B-Instruct",
116
  "Params": "8.03B",
 
117
  "Sentiment": 3.3333333333333335,
118
  "Language understanding": 3.15,
119
  "Phraseology": 3.035,
120
+ "Tricky questions": 3.9
121
  },
122
  {
123
  "Model": "upstage/SOLAR-10.7B-Instruct-v1.0",
124
  "Params": "10.7B",
 
125
  "Sentiment": 2.967948717948718,
126
  "Language understanding": 3.18,
127
  "Phraseology": 3.255,
128
+ "Tricky questions": 3.9
129
  },
130
  {
131
  "Model": "speakleash/Bielik-7B-Instruct-v0.1",
132
  "Params": "7.24B",
 
133
  "Sentiment": 3.58974358974359,
134
  "Language understanding": 3.475,
135
  "Phraseology": 2.315,
136
+ "Tricky questions": 3.9
137
  },
138
  {
139
  "Model": "openchat/openchat-3.5-0106-gemma",
140
  "Params": "8.54B",
 
141
  "Sentiment": 3.730769230769231,
142
  "Language understanding": 3.08,
143
  "Phraseology": 2.445,
144
+ "Tricky questions": 3.9
145
  },
146
  {
147
  "Model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
148
  "Params": "46.7B",
 
149
  "Sentiment": 3.0576923076923075,
150
  "Language understanding": 3.175,
151
  "Phraseology": 2.885,
152
+ "Tricky questions": 3.9
153
  },
154
  {
155
  "Model": "mistralai/Mistral-7B-Instruct-v0.3",
156
  "Params": "7.25B",
 
157
  "Sentiment": 3.326923076923077,
158
  "Language understanding": 3.06,
159
  "Phraseology": 2.68,
160
+ "Tricky questions": 3.9
161
  },
162
  {
163
  "Model": "berkeley-nest/Starling-LM-7B-alpha",
164
  "Params": "7.24B",
 
165
  "Sentiment": 3.0576923076923075,
166
  "Language understanding": 2.925,
167
  "Phraseology": 2.855,
168
+ "Tricky questions": 3.9
169
  },
170
  {
171
  "Model": "openchat/openchat-3.5-0106",
172
  "Params": "7.24B",
 
173
  "Sentiment": 3.16025641025641,
174
  "Language understanding": 2.835,
175
  "Phraseology": 2.555,
176
+ "Tricky questions": 3.9
177
  },
178
  {
179
  "Model": "internlm/internlm2-chat-20b",
180
  "Params": "19.9B",
 
181
  "Sentiment": 3.301282051282051,
182
  "Language understanding": 2.785,
183
  "Phraseology": 2.385,
184
+ "Tricky questions": 3.9
185
  },
186
  {
187
  "Model": "01-ai/Yi-1.5-34B-Chat",
188
  "Params": "34.4B",
 
189
  "Sentiment": 3.076923076923077,
190
  "Language understanding": 2.87,
191
  "Phraseology": 2.38,
192
+ "Tricky questions": 3.9
193
  },
194
  {
195
  "Model": "Voicelab/trurl-2-13b-academic",
196
  "Params": "13B",
 
197
  "Sentiment": 3.301282051282051,
198
  "Language understanding": 2.755,
199
  "Phraseology": 2.165,
200
+ "Tricky questions": 3.9
201
  },
202
  {
203
  "Model": "google/gemma-2-2b-it",
204
  "Params": "2.61B",
 
205
  "Sentiment": 3.3974358974359,
206
  "Language understanding": 2.9,
207
  "Phraseology": 2.095,
208
+ "Tricky questions": 3.9
209
  },
210
  {
211
  "Model": "Qwen/Qwen2.5-3B-Instruct",
212
  "Params": "3.09B",
 
213
  "Sentiment": 2.948717948717949,
214
  "Language understanding": 2.455,
215
  "Phraseology": 2.8,
216
+ "Tricky questions": 3.9
217
  },
218
  {
219
  "Model": "NousResearch/Hermes-3-Llama-3.2-3B",
220
  "Params": "3.21B",
 
221
  "Sentiment": 2.6153846153846154,
222
  "Language understanding": 2.705,
223
  "Phraseology": 2.765,
224
+ "Tricky questions": 3.9
225
  },
226
  {
227
  "Model": "ibm-granite/granite-3.1-2b-instruct",
228
  "Params": "2.53B",
 
229
  "Sentiment": 3.076923076923077,
230
  "Language understanding": 2.235,
231
  "Phraseology": 1.88,
232
+ "Tricky questions": 3.9
233
  },
234
  {
235
  "Model": "meta-llama/Llama-3.2-1B-Instruct",
236
  "Params": "1.24B",
 
237
  "Sentiment": 3.076923076923077,
238
  "Language understanding": 1.735,
239
  "Phraseology": 2.34,
240
+ "Tricky questions": 3.9
241
  },
242
  {
243
  "Model": "microsoft/Phi-3.5-mini-instruct",
244
  "Params": "3.82B",
 
245
  "Sentiment": 2.435897435897436,
246
  "Language understanding": 2.135,
247
  "Phraseology": 2.425,
248
+ "Tricky questions": 3.9
249
  },
250
  {
251
  "Model": "meta-llama/Llama-3.2-3B-Instruct",
252
  "Params": "3.21B",
 
253
  "Sentiment": 2.7564102564102564,
254
  "Language understanding": 2.295,
255
  "Phraseology": 1.72,
256
+ "Tricky questions": 3.9
257
  },
258
  {
259
  "Model": "h2oai/h2o-danube2-1.8b-chat",
260
  "Params": "1.83B",
 
261
  "Sentiment": 2.371794871794872,
262
  "Language understanding": 1.595,
263
  "Phraseology": 2.47,
264
+ "Tricky questions": 3.9
265
  },
266
  {
267
  "Model": "Qwen/Qwen2.5-1.5B-Instruct",
268
  "Params": "1.54B",
 
269
  "Sentiment": 2.7948717948717947,
270
  "Language understanding": 1.35,
271
  "Phraseology": 2.225,
272
+ "Tricky questions": 3.9
273
  },
274
  {
275
  "Model": "utter-project/EuroLLM-1.7B-Instruct",
276
  "Params": "1.66B",
 
277
  "Sentiment": 2.243589743589744,
278
  "Language understanding": 1.79,
279
  "Phraseology": 2.26,
280
+ "Tricky questions": 3.9
281
  },
282
  {
283
  "Model": "LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct",
284
  "Params": "2.41B",
 
285
  "Sentiment": 1.9423076923076923,
286
  "Language understanding": 2.1155778894472363,
287
  "Phraseology": 2.130653266331658,
288
+ "Tricky questions": 3.9
289
  },
290
  {
291
  "Model": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
292
  "Params": "1.71B",
 
293
  "Sentiment": 2.275641025641025,
294
  "Language understanding": 1.1,
295
  "Phraseology": 2.355,
296
+ "Tricky questions": 3.9
297
  },
298
  {
299
  "Model": "Qwen/Qwen2.5-0.5B-Instruct",
300
  "Params": "0.49B",
 
301
  "Sentiment": 1.955128205128205,
302
  "Language understanding": 0.835,
303
  "Phraseology": 2.595,
304
+ "Tricky questions": 3.9
305
  },
306
  {
307
  "Model": "CYFRAGOVPL/Llama-PLLuM-70B-chat",
308
  "Params": "70.6B",
 
309
  "Sentiment": 3.94,
310
  "Language understanding": 3.61,
311
  "Phraseology": 3.35,
312
+ "Tricky questions": 3.9
313
  },
314
  {
315
  "Model": "CYFRAGOVPL/PLLuM-8x7B-nc-instruct",
316
  "Params": "46.7B",
 
317
  "Sentiment": 3.88,
318
  "Language understanding": 3.59,
319
  "Phraseology": 3.22,
320
+ "Tricky questions": 3.9
321
  },
322
  {
323
  "Model": "CYFRAGOVPL/Llama-PLLuM-70B-instruct",
324
  "Params": "70.6B",
 
325
  "Sentiment": 3.78,
326
  "Language understanding": 3.63,
327
  "Phraseology": 3.26,
328
+ "Tricky questions": 3.9
329
  },
330
  {
331
  "Model": "CYFRAGOVPL/PLLuM-8x7B-instruct",
332
  "Params": "46.7B",
 
333
  "Sentiment": 3.59,
334
  "Language understanding": 3.47,
335
  "Phraseology": 3.46,
336
+ "Tricky questions": 3.9
337
  },
338
  {
339
  "Model": "CYFRAGOVPL/PLLuM-12B-instruct",
340
  "Params": "12.2B",
 
341
  "Sentiment": 3.71,
342
  "Language understanding": 3.17,
343
  "Phraseology": 3.59,
344
+ "Tricky questions": 3.9
345
  },
346
  {
347
  "Model": "CYFRAGOVPL/PLLuM-8x7B-nc-chat",
348
  "Params": "46.7B",
 
349
  "Sentiment": 3.76,
350
  "Language understanding": 3.48,
351
  "Phraseology": 3.08,
352
+ "Tricky questions": 3.9
353
  },
354
  {
355
  "Model": "CYFRAGOVPL/PLLuM-8x7B-chat",
356
  "Params": "46.7B",
 
357
  "Sentiment": 3.44,
358
  "Language understanding": 3.45,
359
  "Phraseology": 3.35,
360
+ "Tricky questions": 3.9
361
  },
362
  {
363
  "Model": "CYFRAGOVPL/PLLuM-12B-chat",
364
  "Params": "12.2B",
 
365
  "Sentiment": 3.32,
366
  "Language understanding": 3.21,
367
  "Phraseology": 3.43,
368
+ "Tricky questions": 3.9
369
  },
370
  {
371
  "Model": "CYFRAGOVPL/PLLuM-12B-nc-instruct",
372
  "Params": "12.2B",
 
373
  "Sentiment": 3.24,
374
  "Language understanding": 3.31,
375
  "Phraseology": 3.32,
376
+ "Tricky questions": 3.9
377
  },
378
  {
379
  "Model": "CYFRAGOVPL/Llama-PLLuM-8B-instruct",
380
  "Params": "8.03B",
 
381
  "Sentiment": 3.24,
382
+ "Language understanding": 2.90,
383
  "Phraseology": 3.46,
384
+ "Tricky questions": 3.9
385
  },
386
  {
387
  "Model": "CYFRAGOVPL/Llama-PLLuM-8B-chat",
388
  "Params": "8.03B",
 
389
  "Sentiment": 3.13,
390
  "Language understanding": 2.93,
391
  "Phraseology": 3.36,
392
+ "Tricky questions": 3.9
393
  },
394
  {
395
  "Model": "CYFRAGOVPL/PLLuM-12B-nc-chat",
396
  "Params": "12.2B",
 
397
  "Sentiment": 3.22,
398
  "Language understanding": 3.23,
399
  "Phraseology": 3.54,
400
+ "Tricky questions": 3.9
401
  },
402
  {
403
  "Model": "Qwen/Qwen2.5-72B-Instruct",
404
  "Params": "72.7B",
 
405
  "Sentiment": 4.076923076923077,
406
  "Language understanding": 3.97,
407
  "Phraseology": 3.93,
408
+ "Tricky questions": 3.9
409
  },
410
  {
411
  "Model": "Qwen/Qwen2.5-32B-Instruct",
412
  "Params": "32.8B",
 
413
  "Sentiment": 3.8141025641025643,
414
  "Language understanding": 3.565,
415
  "Phraseology": 4.035,
416
+ "Tricky questions": 3.9
417
  },
418
  {
419
  "Model": "mistralai/Mistral-Small-24B-Instruct-2501",
420
  "Params": "23.6B",
 
421
  "Sentiment": 3.91025641025641,
422
  "Language understanding": 3.6,
423
  "Phraseology": 3.875,
424
+ "Tricky questions": 3.9
425
  },
426
  {
427
  "Model": "meta-llama/Llama-3.3-70B-Instruct",
428
  "Params": "70.6B",
 
429
  "Sentiment": 4.294871794871795,
430
  "Language understanding": 3.865,
431
  "Phraseology": 3.04,
432
+ "Tricky questions": 3.9
433
  },
434
  {
435
  "Model": "Qwen/Qwen2.5-14B-Instruct",
436
  "Params": "14.8B",
 
437
  "Sentiment": 3.91025641025641,
438
  "Language understanding": 3.565,
439
  "Phraseology": 3.37,
440
+ "Tricky questions": 3.9
441
  },
442
  {
443
  "Model": "microsoft/phi-4",
444
  "Params": "14.7B",
 
445
  "Sentiment": 3.717948717948718,
446
  "Language understanding": 3.54,
447
  "Phraseology": 3.235,
448
+ "Tricky questions": 3.9
449
  },
450
  {
451
  "Model": "Qwen/Qwen2.5-7B-Instruct",
452
  "Params": "7.62B",
 
453
  "Sentiment": 3.5576923076923075,
454
  "Language understanding": 3.025,
455
  "Phraseology": 3.095,
456
+ "Tricky questions": 3.9
457
  },
458
  {
459
  "Model": "microsoft/Phi-4-mini-instruct",
460
  "Params": "3.84B",
 
461
  "Sentiment": 2.6923076923076925,
462
  "Language understanding": 2.43,
463
  "Phraseology": 2.245,
464
+ "Tricky questions": 3.9
465
+ },
466
+ {
467
+ "Model": "gemini-2.0-flash-001",
468
+ "Params": "",
469
+ "Sentiment": 4.519230769230769,
470
+ "Language understanding": 4.32,
471
+ "Phraseology": 4.34,
472
+ "Tricky questions": 3.9
473
+ },
474
+ {
475
+ "Model": "gemini-2.0-flash-lite-001",
476
+ "Params": "",
477
+ "Sentiment": 4.230769230769231,
478
+ "Language understanding": 4.055,
479
+ "Phraseology": 4.235,
480
+ "Tricky questions": 3.9
481
+ },
482
+ {
483
+ "Model": "deepseek-ai/DeepSeek-V3 (API)",
484
+ "Params": "685B",
485
+ "Sentiment": 4.358974358974359,
486
+ "Language understanding": 4.22,
487
+ "Phraseology": 3.525,
488
+ "Tricky questions": 3.9
489
+ },
490
+ {
491
+ "Model": "google/gemma-3-27b-it (API)",
492
+ "Params": "27.4B",
493
+ "Sentiment": 3.878205128205128,
494
+ "Language understanding": 3.785,
495
+ "Phraseology": 4.025,
496
+ "Tricky questions": 3.9
497
  }
498
+ ]