import json
import streamlit as st
import pandas as pd
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from st_social_media_links import SocialMediaIcons


PARAMS_COLUMN_NAME = "Params"
RESULTS_COLUMN_NAME = "Results"
AVERAGE_COLUMN_NAME = "Average"
SENTIMENT_COLUMN_NAME = "Sentiment"
UNDERSTANDING_COLUMN_NAME = "Language understanding"
PHRASEOLOGY_COLUMN_NAME = "Phraseology"
TRICKY_QUESTIONS_COLUMN_NAME = "Tricky questions"
IMPLICATURES_AVERAGE_COLUMN_NAME = "Implicatures average"

# Function to load data from JSON file


@st.cache_data
def load_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    df = pd.DataFrame(data)
    df[AVERAGE_COLUMN_NAME] = df[[SENTIMENT_COLUMN_NAME, UNDERSTANDING_COLUMN_NAME,
                                  PHRASEOLOGY_COLUMN_NAME, TRICKY_QUESTIONS_COLUMN_NAME]].mean(axis=1)

    df[IMPLICATURES_AVERAGE_COLUMN_NAME] = df[[SENTIMENT_COLUMN_NAME, UNDERSTANDING_COLUMN_NAME,
                                               PHRASEOLOGY_COLUMN_NAME]].mean(axis=1)
    return df
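

# Illustrative only (not used by the app): roughly how a single record in
# data.json is expected to look, inferred from the column constants above and
# from how the DataFrame is used below. The model name and scores are
# placeholders, not real benchmark results.
_EXAMPLE_RECORD = {
    "Model": "provider/model-name",
    "Params": "7B",
    SENTIMENT_COLUMN_NAME: 4.0,
    UNDERSTANDING_COLUMN_NAME: 3.5,
    PHRASEOLOGY_COLUMN_NAME: 3.0,
    TRICKY_QUESTIONS_COLUMN_NAME: 2.5,
}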

# Function to style the DataFrame


@st.cache_data
def style_dataframe(df: pd.DataFrame):
    df[RESULTS_COLUMN_NAME] = df.apply(lambda row: [
                                       row[SENTIMENT_COLUMN_NAME], row[UNDERSTANDING_COLUMN_NAME], row[PHRASEOLOGY_COLUMN_NAME], row[TRICKY_QUESTIONS_COLUMN_NAME]], axis=1)
    cols = list(df.columns)

    # move average column
    cols.insert(cols.index(PARAMS_COLUMN_NAME) + 1,
                cols.pop(cols.index(AVERAGE_COLUMN_NAME)))

    # move implicatures average column
    cols.insert(cols.index(AVERAGE_COLUMN_NAME) + 1,
                cols.pop(cols.index(IMPLICATURES_AVERAGE_COLUMN_NAME)))

    # move results column
    cols.insert(cols.index(IMPLICATURES_AVERAGE_COLUMN_NAME) + 1,
                cols.pop(cols.index(RESULTS_COLUMN_NAME)))
    # Apply the new column order
    df = df[cols]
    return df


def styler(df: pd.DataFrame):
    score_columns = [AVERAGE_COLUMN_NAME, SENTIMENT_COLUMN_NAME, PHRASEOLOGY_COLUMN_NAME,
                     UNDERSTANDING_COLUMN_NAME, TRICKY_QUESTIONS_COLUMN_NAME,
                     IMPLICATURES_AVERAGE_COLUMN_NAME]
    # Color ramp for the score columns (higher = greener)
    palette = sns.color_palette("RdYlGn", as_cmap=True)
    # Reversed ramp for the "Params" column (smaller models = greener)
    params_palette = sns.color_palette("RdYlGn_r", as_cmap=True)
    styled_df = (
        df.style
        .background_gradient(cmap=palette, subset=score_columns)
        .background_gradient(cmap=params_palette, subset=["Params"])
        .set_properties(**{'text-align': 'center'}, subset=score_columns)
        .format("{:.2f}".center(10), subset=score_columns)
        .format("{:.1f}".center(10), subset=["Params"])
    )
    return styled_df


# Streamlit app
st.set_page_config(layout="wide")

st.markdown("""
        <style>
               .block-container {
                    padding-top: 0%;
                    padding-bottom: 0%;
                    padding-left: 3%;
                    padding-right: 3%;
                    scrollbar-width: thin;
                }
        </style>
        """, unsafe_allow_html=True)

# Prepare layout

st.markdown("""
<style>
    .center {
        display: block;
        margin-left: auto;
        margin-right: auto;
        width: 50%;
    }
    .center-text {
        text-align: center;
    }
    a:link {color:#FDA428;}      /* unvisited link */
    a:hover {color:#FDA428;}   /* Mouse over link */
    a:visited {color:#FDA428;}  /* visited link */
    a:active {color:#FDA428;}  /* selected link */
</style>
""", unsafe_allow_html=True)

# --- Colors info ---
# Primary Color: #FDA428
# Secondary Color: #A85E00
# Grey Color: #7B7B7B
# Background Color: #1C1C1C
# {'LOW': '#7B7B7B', 'MEDIUM': '#A85E00', 'HIGH': '#FDA428'}
# ----------------------------------------------------------
st.markdown("""<br>""", unsafe_allow_html=True)

# Row: 1 --> Title + links to SpeakLeash.org website / GitHub / X (Twitter)
social_media_links = [
    "https://discord.com/invite/ZJwCMrxwT7",
    "https://github.com/speakleash",
    "https://x.com/Speak_Leash",
    "https://www.linkedin.com/company/speakleash/",
    "https://www.facebook.com/Speakleash/"
]

light_orange = "#FDA428"
dark_orange = "#A85E00"
white_color = "#FFFFFF"
black_color = "#000000"
links_color = light_orange

social_media_links_colors = [
    links_color,
    links_color,
    links_color,
    links_color,
    links_color
]

social_media_icons = SocialMediaIcons(
    social_media_links, social_media_links_colors)
social_media_icons.render(justify_content='right')

st.markdown("""
    <hr style="margin: 0.5em 0;">
""", unsafe_allow_html=True)

st.markdown("""
    <img src="https://speakleash.org/wp-content/uploads/2023/09/SpeakLeash_logo.svg" alt="SpeakLeash Logo">
""", unsafe_allow_html=True)

# Add logo, title, and subheader in a flexible container with equal spacing
st.markdown("""
    <div class="header-container">
        <br><br>
        <div class="title-container">
            <h1 style='color: #FDA428; margin-top: -1rem; font-size: 3.1em;'>CPTUB - Complex Polish Text Understanding Benchmark</h1>
            <h3 style="margin-top: 0;">Understanding of Polish text, sentiment and phraseological compounds</h3>
        </div>
    </div>
    """, unsafe_allow_html=True)

# Create tabs
tab1, tab2 = st.tabs([RESULTS_COLUMN_NAME, "Description"])

with tab1:
    st.write("This benchmark is designed to evaluate the ability of language models to correctly interpret complex Polish texts, including sarcasm, phraseological compounds, and implicatures. Models are assessed not only on traditional sentiment analysis but also on their ability to understand and interpret more complex language forms. The focus is on how well models can uncover the intended meaning in texts that require going beyond literal word meanings to recognize deeper, context-dependent interpretations.")

    # Prepare data
    data = load_data('data.json')

    data['Params'] = pd.to_numeric(
        data['Params'].str.replace('B', ''),
        errors='coerce'
    )
    data = data.sort_values(by=AVERAGE_COLUMN_NAME, ascending=False)

    # Collapse the filters inside an expander
    with st.expander("Filtering benchmark data", icon='🔍'):
        # Filtering data, e.g. slider for params, average score, etc.
        col_filter_params, col_filter_average, col_filter_implicatures_average, col_filter_sentiment, col_filter_understanding, col_filter_phraseology, col_filter_tricky_questions = st.columns(
            7, gap='medium')

        with col_filter_params:
            max_params = data['Params'].max(skipna=True)
            if pd.isna(max_params):
                max_params = 0.0

            params_slider = st.slider(
                "Models Size [B]",
                min_value=0.0,
                max_value=float(max_params),
                value=(0.0, float(max_params)),
                step=0.1,
                format="%.1f"
            )
            data = data[
                data['Params'].isna() |
                (
                    (data['Params'] >= params_slider[0]) &
                    (data['Params'] <= params_slider[1])
                )
            ]

        with col_filter_average:
            average_slider = st.slider(
                "Average score", step=0.1, min_value=0.0, max_value=5.0, value=(0.0, 5.0))
            data = data[(data[AVERAGE_COLUMN_NAME] >= average_slider[0]) & (
                data[AVERAGE_COLUMN_NAME] <= average_slider[1])]

        with col_filter_implicatures_average:
            implicatures_average_slider = st.slider(
                "Implicatures average", step=0.1, min_value=0.0, max_value=5.0, value=(0.0, 5.0))
            data = data[(data[IMPLICATURES_AVERAGE_COLUMN_NAME] >= implicatures_average_slider[0]) & (
                data[IMPLICATURES_AVERAGE_COLUMN_NAME] <= implicatures_average_slider[1])]

        with col_filter_sentiment:
            sentiment_slider = st.slider(
                "Sentiment score", step=0.1, min_value=0.0, max_value=5.0, value=(0.0, 5.0))
            data = data[(data[SENTIMENT_COLUMN_NAME] >= sentiment_slider[0]) & (
                data[SENTIMENT_COLUMN_NAME] <= sentiment_slider[1])]

        with col_filter_understanding:
            understanding_slider = st.slider(
                "Understanding score", step=0.1, min_value=0.0, max_value=5.0, value=(0.0, 5.0))
            data = data[(data[UNDERSTANDING_COLUMN_NAME] >= understanding_slider[0]) & (
                data[UNDERSTANDING_COLUMN_NAME] <= understanding_slider[1])]

        with col_filter_phraseology:
            phraseology_slider = st.slider(
                "Phraseology score", step=0.1, min_value=0.0, max_value=5.0, value=(0.0, 5.0))
            data = data[(data[PHRASEOLOGY_COLUMN_NAME] >= phraseology_slider[0]) & (
                data[PHRASEOLOGY_COLUMN_NAME] <= phraseology_slider[1])]

        with col_filter_tricky_questions:
            tricky_questions_slider = st.slider(
                "Tricky questions score", step=0.1, min_value=0.0, max_value=5.0, value=(0.0, 5.0))
            data = data[(data[TRICKY_QUESTIONS_COLUMN_NAME] >= tricky_questions_slider[0]) & (
                data[TRICKY_QUESTIONS_COLUMN_NAME] <= tricky_questions_slider[1])]

        # Extract unique provider names from the "Model" column
        providers = data["Model"].apply(lambda x: x.split('/')[0].lower()).unique()
        selected_providers = st.multiselect("Model providers", providers, default=providers)
        # Filter data based on selected providers
        data = data[data["Model"].apply(lambda x: x.split('/')[0].lower()).isin(selected_providers)]

    
    # Define all possible columns
    all_columns = {
        "Model": "Model",
        "Params": "Params",
        AVERAGE_COLUMN_NAME: "Average",
        IMPLICATURES_AVERAGE_COLUMN_NAME: "Impl. Avg",
        SENTIMENT_COLUMN_NAME: "Sentiment",
        UNDERSTANDING_COLUMN_NAME: "Understanding",
        PHRASEOLOGY_COLUMN_NAME: "Phraseology",
        TRICKY_QUESTIONS_COLUMN_NAME: "Tricky Questions"
    }
    
    # By default, all columns are selected
    default_columns = list(all_columns.keys())
    
    # Use pills to select visible columns in multi-selection mode
    selected_column_labels = st.pills(
        label="Visible columns",
        options=list(all_columns.values()),
        default=list(all_columns.values()),  # Set all columns as default
        selection_mode="multi",  # Enable multi-selection mode
        key="visible_columns_pills"
    )
    
    # Map selected labels back to column names
    reverse_mapping = {v: k for k, v in all_columns.items()}
    selected_columns = [reverse_mapping[label] for label in selected_column_labels]
    
    # If nothing is selected, show all columns
    if not selected_columns:
        selected_columns = default_columns

    # Display data
    styled_df_show = style_dataframe(data)
    styled_df_show = styler(styled_df_show)

    # Customize column_config based on selected columns
    column_config = {}
    
    # Set configuration for all columns
    if "Model" in styled_df_show.columns:
        column_config["Model"] = st.column_config.TextColumn("Model", help="Model name", width="large") if "Model" in selected_columns else None
    
    if "Params" in styled_df_show.columns:
        column_config["Params"] = st.column_config.NumberColumn("Params [B]") if "Params" in selected_columns else None
    
    if AVERAGE_COLUMN_NAME in styled_df_show.columns:
        column_config[AVERAGE_COLUMN_NAME] = st.column_config.NumberColumn(AVERAGE_COLUMN_NAME) if AVERAGE_COLUMN_NAME in selected_columns else None
    
    if IMPLICATURES_AVERAGE_COLUMN_NAME in styled_df_show.columns:
        column_config[IMPLICATURES_AVERAGE_COLUMN_NAME] = st.column_config.NumberColumn(IMPLICATURES_AVERAGE_COLUMN_NAME) if IMPLICATURES_AVERAGE_COLUMN_NAME in selected_columns else None
    
    if RESULTS_COLUMN_NAME in styled_df_show.columns:
        # Show Results only if Average is selected
        column_config[RESULTS_COLUMN_NAME] = st.column_config.BarChartColumn(
            "Bar chart of results", help="Summary of the results of each task",
            y_min=0, y_max=5) if AVERAGE_COLUMN_NAME in selected_columns else None
    
    if SENTIMENT_COLUMN_NAME in styled_df_show.columns:
        column_config[SENTIMENT_COLUMN_NAME] = st.column_config.NumberColumn(SENTIMENT_COLUMN_NAME, help='Ability to analyze sentiment') if SENTIMENT_COLUMN_NAME in selected_columns else None
    
    if UNDERSTANDING_COLUMN_NAME in styled_df_show.columns:
        column_config[UNDERSTANDING_COLUMN_NAME] = st.column_config.NumberColumn(UNDERSTANDING_COLUMN_NAME, help='Ability to understand language') if UNDERSTANDING_COLUMN_NAME in selected_columns else None
    
    if PHRASEOLOGY_COLUMN_NAME in styled_df_show.columns:
        column_config[PHRASEOLOGY_COLUMN_NAME] = st.column_config.NumberColumn(PHRASEOLOGY_COLUMN_NAME, help='Ability to understand phraseological compounds') if PHRASEOLOGY_COLUMN_NAME in selected_columns else None
    
    if TRICKY_QUESTIONS_COLUMN_NAME in styled_df_show.columns:
        column_config[TRICKY_QUESTIONS_COLUMN_NAME] = st.column_config.NumberColumn(TRICKY_QUESTIONS_COLUMN_NAME, help='Ability to understand tricky questions') if TRICKY_QUESTIONS_COLUMN_NAME in selected_columns else None
    
    st.data_editor(styled_df_show, column_config=column_config, hide_index=True, disabled=True, height=500)

    # Add selection for models and create a bar chart for selected models using the AVERAGE_COLUMN_NAME, SENTIMENT_COLUMN_NAME, PHRASEOLOGY_COLUMN_NAME, UNDERSTANDING_COLUMN_NAME
    # Add default selection of 3 best models from AVERAGE_COLUMN_NAME and 1 best model with "Bielik" in Model column
    default_models = list(data.sort_values(
        AVERAGE_COLUMN_NAME, ascending=False)['Model'].head(3))
    bielik_models = data[data['Model'].str.contains('Bielik')].sort_values(
        AVERAGE_COLUMN_NAME, ascending=False)['Model']
    # Guard against an empty result when the filters exclude every Bielik model
    if not bielik_models.empty and bielik_models.iloc[0] not in default_models:
        default_models.append(bielik_models.iloc[0])
    selected_models = st.multiselect(
        "Select models to compare", data["Model"].unique(), default=default_models)
    selected_data = data[data["Model"].isin(selected_models)]
    categories = [AVERAGE_COLUMN_NAME, SENTIMENT_COLUMN_NAME,
                  PHRASEOLOGY_COLUMN_NAME, UNDERSTANDING_COLUMN_NAME, TRICKY_QUESTIONS_COLUMN_NAME]

    if selected_models:
        # Color palettes to choose from:
        # colors = px.colors.sample_colorscale("viridis", len(selected_models)+1)
        colors = px.colors.qualitative.G10[:len(selected_models)]

        fig_bars = go.Figure()
        for model, color in zip(selected_models, colors):
            values = selected_data[selected_data['Model'] ==
                                   model][categories].values.flatten().tolist()
            fig_bars.add_trace(go.Bar(
                x=categories,
                y=values,
                name=model,
                marker_color=color
            ))

        # Update layout to use a custom color scale
        fig_bars.update_layout(
            showlegend=True,
            legend=dict(orientation="h", yanchor="top",
                        y=-0.3, xanchor="center", x=0.5),
            title="Comparison of Selected Models",
            yaxis_title="Score",
            template="plotly_dark"
        )
        fig_bars.update_yaxes(range=[0, 5.1])
        st.plotly_chart(fig_bars)


# Tab 2 --> Description
with tab2:
    st.markdown("""
### <span style='text-decoration: #FDA428 wavy underline;'>**Cause of Creation**</span>
1. **Need**: Models face significant challenges when interpreting complex, context-reliant texts whose meaning goes beyond the literal content of a statement. Such cases include sarcasm, implicatures, and phraseological compounds.
Traditional sentiment classifiers typically rely on word-based features (e.g., identifying positive or negative words) to assess sentiment. However, with sarcasm, the literal meaning of words often contradicts the intended sentiment, making it difficult for models to accurately gauge tone. Sarcasm's context-dependence further complicates matters, as these classifiers typically lack the ability to grasp nuanced cues in context, especially when sarcasm is subtle.
Similarly, classifiers struggle with implicatures, where the underlying intent is implied rather than explicitly stated. Here, models fail to capture the full sentiment because they rely heavily on surface-level words, missing the non-literal meaning that often drives the sentiment.
Phraseological compounds add another layer of difficulty. These are fixed or semi-fixed expressions whose meanings cannot be directly inferred from the individual words. Language models, trained on word-level patterns, often misinterpret these expressions because they fail to recognize the idiomatic or non-literal meaning, leading to inaccurate sentiment analysis.
In addition to sentiment analysis, we decided to include the understanding of more complex texts in the benchmark, which was measured by the ability to uncover the intended meaning.
### <span style='text-decoration: #FDA428 wavy underline;'>**Dataset Information**</span>
The dataset contains 200 examples, all written in Polish. Each example consists of the following (a schematic record is sketched after this list):
- **Main Text**: This is a statement (often an opinion) on any topic that includes a certain type of implicature, often several simultaneously, such as sarcasm or phraseological compounds.
- **Reference Sentiment**: The sentiment associated with the main text. We use three categories: negative, neutral, and positive. Ambiguous examples were labeled as "neutral" to exclude them from sentiment classification testing.
- **Reference phraseological compounds**: A list of phraseological compounds found in the main text.
- **Reference Explanation**: An explanation of the underlying intentions that the author of the main text might have had.
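
For illustration only, a single record can be pictured as the Python dictionary below. The field names are schematic placeholders chosen for this description, not the dataset's actual keys, and the values are stand-ins:
```python
example_record = {
    "text": "<a Polish statement containing, e.g., sarcasm or a phraseological compound>",
    "reference_sentiment": "negative",  # one of: negative / neutral / positive
    "reference_phraseological_compounds": ["<compound 1>", "<compound 2>"],
    "reference_explanation": "<description of the author's likely underlying intention>",
}
```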
### <span style='text-decoration: #FDA428 wavy underline;'>**Evaluation Procedure**</span>
We distinguish between two models in the evaluation process:
- **Evaluated Model**: The model that performs the specified tasks; it is then assessed on its performance and added to the ranking.
- **Judge Metamodel**: One of the currently strongest, most versatile LLMs.
### <span style='text-decoration: #FDA428 wavy underline;'>**GENERATING RESPONSES FROM THE EVALUATED MODEL**</span>
1. For each text in the dataset, the evaluated model was required to list the following in three points:
    - The sentiment (only positive/negative).
    - The underlying intentions of the author of the text.
    - All phraseological compounds present in the text along with their meanings in the given context.
2. No system prompt is used. The prompt provided to the evaluated model is written in Polish, as we are testing the models in this language. It contains:
    - **User Prompt**: 3 elements, each consisting of a header written in capital letters and content enclosed in triple quotes:
        - Information about the role of a careful linguist with extensive experience.
        - The instruction to perform the three previously described tasks.
        - The first example of a text that could be included in the dataset.
    - **Assistant Prompt**: A human-written example answer for the first example text.
    - **User Prompt**: A second example of a text that could be included in the dataset.
    - **Assistant Prompt**: A human-written example answer for the second example text.
    - **User Prompt**: The target text, based on which the evaluated model will be assessed.
3. The examples were split into user and assistant prompts because the vast majority of models achieved better results with this format. The two examples were selected for diversity: one has a negative sentiment and several phraseological compounds, while the other is positive and contains none.
### <span style='text-decoration: #FDA428 wavy underline;'>**GENERATING METAMODEL EVALUATIONS**</span>
1. The purpose of the metamodel is to return the following evaluations:
    - **Understanding of the Text**: A comparison of the evaluated model's response description to the reference explanation.
    - **Sentiment Analysis**: An optional evaluation, only if the reference sentiment is "positive" or "negative." We made this decision to exclude texts that people might interpret ambiguously.
    - **Phraseological Compounds**: The model is penalized for phrases not included in the reference phraseological compounds. In cases where there are no phraseological compounds, the highest score is awarded only if the model indicates the absence of such expressions; one point is deducted for each excess phrase until the score reaches zero (a simplified sketch of this penalty appears at the end of this section).
2. Each evaluation is provided in JSON format (keys in Polish: "WYDŹWIĘK" = sentiment, "OCENA" = text-understanding score, "ZWIĄZKI" = phraseological compounds). Example of a full response from the metamodel:
```json
{"WYDŹWIĘK": "5"}
{"OCENA": "4"}
{"ZWIĄZKI": "3"}
```
3. The judge metamodel's prompt structure is similar to that of the evaluated model's prompt. No system prompt is used. The prompt includes:
    - **User Prompt**: 3 elements, each consisting of a header written in capital letters and content enclosed in triple quotes:
        - **Role**: A reliable assistant who adheres to the instructions and does not perform any other tasks, nor enters any additional text in the response.
        - **Task**: According to the description in point 1. The comparison of phraseological compounds has the most guidelines, so we noted that the model should focus on this as it is the most challenging step, and that its work will be evaluated based on this point.
        - The first example of a potential response from the evaluated model along with the references.
    - **Assistant Prompt**: An example response containing the evaluations.
    - **User Prompt**: A second example of a potential response from the evaluated model along with the references.
    - **Assistant Prompt**: An example response containing the evaluations for the second example.
    - **User Prompt**: The actual response from the evaluated model and the references on which the metamodel will base its evaluations included in the benchmark.
4. Here, the examples were also selected based on diversity. One includes a reference with a positive sentiment, while the other contains no reference sentiment at all (an example labeled as "neutral" in the dataset).
5. It is worth explaining why we chose this particular process for evaluating phraseological compounds. Initially, we intended to check only those phrases included in the reference and ignore others in the evaluation. Unfortunately, this procedure favored models that provided many phrases that were not phraseological compounds.
Therefore, we decided to penalize models for phrases not included in the reference. We aimed to ensure that models were not penalized for providing phraseological compounds we had not included in the reference. After generating the responses, we collected phrases noted by several models and manually reviewed all references to identify phraseological compounds we might have missed.
A similar procedure was applied to sentiment analysis—we listed all examples where several models consistently recorded a different sentiment than the reference and reconsidered whether the examples could be interpreted differently than initially assumed.
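
As a simplified illustration of the penalty described above, the deduction for excess phrases can be pictured as the sketch below. This is only an illustration of the rule (the score itself is assigned by the judge metamodel); the function name is made up, and the maximum score of 5 is assumed from the 0-5 score range used throughout this benchmark:
```python
def compound_score(model_phrases, reference_phrases, max_score=5):
    # One point is deducted for each phrase the evaluated model lists
    # that is absent from the reference, and the score is floored at zero.
    excess = [p for p in model_phrases if p not in reference_phrases]
    return max(max_score - len(excess), 0)
```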
    """, unsafe_allow_html=True)


# Ending :)
st.markdown("<hr style='border: 1px solid #A85E00;'>", unsafe_allow_html=True)
# st.divider()
st.markdown("""
### Authors:
- [Jan Sowa](https://www.linkedin.com/in/janpiotrsowa) - leadership, writing texts, benchmark code
- [Agnieszka Kosiak](https://www.linkedin.com/in/agn-kosiak/) - writing texts
- [Magdalena Krawczyk](https://www.linkedin.com/in/magdalena-krawczyk-7810942ab/) - writing texts, labeling
- [Marta Matylda Kania](https://www.linkedin.com/in/martamatyldakania/) - prompt engineering
- [Remigiusz Kinas](https://www.linkedin.com/in/remigiusz-kinas/) - methodological support
- [Krzysztof Wróbel](https://www.linkedin.com/in/wrobelkrzysztof/) - engineering, methodological support
- [Szymon Baczyński](https://www.linkedin.com/in/szymon-baczynski/) - front-end / streamlit assistant
- [Artur Słomowski](https://www.linkedin.com/in/arturslomowski/) - front-end / streamlit assistant
- [Maria Filipkowska](https://www.linkedin.com/in/maria-filipkowska/) - writing text, linguistic support
""")

st.divider()

# Run the app with `streamlit run your_script.py`