import streamlit as st
import pandas as pd
import json
from utils import read_results, preprocess_path, get_model_url
from data import Metrics, TASK_METRIC_DICT, DATASET_GROUPS


st.set_page_config(
    page_title='Cetvel 📏',
    layout='centered',
)


@st.cache_data
def cache_results(path):
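    """Read the raw JSON results and flatten them into a per-model DataFrame,
    with one score column per dataset plus aggregate group columns."""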
    json_results = read_results(path)
    results = list()
    for entry in json_results:
        row = {
            'model': entry['model']['model'],
            'num_parameters': entry['model']['num_parameters'],
            'url': get_model_url(entry['model']),
            'architecture': entry['model']['architecture'],
            'type': entry['model']['type'],
            'precision': entry['model']['dtype'],
        }
        for result in entry['results']:
            task = result['task']
            metric = TASK_METRIC_DICT.get(task)
            score = result.get(metric)
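            # Scale fractional metrics to percentages; WER is assumed to live
            # on its own scale, so it is left unscaled.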
            if score is not None and metric != Metrics.WER:
                score = 100 * score
            row[result['name']] = score
        results.append(row)
    df = pd.DataFrame(results)
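    # Append one aggregate column per dataset group, computed as the
    # unweighted mean of the group's member dataset scores.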
    for group, metadata in DATASET_GROUPS.items():
        df[group] = df[metadata['datasets']].mean(axis=1)
    return df


@st.cache_data
def cache_datasets(path):
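    """Load dataset metadata from a JSON file, keyed by dataset identifier."""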
    path = preprocess_path(path)
    with open(path, 'r', encoding='utf-8') as f:
        datasets = json.load(f)
    for key in datasets.keys():
        datasets[key]['dataset'] = key
    del datasets['tr-wikihow-summ']  # FIXME: There are missing experiments.
    return datasets


def create_column_configs(items):
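    """Build a numeric Streamlit column config for each dataset or group column."""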
    column_configs = dict()
    for key, metadata in items.items():
        column_configs[key] = st.column_config.NumberColumn(
            metadata.get('name', key),
            help=metadata['description'],
            min_value=0,
            format='%2.2f'
        )
    return column_configs


def insert_average(df, keys):
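    """Add an 'average' column over the given score columns and sort by it, descending."""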
    df = df.copy(deep=True)
    df['average'] = df.loc[:, [x for x in df.columns if x in keys]].mean(axis=1)
    df.insert(1, 'average', df.pop('average'))
    df.index += 1
    return df.sort_values(by=['average'], ascending=False)

def insert_average_rank(df, keys):
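    """Add an 'average_rank' column (mean rank across the given score columns;
    lower is better) and sort by it."""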
    df = df.copy(deep=True)
    df_ranks = df.loc[:, [x for x in df.columns if x in keys]].rank(ascending=False)
    df['average_rank'] = df_ranks.mean(axis=1)
    df.insert(2, 'average_rank', df.pop('average_rank'))
    df.index += 1
    return df.sort_values(by=['average_rank'], ascending=True)


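# Streamlit column configs for model metadata and the aggregate score columns.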
MODEL_SPEC_CONFIGS = {
    'model': st.column_config.TextColumn(
        'Model',
        help='Large Language Model (LLM) used for the experiments.',
        max_chars=120,
    ),
    'url': st.column_config.LinkColumn(
        'URL',
        help='Model URL.',
        display_text='Click',
    ),
    'num_parameters': st.column_config.TextColumn(
        '#params',
        help='Approximate number of parameters.',
    ),
    'type': st.column_config.TextColumn(
        'Type',
        help='Model type based on training objective.',
    ),
    'average': st.column_config.NumberColumn(
        'Avg.',
        help='Average across task or dataset performances.',
        format='%2.2f',
    ),
    'average_rank': st.column_config.NumberColumn(
        'Avg. Rank',
        help='Average ranking across task or dataset performances.',
        format='%2.2f',
    )
}


def filter_visible_model_specs():
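    """Let the user choose which model spec columns to display in the table."""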
    specs = {
        'URL': ('url', 1),
        '#params': ('num_parameters', 2),
        'Architecture': ('architecture', 3),
        'Type': ('type', 4),
        'Precision': ('precision', 5),
    }
    visible_specs = st.multiselect(
        'Select model specs to be shown in the table.',
        options=sorted(specs.keys(), key=lambda x: specs[x][1]),
    )
    # visible_specs = sorted(visible_specs, key=lambda x: specs[x][1])
    return [specs[x][0] for x in visible_specs]


def filter_by_model_spec():
    # Placeholder: filtering rows by model spec is not implemented yet.
    pass


def filter_visible_datasets(datasets):
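    """Return the dataset columns to display, either grouped or filtered by task."""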
    col1, col2 = st.columns(2)
    with col1:
        dataset_grouping = st.selectbox(
            'Dataset Grouping',
            [
                'Group Datasets',
                'Show All Datasets',
            ],
        )

    with col2:
        filter_by_task = st.selectbox(
            'Filter by Task',
            [
                'All',
                'Understanding Tasks',
                'Generation Tasks',
                'Multiple Choice',
                'Extractive Question Answering',
                'Natural Language Inference',
                'Text Classification',
                'Summarization',
            ],
            disabled=(dataset_grouping == 'Group Datasets'),
        )

    if dataset_grouping == 'Group Datasets':
        return list(DATASET_GROUPS.keys())
    elif dataset_grouping == 'Show All Datasets':
        if filter_by_task == 'All':
            return list(datasets.keys())
        elif filter_by_task == 'Understanding Tasks':
            this_datasets = [k for (k, v) in datasets.items() if not v['generative']]
            return this_datasets
        elif filter_by_task == 'Generation Tasks':
            this_datasets = [k for (k, v) in datasets.items() if v['generative']]
            return this_datasets
        elif filter_by_task == 'Multiple Choice':
            return DATASET_GROUPS['MCQA']['datasets']
        elif filter_by_task == 'Extractive Question Answering':
            return DATASET_GROUPS['QA']['datasets']
        elif filter_by_task == 'Natural Language Inference':
            return DATASET_GROUPS['NLI']['datasets']
        elif filter_by_task == 'Text Classification':
            return DATASET_GROUPS['TC']['datasets']
        elif filter_by_task == 'Summarization':
            return DATASET_GROUPS['SUM']['datasets']


def introduction():
    st.title(':blue[Cetvel :straight_ruler:]')
    st.subheader('A Unified Benchmark for Evaluating Turkish LLMs', anchor=False)
    st.markdown('''Cetvel is an extended version of the [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness) tool that specifically includes tasks and datasets for benchmarking Turkish Large Language Models (LLMs). It covers a variety of tasks curated to assess different aspects of model performance in Turkish. Our primary goal is to objectively evaluate the capabilities of large language models in understanding and processing Turkish. For documentation and more details about the benchmark and the experiments, see the [GitHub repository](https://github.com/KUIS-AI/Cetvel).''')


def main():
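    """Render the leaderboard: load cached results, apply filters, and display."""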
    introduction()
    results_df = cache_results('./results/zero-shot')
    datasets = cache_datasets('./data/datasets.json')
    dataset_column_configs = create_column_configs(datasets)
    group_column_configs = create_column_configs(DATASET_GROUPS)
    # score_columns = list(dataset_column_configs.keys()) + list(group_column_configs.keys())
    column_configs = MODEL_SPEC_CONFIGS | group_column_configs | dataset_column_configs

    visible_data_columns = sorted(filter_visible_datasets(datasets), key=str.casefold)
    visible_model_columns = filter_visible_model_specs()
    results_df = insert_average(results_df, visible_data_columns)
    results_df = insert_average_rank(results_df, visible_data_columns)

    st.dataframe(
        results_df,
        use_container_width=True,
        hide_index=True,
        column_config=column_configs,
        column_order=['model', 'average', 'average_rank',] + visible_model_columns + visible_data_columns,
    )
    st.image('./assets/kuis-ai-logo.png', width=240)


if __name__ == '__main__':
    main()