import streamlit as st
import pandas as pd
import json

from utils import read_results, preprocess_path, get_model_url
from data import Tasks, Metrics, DATASET_TASK_DICT, TASK_METRIC_DICT, DATASET_GROUPS

st.set_page_config(
    page_title='Cetvel 📏',
    layout='centered',
)

@st.cache_data
def cache_results(path):
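    """Read the result JSON files under ``path`` and flatten them into a
    DataFrame with one row per evaluated model. Scores for every metric except
    WER are scaled to percentages, and a mean column is added for each dataset
    group defined in DATASET_GROUPS."""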
    json_results = read_results(path)
    results = list()
    for entry in json_results:
        print(entry)
        row = {
            'model': entry['model']['model'],
            'num_parameters': entry['model']['num_parameters'],
            'url': get_model_url(entry['model']),
            'architecture': entry['model']['architecture'],
            'type': entry['model']['type'],
            'precision': entry['model']['dtype'],
        }
        for result in entry['results']:
            task = result['task']
            metric = TASK_METRIC_DICT.get(task)
            score = result.get(metric)
            score = 100 * score if metric != Metrics.WER and score is not None else score
            row[result['name']] = score
        results.append(row)
    df = pd.DataFrame(results)
    for group, metadata in DATASET_GROUPS.items():
        df[group] = df[metadata['datasets']].mean(axis=1)
    return df

@st.cache_data
def cache_datasets(path):
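    """Load the dataset metadata from the JSON file at ``path`` and tag each
    entry with its own key under ``'dataset'``."""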
    path = preprocess_path(path)
    with open(path, 'r') as f:
        datasets = json.load(f)
    for key in datasets.keys():
        datasets[key]['dataset'] = key
    del datasets['tr-wikihow-summ']  # FIXME: There are missing experiments.
    return datasets

def create_column_configs(items):
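    """Build a Streamlit NumberColumn config for every dataset or dataset group,
    using its display name and description."""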
    column_configs = dict()
    for key, metadata in items.items():
        column_configs[key] = st.column_config.NumberColumn(
            metadata.get('name', key),
            help=metadata['description'],
            min_value=0,
            format='%2.2f'
        )
    return column_configs

def insert_average(df, keys):
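    """Insert an ``average`` column (mean of the visible score columns in ``keys``)
    at position 1 and sort the rows by it in descending order."""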
    df = df.copy(deep=True)
    df['average'] = df.loc[:, [x for x in df.columns if x in keys]].mean(axis=1)
    df.insert(1, 'average', df.pop('average'))
    df.index += 1
    return df.sort_values(by=['average'], ascending=False)

def insert_average_rank(df, keys):
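    """Insert an ``average_rank`` column (mean of the per-column ranks, where the
    best score gets rank 1) at position 2 and sort the rows by it in ascending order."""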
    df = df.copy(deep=True)
    df_ranks = df.loc[:, [x for x in df.columns if x in keys]].rank(ascending=False)
    df['average_rank'] = df_ranks.mean(axis=1)
    df.insert(2, 'average_rank', df.pop('average_rank'))
    df.index += 1
    return df.sort_values(by=['average_rank'], ascending=True)

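
# Column configurations for the model metadata and aggregate score columns
# shown in the leaderboard table.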
MODEL_SPEC_CONFIGS = {
    'model': st.column_config.TextColumn(
        'Model',
        help='Large Language Model (LLM) used for the experiments.',
        max_chars=120,
    ),
    'url': st.column_config.LinkColumn(
        'URL',
        help='Model URL.',
        display_text='Click',
    ),
    'num_parameters': st.column_config.TextColumn(
        '#params',
        help='Approximate number of parameters.',
    ),
    'type': st.column_config.TextColumn(
        'Type',
        help='Model type based on training objective.',
    ),
    'average': st.column_config.NumberColumn(
        'Avg.',
        help='Average across task or dataset performances.',
        format='%2.2f',
    ),
    'average_rank': st.column_config.NumberColumn(
        'Avg. Rank',
        help='Average ranking across task or dataset performances.',
        format='%2.2f',
    ),
}

def filter_visible_model_specs():
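    """Show a multiselect of optional model-spec columns and return the
    corresponding DataFrame column names selected by the user."""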
    specs = {
        'URL': ('url', 1),
        '#params': ('num_parameters', 2),
        'Architecture': ('architecture', 3),
        'Type': ('type', 4),
        'Precision': ('precision', 5),
    }
    visible_specs = st.multiselect(
        'Select model specs to be shown in the table.',
        options=sorted(specs.keys(), key=lambda x: specs[x][1]),
    )
    # visible_specs = sorted(visible_specs, key=lambda x: specs[x][1])
    return [specs[x][0] for x in visible_specs]

def filter_by_model_spec():
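    """Placeholder: filtering the table rows by model spec is not implemented yet."""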
    pass

def filter_visible_datasets(datasets):
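    """Render the dataset grouping and task filter widgets and return the list of
    dataset (or dataset group) columns to display."""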
    col1, col2 = st.columns(2)
    with col1:
        dataset_grouping = st.selectbox(
            'Dataset Grouping',
            [
                'Group Datasets',
                'Show All Datasets',
            ],
        )
    with col2:
        filter_by_task = st.selectbox(
            'Filter by Task',
            [
                'All',
                'Understanding Tasks',
                'Generation Tasks',
                'Multiple Choice',
                'Extractive Question Answering',
                'Natural Language Inference',
                'Text Classification',
                'Summarization',
            ],
            disabled=dataset_grouping == 'Group Datasets',
        )
    if dataset_grouping == 'Group Datasets':
        return list(DATASET_GROUPS.keys())
    elif dataset_grouping == 'Show All Datasets':
        if filter_by_task == 'All':
            return list(datasets.keys())
        elif filter_by_task == 'Understanding Tasks':
            this_datasets = [k for (k, v) in datasets.items() if not v['generative']]
            return this_datasets
        elif filter_by_task == 'Generation Tasks':
            this_datasets = [k for (k, v) in datasets.items() if v['generative']]
            return this_datasets
        elif filter_by_task == 'Multiple Choice':
            return DATASET_GROUPS['MCQA']['datasets']
        elif filter_by_task == 'Extractive Question Answering':
            return DATASET_GROUPS['QA']['datasets']
        elif filter_by_task == 'Natural Language Inference':
            return DATASET_GROUPS['NLI']['datasets']
        elif filter_by_task == 'Text Classification':
            return DATASET_GROUPS['TC']['datasets']
        elif filter_by_task == 'Summarization':
            return DATASET_GROUPS['SUM']['datasets']

def introduction():
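    """Render the static title, subtitle, and description of the benchmark."""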
    st.title(':blue[Cetvel :straight_ruler:]')
    st.subheader('A Unified Benchmark for Evaluating Turkish LLMs', anchor=False)
    st.markdown('''Cetvel is an extended version of the [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness) tool that specifically includes tasks/datasets for benchmarking Turkish Large Language Models (LLMs). Cetvel covers a variety of tasks curated to assess different aspects of model performance in Turkish. Our primary goal is to objectively evaluate the capabilities of large language models in understanding and processing Turkish. For documentation and more details about the benchmark and the experiments, see the [GitHub repository](https://github.com/KUIS-AI/Cetvel).''')

def main():
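    """Entry point: load cached results and dataset metadata, build the column
    configurations, apply the user-selected filters, and render the leaderboard."""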
    introduction()

    results_df = cache_results('./results/zero-shot')
    datasets = cache_datasets('./data/datasets.json')

    dataset_column_configs = create_column_configs(datasets)
    group_column_configs = create_column_configs(DATASET_GROUPS)
    # score_columns = list(dataset_column_configs.keys()) + list(group_column_configs.keys())
    column_configs = MODEL_SPEC_CONFIGS | group_column_configs | dataset_column_configs

    visible_data_columns = sorted(filter_visible_datasets(datasets), key=str.casefold)
    visible_model_columns = filter_visible_model_specs()

    results_df = insert_average(results_df, visible_data_columns)
    results_df = insert_average_rank(results_df, visible_data_columns)

    st.dataframe(
        results_df,
        use_container_width=True,
        hide_index=True,
        column_config=column_configs,
        column_order=['model', 'average', 'average_rank'] + visible_model_columns + visible_data_columns,
    )
    st.image('./assets/kuis-ai-logo.png', width=240)

main()