import streamlit as st
import pandas as pd
from PIL import Image
import base64
from io import BytesIO
import numpy as np
# ─── Page config ──────────────────────────────────────────────────────────────
st.set_page_config(page_title="ManyICLBench Leaderboard", layout="wide")
logo_image = Image.open("src/manyicl_logo.png")
def encode_image(image):
    # Encode a PIL image as a base64 string so it can be embedded inline in HTML
    buffered = BytesIO()
    image.save(buffered, format="PNG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")
img_data = encode_image(logo_image)
# Display the logo as an inline base64-encoded image
st.markdown(
    f"""
    <div style="text-align:center;">
        <img src="data:image/png;base64,{img_data}" alt="ManyICLBench logo" width="400"/>
    </div>
    """,
    unsafe_allow_html=True,
)
# ─── Load data ────────────────────────────────────────────────────────────────
@st.cache_data
def load_data(path):
    df = pd.read_csv(path)
    if 'Task' in df.columns:  # Rename Task to Models for consistency
        df = df.rename(columns={'Task': 'Models'})
    score_cols = ['1000', '2000', '4000', '8000', '16000', '32000', '64000', '128000']
    # Keep existing avg and avg.L columns
    # Compute rank per column (1 = best)
    for col in score_cols + ['avg', 'avg.L']:
        df[f"{col}_rank"] = df[col].rank(ascending=False, method="min").astype(int)
    return df
# Add evaluation metrics explanation
st.markdown("## 📊 Evaluation Metrics")
st.markdown("""
- **Per-length Performance**: Performance at different context lengths (1K to 128K tokens)
- **avg**: Average performance across all context lengths
- **avg.L**: Average performance on longer contexts (>32K tokens)
Higher scores indicate better performance, with all metrics reported as percentages (0-100).
Red indicates improved performance relative to the 1K baseline; blue indicates degraded performance relative to the 1K baseline. Darker shades correspond to larger changes.
""")
def display_table(df, cols):
    # Precompute max values for avg and avg.L so the best scores can be bolded
    max_avg = df['avg'].max()
    max_avg_l = df['avg.L'].max()
    # Build raw HTML table
    html = "<table style='border-collapse:collapse; width:100%;'>"
    # Header row: format column labels
    html += "<tr>"
    for col in cols:
        style = "padding:6px;"
        if col in ['1000', '2000', '4000', '8000', '16000', '32000', '64000', '128000']:
            # Convert to K format (e.g. 1000 -> 1K)
            label = f"{int(col) // 1000}K"
        else:
            label = col.title()  # Capitalize first letter
        if col in ["Model", "Models"]:
            style += " width: 15%;"
        html += f"<th style='{style}'>{label}</th>"
    html += "</tr>"
    # Data rows
    for _, row in df.iterrows():
        html += "<tr>"
        for col in cols:
            val = row[col]
            if col in ["Model", "Models"]:
                html += f"<td style='padding:6px; border: 1px solid #444;'>{val}</td>"
            else:
                # Format value
                val_str = f"{val:.1f}" if isinstance(val, (float, np.float64)) else val
                # Determine if this column should be colored
                if col in ['1000', 'avg', 'avg.L']:
                    # No coloring for these columns, but bold the best avg / avg.L values
                    bold = ""
                    if (col == 'avg' and val == max_avg) or \
                       (col == 'avg.L' and val == max_avg_l):
                        bold = "font-weight:bold;"
                    style = f"padding:6px; border: 1px solid #444; {bold}"
                else:
                    # Calculate relative change from the 1K baseline
                    baseline = float(row['1000'])
                    if baseline != 0:
                        relative_change = float(val) / baseline - 1  # 0 means no change
                        # Clamp the change to a reasonable range for color scaling
                        clamped_change = max(min(relative_change, 1.5), -0.5)
                        # Normalize to [0, 1], where 0.5 is the neutral point (no change)
                        if clamped_change < 0:
                            # Map [-0.5, 0) to [0, 0.5)
                            norm = clamped_change + 0.5
                        else:
                            # Map [0, 1.5] to [0.5, 1.0]
                            norm = 0.5 + (clamped_change / 3.0)
                        # Color interpolation:
                        #   norm = 0   -> blue  (100, 149, 237)
                        #   norm = 0.5 -> white (255, 255, 255)
                        #   norm = 1   -> red   (220, 20, 60)
                        if norm < 0.5:
                            # Interpolate from blue to white
                            factor = norm * 2  # 0 to 1
                            r = int(100 + (255 - 100) * factor)
                            g = int(149 + (255 - 149) * factor)
                            b = int(237 + (255 - 237) * factor)
                        else:
                            # Interpolate from white to red
                            factor = (norm - 0.5) * 2  # 0 to 1
                            r = int(255 - (255 - 220) * factor)
                            g = int(255 - (255 - 20) * factor)
                            b = int(255 - (255 - 60) * factor)
                        style = f"background-color:rgba({r},{g},{b},0.8); padding:6px; border: 1px solid #444;"
                    else:
                        style = "padding:6px; border: 1px solid #444;"
                html += f"<td style='{style}'>{val_str}</td>"
        html += "</tr>"
    html += "</table>"
    st.markdown(html, unsafe_allow_html=True)
# Display Retrieval table
st.markdown("## SSL Tasks")
st.markdown("Similar-sample Learning tasks require models to learn from a small set of similar demostration, therefore evaluating models' ability to retrieve similar samples.")
df = load_data("src/Retrieval_full_200.csv")
cols = ["Models", "1000", "2000", "4000", "8000", "16000", "32000", "64000", "128000", "avg", "avg.L"]
display_table(df, cols)
# Display Global Context Understanding table
st.markdown("## ASL Tasks")
st.markdown("All-Sample Learning tasks require models to learn from all the demostrations, therefore evaluating models' ability to understand the global context.")
df = load_data("src/Global Context Understanding_full_200.csv")
cols = ["Models", "1000", "2000", "4000", "8000", "16000", "32000", "64000", "128000", "avg", "avg.L"]
display_table(df, cols)
st.markdown("## 📚 Abstract")
st.write(
"""
Many-shot in-context learning (ICL) has emerged as a unique setup to both utilize and test the ability of large language models to handle long context. This paper delves into long-context language model (LCLM) evaluation through many-shot ICL. We first ask: what types of ICL tasks benefit from additional demonstrations, and how effective are they in evaluating LCLMs?
We find that classification and summarization tasks show performance improvements with additional demonstrations, while translation and reasoning tasks do not exhibit clear trends.
Next, we investigate the extent to which different tasks necessitate retrieval versus global context understanding.
We develop metrics to categorize ICL tasks into two groups: (i) similar-sample learning (**SSL**): tasks where retrieval of the most similar examples is sufficient for good performance, and (ii) all-sample learning (**ASL**): tasks that necessitate a deeper comprehension of all examples in the prompt.
Lastly, we introduce a new many-shot ICL benchmark built on existing ICL tasks, ManyICLBench, to characterize models' abilities on both fronts, and we benchmark 12 LCLMs using ManyICLBench. We find that while state-of-the-art models demonstrate good performance up to 64k tokens in SSL tasks, many models experience significant performance drops at only 16k tokens in ASL tasks.
"""
)
st.markdown("## Dataset Details")
st.markdown("""
| **Dataset** | **Task Category** | **Avg. Tokens / Shot** | **Max # of Shots** | **# of Tasks** |
| :--- | :--- | :--- | :--- | :--- |
| BANKING77 | Intent Classification | 13.13 | 5386 | 1 |
| GoEmotions | Emotion Classification | 15.85 | 5480 | 1 |
| DialogRE | Relation Classification | 233.27 | 395 | 1 |
| TREC | Question Classification | 11.25 | 6272 | 1 |
| CLINC150 | Intent Classification | 8.95 | 7252 | 1 |
| MATH | Math reasoning | [185.52, 407.90] | [286, 653] | 4 |
| GSM8K | Math reasoning | 55.78 | 784 | 1 |
| BBH | Reasoning | [48.27, 243.01] | [406, 2660] | 4 |
| GPQA | MQ - Science | [183.55, 367.02] | [314, 580] | 1 |
| ARC | MQ - Science | [61.54, 61.54] | [1997, 2301] | 2 |
| XLSUM | News Summarization | 621.32 | 220 | 1 |
The GPT-4o tokenizer is used to count tokens. Max # of shots is the number of shots that can fit into the 128K context window. For datasets with multiple subtasks, we list the range of each value.
**ASL Tasks**: banking77, dialogRE, TREC, CLINC150, and BBH_geometric_shapes
**SSL Tasks**: GSM8K, MATH tasks, XLSUM, GPQA_cot, ARC_challenge, BBH-dyck_languages, BBH-salient_translation_error_detection, and BBH-word_sorting.
""")
st.markdown('## 🤖 Submit Your Model')
st.write(
"""
👉 You can submit your model through the following link: [https://forms.gle/eWjzPDusDJSbXCCT7](https://forms.gle/eWjzPDusDJSbXCCT7)
"""
)
st.markdown("## 📚 Citation")
st.write("""
```bibtex
@article{zou2025manyshotincontextlearninglongcontext,
title={On Many-Shot In-Context Learning for Long-Context Evaluation},
author={Kaijian Zou and Muhammad Khalifa and Lu Wang},
journal={arXiv preprint arXiv:2411.07130},
year={2025}
}
```
""")