import streamlit as st
import pandas as pd
from PIL import Image
import base64
from io import BytesIO
import numpy as np
# ─── Page config ──────────────────────────────────────────────────────────────
st.set_page_config(page_title="ManyICLBench Leaderboard", layout="wide")
logo_image = Image.open("src/manyicl_logo.png")
def encode_image(image):
    # Encode a PIL image as a base64 string so it can be embedded inline in HTML
    buffered = BytesIO()
    image.save(buffered, format="PNG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")
img_data = encode_image(logo_image)
# Display the logo as an inline base64-encoded image
st.markdown(
    f"""
    <div style="text-align:center;">
        <img src="data:image/png;base64,{img_data}" alt="ManyICLBench logo" width="400"/>
    </div>
    """,
    unsafe_allow_html=True,
)
# ─── Load data ────────────────────────────────────────────────────────────────
@st.cache_data
def load_data(path):
    df = pd.read_csv(path)
    if 'Task' in df.columns:  # Rename Task to Models for consistency
        df = df.rename(columns={'Task': 'Models'})
    score_cols = ['1000', '2000', '4000', '8000', '16000', '32000', '64000', '128000']
    # Keep existing avg and avg.L columns
    # Compute rank per column (1 = best)
    for col in score_cols + ['avg', 'avg.L']:
        df[f"{col}_rank"] = df[col].rank(ascending=False, method="min").astype(int)
    return df
# Add evaluation metrics explanation
st.markdown("## 📊 Evaluation Metrics")
st.markdown("""
- **Per-length Performance**: Performance at different context lengths (1K to 128K tokens)
- **avg**: Average performance across all context lengths
- **avg.L**: Average performance on longer contexts (>32K tokens)
Higher scores indicate better performance, with all metrics reported as percentages (0-100).
Red indicates improved performance relative to the 1K baseline; blue indicates degraded performance relative to the 1K baseline. Darker shades correspond to larger changes.
""")
def display_table(df, cols):
    # Precompute max values for avg and avg.L so the best scores can be bolded
    max_avg = df['avg'].max()
    max_avg_l = df['avg.L'].max()
    # Build raw HTML table
    html = "<table style='border-collapse:collapse; width:100%;'>"
    # Header row: format column labels
    html += "<tr>"
    for col in cols:
        style = "padding:6px;"
        if col in ['1000', '2000', '4000', '8000', '16000', '32000', '64000', '128000']:
            # Convert to K format (e.g. 1000 -> 1K)
            label = f"{int(col) // 1000}K"
        else:
            label = col.title()  # Capitalize first letter
        if col in ["Model", "Models"]:
            style += " width: 15%;"
        html += f"<th style='{style}'>{label}</th>"
    html += "</tr>"
    # Data rows
    for _, row in df.iterrows():
        html += "<tr>"
        for col in cols:
            val = row[col]
            if col in ["Model", "Models"]:
                html += f"<td style='padding:6px; border: 1px solid #444;'>{val}</td>"
            else:
                # Format value
                val_str = f"{val:.1f}" if isinstance(val, (float, np.float64)) else val
                # Determine if this column should be colored
                if col in ['1000', 'avg', 'avg.L']:
                    # No coloring for these columns, but bold the best avg / avg.L values
                    bold = ""
                    if (col == 'avg' and val == max_avg) or \
                       (col == 'avg.L' and val == max_avg_l):
                        bold = "font-weight:bold;"
                    style = f"padding:6px; border: 1px solid #444; {bold}"
                else:
                    # Calculate relative change from the 1K baseline
                    baseline = float(row['1000'])
                    if baseline != 0:
                        relative_change = float(val) / baseline - 1  # 0 means no change
                        # Clamp the change to a reasonable range for color scaling
                        clamped_change = max(min(relative_change, 1.5), -0.5)
                        # Normalize to [0, 1], where 0.5 is the neutral point (no change)
                        if clamped_change < 0:
                            # Map [-0.5, 0) to [0, 0.5)
                            norm = clamped_change + 0.5
                        else:
                            # Map [0, 1.5] to [0.5, 1.0]
                            norm = 0.5 + (clamped_change / 3.0)
                        # Color interpolation:
                        #   norm = 0   -> blue  (100, 149, 237)
                        #   norm = 0.5 -> white (255, 255, 255)
                        #   norm = 1   -> red   (220, 20, 60)
                        if norm < 0.5:
                            # Interpolate from blue to white
                            factor = norm * 2  # 0 to 1
                            r = int(100 + (255 - 100) * factor)
                            g = int(149 + (255 - 149) * factor)
                            b = int(237 + (255 - 237) * factor)
                        else:
                            # Interpolate from white to red
                            factor = (norm - 0.5) * 2  # 0 to 1
                            r = int(255 - (255 - 220) * factor)
                            g = int(255 - (255 - 20) * factor)
                            b = int(255 - (255 - 60) * factor)
                        style = f"background-color:rgba({r},{g},{b},0.8); padding:6px; border: 1px solid #444;"
                    else:
                        style = "padding:6px; border: 1px solid #444;"
                html += f"<td style='{style}'>{val_str}</td>"
        html += "</tr>"
    html += "</table>"
    st.markdown(html, unsafe_allow_html=True)
# Display Retrieval table
st.markdown("## SSL Tasks")
st.markdown("Similar-sample Learning tasks require models to learn from a small set of similar demostration, therefore evaluating models' ability to retrieve similar samples.")
df = load_data("src/Retrieval_full_200.csv")
cols = ["Models", "1000", "2000", "4000", "8000", "16000", "32000", "64000", "128000", "avg", "avg.L"]
display_table(df, cols)
# Display Global Context Understanding table
st.markdown("## ASL Tasks")
st.markdown("All-Sample Learning tasks require models to learn from all the demostrations, therefore evaluating models' ability to understand the global context.")
df = load_data("src/Global Context Understanding_full_200.csv")
cols = ["Models", "1000", "2000", "4000", "8000", "16000", "32000", "64000", "128000", "avg", "avg.L"]
display_table(df, cols)
st.markdown("## 📚 Abstract")
st.write(
"""
Many-shot in-context learning (ICL) has emerged as a unique setup to both utilize and test the ability of large language models to handle long context. This paper delves into long-context language model (LCLM) evaluation through many-shot ICL. We first ask: what types of ICL tasks benefit from additional demonstrations, and how effective are they in evaluating LCLMs?
We find that classification and summarization tasks show performance improvements with additional demonstrations, while translation and reasoning tasks do not exhibit clear trends.
Next, we investigate the extent to which different tasks necessitate retrieval versus global context understanding.
We develop metrics to categorize ICL tasks into two groups: (i) similar-sample learning (**SSL**): tasks where retrieval of the most similar examples is sufficient for good performance, and (ii) all-sample learning (**ASL**): tasks that necessitate a deeper comprehension of all examples in the prompt.
Lastly, we introduce a new many-shot ICL benchmark built on existing ICL tasks, ManyICLBench, to characterize models' abilities on both fronts, and we benchmark 12 LCLMs using ManyICLBench. We find that while state-of-the-art models demonstrate good performance up to 64k tokens in SSL tasks, many models experience significant performance drops at only 16k tokens in ASL tasks.
"""
)
st.markdown("## Dataset Details")
st.markdown("""
| **Dataset** | **Task Category** | **Avg. Tokens / Shot** | **Max # of Shots** | **# of Tasks** |
| :--- | :--- | :--- | :--- | :--- |
| BANKING77 | Intent Classification | 13.13 | 5386 | 1 |
| GoEmotions | Emotion Classification | 15.85 | 5480 | 1 |
| DialogRE | Relation Classification | 233.27 | 395 | 1 |
| TREC | Question Classification | 11.25 | 6272 | 1 |
| CLINC150 | Intent Classification | 8.95 | 7252 | 1 |
| MATH | Math reasoning | [185.52, 407.90] | [286, 653] | 4 |
| GSM8K | Math reasoning | 55.78 | 784 | 1 |
| BBH | Reasoning | [48.27, 243.01] | [406, 2660] | 4 |
| GPQA | MQ - Science | [183.55, 367.02] | [314, 580] | 1 |
| ARC | MQ - Science | [61.54, 61.54] | [1997, 2301] | 2 |
| XLSUM | News Summarization | 621.32 | 220 | 1 |
The GPT-4o tokenizer is used to count tokens. Max # of shots is the number of shots that can fit into the 128K context window. For datasets with multiple subtasks, we list the range of each value.
**ASL Tasks**: banking77, dialogRE, TREC, CLINC150, and BBH_geometric_shapes
**SSL Tasks**: GSM8K, MATH tasks, XLSUM, GPQA_cot, ARC_challenge, BBH-dyck_languages, BBH-salient_translation_error_detection, and BBH-word_sorting.
""")
st.markdown('## 🤖 Submit Your Model')
st.write(
"""
👉 You can submit your model through the following link: [https://forms.gle/eWjzPDusDJSbXCCT7](https://forms.gle/eWjzPDusDJSbXCCT7)
"""
)
st.markdown("## 📚 Citation")
st.write("""
```bibtex
@article{zou2025manyshotincontextlearninglongcontext,
title={On Many-Shot In-Context Learning for Long-Context Evaluation},
author={Kaijian Zou and Muhammad Khalifa and Lu Wang},
journal={arXiv preprint arXiv:2411.07130},
year={2025}
}
```
""")