Spaces:
Running
Running
Hannah
committed on
Commit
·
ff17adc
1
Parent(s):
cce0ed9
Revert "initial"
Browse files
This reverts commit cce0ed9c2ee4e552d2f504d06f28ed0a29c10cb8.
- README copy.md +0 -75
- api.py +0 -250
- hf.svg +0 -10
- requirements.txt +0 -30
- run.py +0 -280
- run_all.py +0 -68
- style.css +0 -44
README copy.md
DELETED
@@ -1,75 +0,0 @@
|
|
1 |
-
# LLM Leaderboard Demo
|
2 |
-
|
3 |
-
This demo showcases a modern LLM leaderboard application built with Gradio and FastAPI.
|
4 |
-
|
5 |
-
## Features
|
6 |
-
|
7 |
-
- Displays a sortable, filterable table of LLM models and their performance
|
8 |
-
- Shows scores across multiple benchmarks
|
9 |
-
- External links to model documentation for both open and closed models
|
10 |
-
- Dynamic filtering by model performance categories
|
11 |
-
- Real-time data refresh functionality
|
12 |
-
- Last-updated timestamp display
|
13 |
-
- Automatic startup of both API and frontend
|
14 |
-
|
15 |
-
## Installation
|
16 |
-
|
17 |
-
First, install the required dependencies:
|
18 |
-
|
19 |
-
```bash
|
20 |
-
pip install -r requirements.txt
|
21 |
-
```
|
22 |
-
|
23 |
-
## Usage
|
24 |
-
|
25 |
-
There are two ways to run the application:
|
26 |
-
|
27 |
-
### Option 1: Using run.py (all-in-one)
|
28 |
-
|
29 |
-
```bash
|
30 |
-
python run.py
|
31 |
-
```
|
32 |
-
|
33 |
-
This starts the FastAPI backend as a child process and then launches the Gradio interface, all from a single command.
|
34 |
-
|
35 |
-
### Option 2: Using run_all.py (separate processes)
|
36 |
-
|
37 |
-
```bash
|
38 |
-
python run_all.py
|
39 |
-
```
|
40 |
-
|
41 |
-
This runs the API server and Gradio interface in separate processes, making it easier to debug each component.
|
42 |
-
|
43 |
-
With either option, the application will be available at:
|
44 |
-
- Frontend: http://localhost:7860
|
45 |
-
- API: http://localhost:8000
|
46 |
-
|
47 |
-
## Architecture
|
48 |
-
|
49 |
-
The application consists of two main components:
|
50 |
-
|
51 |
-
1. **FastAPI Backend** (`api.py`):
|
52 |
-
- Provides API endpoints for leaderboard data
|
53 |
-
- Caches data to improve performance
|
54 |
-
- Handles data processing and filtering
|
55 |
-
|
56 |
-
2. **Gradio Frontend** (`run.py`):
|
57 |
-
- Creates an interactive web interface
|
58 |
-
- Fetches data from the API
|
59 |
-
- Provides filtering and search capabilities
|
60 |
-
|
61 |
-
## API Endpoints
|
62 |
-
|
63 |
-
- `GET /api/leaderboard` - Get the full leaderboard data
|
64 |
-
- `GET /api/leaderboard?refresh=true` - Force refresh data from source
|
65 |
-
- `GET /api/models` - Get a list of all model names
|
66 |
-
- `GET /api/model/{model_name}` - Get details for a specific model
|
67 |
-
- `GET /api/filters` - Get counts for different filter categories
|
68 |
-
|
69 |
-
## Customization
|
70 |
-
|
71 |
-
You can modify the appearance by editing `style.css`, which `run.py` loads at startup. The application uses Gradio's theme system for styling.
|
72 |
-
|
73 |
-
## License
|
74 |
-
|
75 |
-
This project is licensed under the MIT License.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
api.py
DELETED
@@ -1,250 +0,0 @@
|
|
1 |
-
import json
|
2 |
-
from pathlib import Path
|
3 |
-
from typing import Dict, List, Optional
|
4 |
-
|
5 |
-
import numpy as np
|
6 |
-
import requests
|
7 |
-
from fastapi import FastAPI, HTTPException, Query
|
8 |
-
from fastapi.middleware.cors import CORSMiddleware
|
9 |
-
from pydantic import BaseModel
|
10 |
-
|
11 |
-
|
12 |
-
class LeaderboardModel(BaseModel):
    """A single leaderboard entry: one model with its link and benchmark scores."""

    # Model identifier; the sample data uses "org/name" style names.
    model_name: str
    # Model category; the sample data uses "open" or "closed".
    type: str
    # URL of the model page or vendor documentation, when known.
    model_link: Optional[str] = None
    # Benchmark name mapped to its numeric score.
    scores: Dict[str, float]
    # CO2 cost figure for the model, when known.
    co2_cost: Optional[float] = None
|
18 |
-
|
19 |
-
class LeaderboardData(BaseModel):
    """Full leaderboard payload: every model entry plus a timestamp string."""

    # All leaderboard entries.
    models: List[LeaderboardModel]
    # Time the payload was produced; the sample generator writes an ISO-8601 string.
    updated_at: str
|
22 |
-
|
23 |
-
app = FastAPI(
    title="LLM Leaderboard API",
    description="API for serving Open LLM Leaderboard data",
    version="1.0.0",
)

# Add CORS middleware to allow requests from your Gradio app.
# Credentials are disabled: a wildcard origin must not be combined with
# credentialed requests, and none of the endpoints defined alongside this
# setup rely on cookies or authorization headers.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # For production, specify your exact frontend URL
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# In-memory copy of the most recently loaded leaderboard payload.
cached_data = None
# On-disk copy of the same payload, resolved against the working directory.
cache_file = Path("leaderboard_cache.json")
|
41 |
-
|
42 |
-
def fetch_external_leaderboard_data(refresh: bool = False) -> Optional[Dict]:
    """
    Fetch leaderboard data from external sources like HuggingFace.

    Uses the in-memory cache, then the local cache file, unless ``refresh`` is
    True. Otherwise each known endpoint is tried in order and the first
    successful JSON payload is cached (in memory and on disk) and returned.

    Args:
        refresh: When True, skip both caches and query the endpoints.

    Returns:
        The decoded JSON payload, or None when no endpoint produced one.
    """
    global cached_data

    if not refresh:
        if cached_data:
            return cached_data

        if cache_file.exists():
            try:
                with open(cache_file, encoding="utf-8") as f:
                    cached_data = json.load(f)
                return cached_data
            except (OSError, ValueError) as e:
                # Unreadable or corrupt cache: fall back to fetching.
                print(f"Error reading leaderboard cache: {e}")

    # Try different endpoints that might contain leaderboard data
    endpoints = [
        "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/raw/main/leaderboard_data.json",
        "https://huggingface.co/api/spaces/HuggingFaceH4/open_llm_leaderboard/api/get_results",
    ]

    for url in endpoints:
        try:
            # A timeout keeps a stalled endpoint from blocking the request handler.
            response = requests.get(url, timeout=10)
            if response.status_code != 200:
                continue
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            # A failure on one endpoint must not prevent trying the next one.
            print(f"Error fetching external leaderboard data: {e}")
            continue

        cached_data = data
        try:
            with open(cache_file, "w", encoding="utf-8") as f:
                json.dump(data, f)
        except OSError as e:
            # The data is still usable even if it cannot be persisted.
            print(f"Error writing leaderboard cache: {e}")
        return data

    # If all endpoints fail, return None
    return None
|
81 |
-
|
82 |
-
def generate_sample_data() -> Dict:
    """
    Generate sample leaderboard data when external data can't be fetched.

    Scores are drawn from a private generator seeded with 42, so the result is
    reproducible and the process-wide NumPy random state is left untouched.

    Returns:
        A dict with "models" (entries sorted by descending average score) and
        "updated_at" (the current local time as an ISO-8601 string).
    """
    from datetime import datetime

    models = [
        {"model_name": "meta-llama/llama-3-70b-instruct", "type": "open"},
        {"model_name": "mistralai/Mistral-7B-Instruct-v0.3", "type": "open"},
        {"model_name": "google/gemma-7b-it", "type": "open"},
        {"model_name": "Qwen/Qwen2-7B-Instruct", "type": "open"},
        {"model_name": "anthropic/claude-3-opus", "type": "closed", "external_link": "https://www.anthropic.com/claude"},
        {"model_name": "OpenAI/gpt-4o", "type": "closed", "external_link": "https://openai.com/gpt-4"},
        {"model_name": "01-ai/Yi-1.5-34B-Chat", "type": "open"},
        {"model_name": "google/gemma-2b", "type": "open"},
        {"model_name": "microsoft/phi-3-mini-4k-instruct", "type": "open"},
        {"model_name": "microsoft/phi-3-mini-128k-instruct", "type": "open"},
        {"model_name": "stabilityai/stable-beluga-7b", "type": "open"},
        {"model_name": "togethercomputer/RedPajama-INCITE-7B-Instruct", "type": "open"},
        {"model_name": "databricks/dbrx-instruct", "type": "closed", "external_link": "https://www.databricks.com/product/machine-learning/large-language-models"},
        {"model_name": "mosaicml/mpt-7b-instruct", "type": "open"},
        {"model_name": "01-ai/Yi-1.5-9B-Chat", "type": "open"},
        {"model_name": "anthropic/claude-3-sonnet", "type": "closed", "external_link": "https://www.anthropic.com/claude"},
        {"model_name": "cohere/command-r-plus", "type": "closed", "external_link": "https://cohere.com/models/command-r-plus"},
        {"model_name": "meta-llama/llama-3-8b-instruct", "type": "open"},
    ]

    # A local legacy generator yields the same stream as np.random.seed(42)
    # without reseeding the global generator other code may depend on.
    rng = np.random.RandomState(42)

    def draw(low, high):
        return round(rng.uniform(low, high), 2)

    model_data = []
    for model_info in models:
        model_name = model_info["model_name"]
        external_link = model_info.get("external_link")

        # Draw order is fixed so the generated values stay reproducible.
        scores = {
            "average": draw(40, 90),
            "ifeval": draw(30, 90),
            "bbhi": draw(40, 85),
            "math": draw(20, 80),
            "gpqa": draw(10, 70),
            "mujb": draw(10, 70),
            "mmlu": draw(40, 85),
        }
        co2_cost = draw(1, 100)

        # Prefer an explicit external link; otherwise point at Hugging Face.
        if external_link:
            model_link = external_link
        elif "/" in model_name:
            model_link = f"https://huggingface.co/{model_name}"
        else:
            model_link = f"https://huggingface.co/models?search={model_name}"

        model_data.append({
            "model_name": model_name,
            "type": model_info["type"],
            "model_link": model_link,
            "scores": scores,
            "co2_cost": co2_cost,
        })

    # Sort by average score
    model_data.sort(key=lambda entry: entry["scores"]["average"], reverse=True)

    return {
        "models": model_data,
        "updated_at": datetime.now().isoformat(),
    }
|
161 |
-
|
162 |
-
@app.get("/")
def read_root():
    """Return a static welcome message for the API root."""
    welcome = {"message": "Welcome to the LLM Leaderboard API"}
    return welcome
|
165 |
-
|
166 |
-
@app.get("/api/leaderboard", response_model=LeaderboardData)
def get_leaderboard(refresh: bool = Query(False, description="Force refresh data from source")):
    """
    Get the full leaderboard data.

    If refresh is True, force fetch from source instead of using cache.
    External data is returned only when it matches the LeaderboardData
    schema; otherwise generated sample data is returned instead.
    """
    external_data = fetch_external_leaderboard_data(refresh=refresh)

    if external_data:
        try:
            # Validate up front so a payload with an unexpected shape falls back
            # to sample data instead of failing response validation.
            LeaderboardData(**external_data)
            return external_data
        except (TypeError, ValueError) as e:
            # TypeError: payload is not a mapping; ValueError: schema mismatch.
            print(f"Error processing external data: {e}")

    # Fall back to sample data if external data can't be processed
    return generate_sample_data()
|
185 |
-
|
186 |
-
@app.get("/api/models", response_model=List[str])
def get_models():
    """Get a list of all model names in the leaderboard"""
    data = fetch_external_leaderboard_data() or generate_sample_data()
    names = []
    for entry in data["models"]:
        names.append(entry["model_name"])
    return names
|
191 |
-
|
192 |
-
# The ":path" convertor lets model_name contain "/" (e.g. "google/gemma-2b");
# a plain "{model_name}" segment would never match such names.
@app.get("/api/model/{model_name:path}", response_model=LeaderboardModel)
def get_model_details(model_name: str):
    """Get detailed information about a specific model.

    Args:
        model_name: Exact model name, which may include an "org/" prefix.

    Raises:
        HTTPException: 404 when no entry has that exact name.
    """
    data = fetch_external_leaderboard_data() or generate_sample_data()

    for model in data["models"]:
        if model["model_name"] == model_name:
            return model

    raise HTTPException(status_code=404, detail=f"Model {model_name} not found")
|
202 |
-
|
203 |
-
@app.get("/api/filters")
def get_filter_counts():
    """
    Get counts for different filter categories to display in the UI.

    This matches what's shown in the 'Quick Filters' section of the leaderboard.
    Models are bucketed by their "average" score (entries without "scores" are
    not bucketed) and separately counted as official providers.
    """
    data = fetch_external_leaderboard_data() or generate_sample_data()

    # Placeholder criteria - adapt to your actual definition. The prefix list
    # mirrors the one the frontend's "Official Providers" filter applies, so
    # the displayed count agrees with what that filter selects.
    official_prefixes = ("meta/", "google/", "openai/", "microsoft/")

    counts = {
        "edge_devices": 0,
        "consumers": 0,
        "midrange": 0,
        "gpu_rich": 0,
        "official_providers": 0,
    }

    for model in data["models"]:
        if "scores" in model:
            average = model["scores"].get("average", 0)
            if average < 45:
                # Edge devices (typically small models)
                counts["edge_devices"] += 1
            elif average < 55:
                # Consumer (moderate size/performance)
                counts["consumers"] += 1
            elif average < 65:
                # Mid-range
                counts["midrange"] += 1
            elif average >= 65:
                # GPU-rich (high-end models)
                counts["gpu_rich"] += 1

        name = model["model_name"]
        if "/" not in name or name.startswith(official_prefixes):
            counts["official_providers"] += 1

    return counts
|
247 |
-
|
248 |
-
if __name__ == "__main__":
    # Serve the API with uvicorn when this file is executed as a script.
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
hf.svg
DELETED
requirements.txt
DELETED
@@ -1,30 +0,0 @@
|
|
1 |
-
aiofiles>=22.0,<24.0
|
2 |
-
anyio>=3.0,<5.0
|
3 |
-
audioop-lts<1.0; python_version >= "3.13" #it provides support for 'audioop' module removed in latest python version used by pydub
|
4 |
-
fastapi>=0.95.0
|
5 |
-
ffmpy
|
6 |
-
groovy~=0.1
|
7 |
-
gradio>=5.0.0
|
8 |
-
httpx>=0.24.1
|
9 |
-
huggingface_hub>=0.28.1
|
10 |
-
Jinja2<4.0
|
11 |
-
markupsafe>=2.0,<4.0
|
12 |
-
numpy>=1.20.0
|
13 |
-
orjson~=3.0
|
14 |
-
packaging
|
15 |
-
pandas>=1.3.0
|
16 |
-
pillow>=8.0,<12.0
|
17 |
-
pydantic>=1.10.0
|
18 |
-
python-multipart>=0.0.18 # required for fastapi forms.
|
19 |
-
pydub
|
20 |
-
pyyaml>=5.0,<7.0
|
21 |
-
ruff>=0.9.3; sys.platform != 'emscripten' # needed here for custom component docs generation
|
22 |
-
safehttpx>=0.1.6,<0.2.0
|
23 |
-
semantic_version~=2.0
|
24 |
-
starlette>=0.40.0,<1.0; sys.platform != 'emscripten'
|
25 |
-
tomlkit>=0.12.0,<0.14.0
|
26 |
-
typer>=0.12,<1.0; sys.platform != 'emscripten'
|
27 |
-
typing_extensions~=4.0
|
28 |
-
urllib3~=2.0; sys.platform == 'emscripten' # urllib3 is used for Lite support. Version spec can be omitted because urllib3==2.1.0 is prebuilt for Pyodide and urllib>=2.2.0 supports Pyodide as well.
|
29 |
-
uvicorn>=0.21.0
|
30 |
-
requests>=2.28.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
run.py
DELETED
@@ -1,280 +0,0 @@
|
|
1 |
-
import pandas as pd
|
2 |
-
import requests
|
3 |
-
from urllib.parse import quote
|
4 |
-
import subprocess
|
5 |
-
import os
|
6 |
-
import time
|
7 |
-
import sys
|
8 |
-
from datetime import datetime
|
9 |
-
|
10 |
-
import gradio as gr
|
11 |
-
|
12 |
-
def style_dataframe(df):
    """Return a Styler that paints score columns with a green progress bar.

    Args:
        df: Leaderboard table with a "Model" column and the score columns
            listed in ``highlight_cols``.

    Returns:
        ``df`` itself when it has no rows; otherwise a pandas Styler.
    """
    if len(df) == 0:
        return df

    # Define the columns to highlight based on the screenshot
    highlight_cols = ["Average", "IFEval", "BBHI", "MATH", "GPQA", "MUJB", "MMLU-PRO"]

    def highlight_green(val):
        """Build the CSS for one cell; non-numeric cells get a flat background."""
        try:
            # Extract numeric value from string (remove % if present)
            val_float = float(str(val).replace('%', '').replace(' kg', ''))
        except (TypeError, ValueError):
            return 'background-color: #121212; color: white;'
        # The value doubles as the gradient stop, so the bar fills val_float percent.
        return f'background: linear-gradient(90deg, rgba(46, 125, 50, 0.5) {val_float}%, rgba(46, 125, 50, 0.1) {val_float}%); color: white;'

    styled = df.style

    # Styler.applymap was renamed to Styler.map in newer pandas releases;
    # prefer the new name and fall back for older installations.
    apply_elementwise = getattr(styled, "map", None) or styled.applymap
    styled = apply_elementwise(highlight_green, subset=highlight_cols)

    return styled.set_properties(subset=["Model"], **{'color': '#4da6ff'})
|
44 |
-
|
45 |
-
def increment_counter(counter):
    """Return ``counter`` increased by one."""
    next_value = counter + 1
    return next_value
|
47 |
-
|
48 |
-
def handle_select(evt: gr.SelectData, counter):
    """Return the incremented counter plus the selection's index and value as text."""
    next_count = counter + 1
    index_text = f"{evt.index}"
    value_text = f"{evt.value}"
    return next_count, index_text, value_text
|
50 |
-
|
51 |
-
def start_api_server():
    """Launch api.py as a child process and wait until it accepts connections.

    Returns:
        The ``subprocess.Popen`` handle of the API server process.
    """
    import socket

    api_process = subprocess.Popen(
        [sys.executable, "api.py"],
        cwd=os.path.dirname(os.path.abspath(__file__))
    )

    # Poll the API port instead of sleeping a fixed time: a slow start is
    # waited for (up to 10 seconds) and a fast start is not delayed.
    deadline = time.monotonic() + 10
    while time.monotonic() < deadline and api_process.poll() is None:
        try:
            with socket.create_connection(("127.0.0.1", 8000), timeout=0.5):
                break
        except OSError:
            time.sleep(0.2)

    return api_process
|
59 |
-
|
60 |
-
def apply_filters(filters, models_data):
    """Return the models that match at least one selected quick filter.

    Args:
        filters: Selected filter labels such as "For Edge Devices · 3".
            Matching is by substring, so any trailing count is ignored.
        models_data: List of model dicts; "scores" and "model_name" are
            optional and default to an average of 0 and an empty name.

    Returns:
        ``models_data`` itself when no filter is selected; otherwise a new
        list with the matching models in their original order.
    """
    if not filters:
        # No filters selected, return all data
        return models_data

    official_prefixes = ("meta/", "google/", "openai/", "microsoft/")

    def matches(filter_option, average_score, model_name):
        """Tell whether one filter label accepts the given model."""
        if "Edge Devices" in filter_option and average_score < 45:
            return True
        if "Consumers" in filter_option and 45 <= average_score < 55:
            return True
        if "Mid-range" in filter_option and 55 <= average_score < 65:
            return True
        if "GPU-rich" in filter_option and average_score >= 65:
            return True
        return "Official Providers" in filter_option and (
            "/" not in model_name or model_name.startswith(official_prefixes)
        )

    filtered_models = []
    for model in models_data:
        average_score = model.get("scores", {}).get("average", 0)
        model_name = model.get("model_name", "")
        if any(matches(option, average_score, model_name) for option in filters):
            filtered_models.append(model)

    return filtered_models
|
87 |
-
|
88 |
-
def format_timestamp(timestamp_str):
    """Format an ISO-8601 timestamp for display, e.g. "January 15, 2024 at 02:30 PM".

    Args:
        timestamp_str: Timestamp text in ISO format.

    Returns:
        The formatted timestamp, or ``timestamp_str`` unchanged when it cannot
        be parsed (the problem is printed).
    """
    try:
        try:
            dt = datetime.fromisoformat(timestamp_str)
        except ValueError:
            if timestamp_str.endswith("Z"):
                # Older Python versions reject the "Z" suffix; spell out UTC.
                dt = datetime.fromisoformat(timestamp_str[:-1] + "+00:00")
            else:
                # Fallback for different timestamp formats
                dt = datetime.strptime(timestamp_str, "%Y-%m-%dT%H:%M:%S.%f")

        # Format nicely for display
        return dt.strftime("%B %d, %Y at %I:%M %p")
    except (TypeError, ValueError) as e:
        print(f"Error formatting timestamp: {e}")
        return timestamp_str
|
104 |
-
|
105 |
-
def create_leaderboard_data(selected_filters=None):
    """Fetch the leaderboard from the local API and build the display table.

    Args:
        selected_filters: Optional quick-filter labels; when given, only
            matching models are shown.

    Returns:
        A ``(table, timestamp)`` tuple. ``table`` is the styled leaderboard,
        or an empty DataFrame with the expected columns when the API cannot
        be reached or its payload cannot be processed; ``timestamp`` is the
        formatted update time or "Unknown".
    """
    columns = ["Rank", "Type", "Model", "Average", "IFEval", "BBHI", "MATH", "GPQA", "MUJB", "MMLU-PRO", "CO_Cost"]
    # Keys in the "scores" mapping, in the order of the score columns above.
    score_keys = ["average", "ifeval", "bbhi", "math", "gpqa", "mujb", "mmlu"]

    try:
        response = requests.get("http://localhost:8000/api/leaderboard", timeout=10)
        if response.status_code != 200:
            # Return an empty dataframe with proper columns if API fails
            return pd.DataFrame(columns=columns), "Unknown"

        data = response.json()
        models_data = data.get("models", [])
        formatted_time = format_timestamp(data.get("updated_at", "Unknown"))

        # Apply filters if any are selected
        if selected_filters:
            models_data = apply_filters(selected_filters, models_data)

        rows = []
        for rank, model in enumerate(models_data, 1):
            model_name = model["model_name"]
            model_type = model["type"]
            scores = model["scores"]
            co2_cost = model.get("co2_cost", "N/A")
            if co2_cost is None:
                # An explicit null from the API is shown like a missing value.
                co2_cost = "N/A"

            # Only use green for open and red for closed
            emoji = "🟢" if model_type.lower() == "open" else "🔴"

            # Use model_link from API if available, otherwise create one
            target = model.get("model_link")
            if not target:
                if "/" in model_name:
                    target = f"https://huggingface.co/{quote(model_name)}"
                else:
                    target = f"https://huggingface.co/models?search={quote(model_name)}"

            rows.append([
                rank,
                f"{emoji} {model_type.upper()}",
                f"[{model_name}]({target})",
                *(f"{scores.get(key, 0):.2f}" for key in score_keys),
                f"{co2_cost}" if isinstance(co2_cost, (int, float)) else co2_cost,
            ])

        df = pd.DataFrame(rows, columns=columns)
        return style_dataframe(df), formatted_time
    except Exception as e:
        # Deliberately broad: the payload shape is not guaranteed, and the UI
        # should render an empty table rather than fail.
        print(f"Error fetching leaderboard data: {e}")
        return pd.DataFrame(columns=columns), "Unknown"
|
165 |
-
|
166 |
-
def load_svg(file_path="hf.svg"):
    """Read an SVG file and return its text.

    A relative ``file_path`` is looked up next to this script first and then
    relative to the working directory; absolute paths are used as given.

    Raises:
        OSError: If the file cannot be opened.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    candidate = os.path.join(script_dir, file_path)
    svg_path = candidate if os.path.exists(candidate) else file_path
    with open(svg_path, encoding="utf-8") as f:
        return f.read()
|
170 |
-
|
171 |
-
def get_filter_data():
    """Return the quick-filter labels with their current counts.

    Counts come from the local API's /api/filters endpoint; every count is 0
    when the API is unreachable, returns a non-200 status, or returns an
    unexpected payload.

    Returns:
        A list of five labels such as "For Edge Devices · 3".
    """
    # Display label paired with the key of its count in the API payload.
    labels = [
        ("For Edge Devices", "edge_devices"),
        ("For Consumers", "consumers"),
        ("Mid-range", "midrange"),
        ("For the GPU-rich", "gpu_rich"),
        ("Only Official Providers", "official_providers"),
    ]

    filter_data = {}
    try:
        response = requests.get("http://localhost:8000/api/filters", timeout=10)
        if response.status_code == 200:
            payload = response.json()
            if isinstance(payload, dict):
                filter_data = payload
    except Exception as e:
        # Best effort: the UI should still render with zero counts.
        print(f"Error fetching filter data: {e}")

    return [f"{label} · {filter_data.get(key, 0)}" for label, key in labels]
|
200 |
-
|
201 |
-
def refresh_leaderboard(selected_filters=None):
    """Force an API refresh and return updates for the table, filters, and status.

    Args:
        selected_filters: Currently selected quick-filter labels, if any.

    Returns:
        A 3-tuple for the leaderboard table, the filter checkbox group, and
        the status text. On failure the table and filters are left as they
        are and the status reports the error.
    """
    try:
        # Request a refresh from the API
        requests.get("http://localhost:8000/api/leaderboard?refresh=true", timeout=30)
        # Get updated data
        df, timestamp = create_leaderboard_data(selected_filters)
        filter_choices = get_filter_data()

        # Labels embed a count ("Mid-range · 4") that may have changed, so the
        # current selection is carried over by the text before the separator.
        selected_labels = {option.split(" · ")[0] for option in (selected_filters or [])}
        still_selected = [
            choice for choice in filter_choices
            if choice.split(" · ")[0] in selected_labels
        ]

        # Returning a bare list would set the group's selected value; an update
        # object is needed to replace its choices.
        filters_update = gr.update(choices=filter_choices, value=still_selected)
        return df, filters_update, f"Last updated: {timestamp}"
    except Exception as e:
        print(f"Error refreshing data: {e}")
        # Empty updates keep what is displayed instead of clearing it.
        return gr.update(), gr.update(), "Error refreshing data"
|
212 |
-
|
213 |
-
def update_table(filters):
    """Rebuild the table for the given filters and return it with the status text."""
    table, updated = create_leaderboard_data(filters)
    status = f"Last updated: {updated}"
    return table, status
|
216 |
-
|
217 |
-
def load_css(file_path="style.css"):
    """Read a stylesheet located relative to this script.

    Args:
        file_path: Path joined onto this script's directory; an absolute
            path is used as given.

    Returns:
        The stylesheet text, or a small built-in fallback stylesheet when the
        file cannot be read (the problem is printed).
    """
    try:
        current_dir = os.path.dirname(os.path.abspath(__file__))
        css_path = os.path.join(current_dir, file_path)
        with open(css_path, encoding="utf-8") as f:
            return f.read()
    except (OSError, UnicodeDecodeError) as e:
        print(f"Error loading CSS file: {e}")
        # Return a basic CSS if file not found
        return """
        .dataframe-container {
            border-radius: 8px;
            box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
        }
        """
|
233 |
-
|
234 |
-
# NOTE(review): the nesting below was reconstructed from text whose indentation
# was lost; confirm which components belong inside each gr.Row.
with gr.Blocks(css=load_css()) as demo:
    # Initial table contents and timestamp, fetched once while the UI is built.
    df, timestamp = create_leaderboard_data()

    # Header: logo, title, subtitle, and the last-updated line.
    with gr.Row():
        svg_content = load_svg()
        gr.HTML(svg_content)
        gr.HTML("""
        <div style="display: flex; align-items: center; justify-content: center; margin-bottom: 10px;">
        <div class="leaderboard-title">Open LLM Leaderboard</div>
        </div>
        <div class="leaderboard-subtitle">Comparing Large Language Models in an open and reproducible way</div>
        """)
        status_text = gr.HTML(f"""<div style="text-align: center; margin-bottom: 10px;">Last updated: {timestamp}</div>""")

    # Quick filters; each label carries the count reported by the API.
    with gr.Row(elem_classes="filters-container"):
        filter_choices = get_filter_data()
        filters = gr.CheckboxGroup(
            label="Quick Filters",
            choices=filter_choices,
        )

    # Create and display the dataframe
    leaderboard_table = gr.Dataframe(
        value=df,
        headers=["Rank", "Type", "Model", "Average", "IFEval", "BBHI", "MATH", "GPQA", "MUJB", "MMLU-PRO", "CO_Cost"],
        datatype=["number", "str", "markdown", "str", "str", "str", "str", "str", "str", "str", "str"],
        elem_id="leaderboard-table",
        elem_classes="dataframe-container",
        interactive=False,
        max_height=600,
        show_search="search",
        show_copy_button=True,
        show_fullscreen_button=True,
        pinned_columns=2,
        column_widths=["5%", "10%", "35%", "7%", "7%", "7%", "7%", "7%", "7%", "7%", "6%"]
    )

    # Refresh reloads data from the API and updates table, filters, and status.
    refresh_btn = gr.Button("Refresh Data", elem_classes="refresh-btn")
    refresh_btn.click(refresh_leaderboard, inputs=[filters], outputs=[leaderboard_table, filters, status_text])

    # Changing the filter selection rebuilds the table and status line.
    filters.change(update_table, inputs=[filters], outputs=[leaderboard_table, status_text])
|
276 |
-
|
277 |
-
if __name__ == "__main__":
    api_process = start_api_server()
    try:
        demo.launch()
    finally:
        # Stop the API child process even when launch() raises or is interrupted.
        api_process.terminate()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
run_all.py
DELETED
@@ -1,68 +0,0 @@
|
|
1 |
-
#!/usr/bin/env python3
|
2 |
-
import subprocess
|
3 |
-
import sys
|
4 |
-
import os
|
5 |
-
import time
|
6 |
-
import signal
|
7 |
-
import atexit
|
8 |
-
|
9 |
-
def start_api_server():
    """Spawn api.py with the current interpreter and return its process handle."""
    script_dir = os.path.dirname(os.path.abspath(__file__))
    command = [sys.executable, "api.py"]
    api_process = subprocess.Popen(command, cwd=script_dir)
    print("API server started (PID:", api_process.pid, ")")
    return api_process
|
16 |
-
|
17 |
-
def start_gradio_app():
    """Spawn a child interpreter that imports run and launches its Gradio demo."""
    script_dir = os.path.dirname(os.path.abspath(__file__))
    # Inline program executed by the child interpreter via -c.
    launch_code = "import gradio as gr; import run; run.demo.launch()"
    gradio_process = subprocess.Popen(
        [sys.executable, "-c", launch_code],
        cwd=script_dir,
    )
    print("Gradio interface started (PID:", gradio_process.pid, ")")
    return gradio_process
|
24 |
-
|
25 |
-
def cleanup_processes(api_process, gradio_process):
    """Stop the API and Gradio child processes if they are still running.

    Each live process is asked to terminate and then waited for, so it is
    reaped; a process that does not exit within 5 seconds is killed.

    Args:
        api_process: Process handle of the API server, or None.
        gradio_process: Process handle of the Gradio interface, or None.
    """
    print("\nShutting down services...")

    services = (
        (api_process, "API server"),
        (gradio_process, "Gradio interface"),
    )
    for process, label in services:
        # Skip handles that were never created or have already exited.
        if not process or process.poll() is not None:
            continue

        process.terminate()
        try:
            process.wait(timeout=5)
        except subprocess.TimeoutExpired:
            # The process ignored the termination request; force it down.
            process.kill()
            process.wait()
        print(f"{label} terminated")
|
35 |
-
|
36 |
-
def main():
    """Start the API server and Gradio app, then block until one of them stops.

    Both child processes are cleaned up exactly once on exit, whether the
    exit is caused by a signal, Ctrl+C, or one of the services ending.
    """
    api_process = start_api_server()
    # Give the API server a moment to start
    time.sleep(2)

    gradio_process = start_gradio_app()

    # Register cleanup function to be called on exit
    atexit.register(cleanup_processes, api_process, gradio_process)

    def signal_handler(sig, frame):
        print("\nReceived termination signal")
        # sys.exit runs the atexit hook above; cleaning up here as well would
        # shut the services down twice.
        sys.exit(0)

    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    print("\nLeaderboard application started!")
    print("- API server running at http://localhost:8000")
    print("- Gradio interface running at http://localhost:7860")
    print("\nPress Ctrl+C to stop all services")

    # Keep the main process running while both services are alive, so a
    # crashed service ends the launcher instead of leaving it idle.
    try:
        while api_process.poll() is None and gradio_process.poll() is None:
            time.sleep(1)
    except KeyboardInterrupt:
        pass
|
66 |
-
|
67 |
-
if __name__ == "__main__":
    # Start both services when this file is executed as a script.
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
style.css
DELETED
@@ -1,44 +0,0 @@
|
|
1 |
-
.html-container {
|
2 |
-
text-align: center;
|
3 |
-
display: flex;
|
4 |
-
justify-content: center;
|
5 |
-
width: 100%;
|
6 |
-
}
|
7 |
-
|
8 |
-
.dataframe-container {
|
9 |
-
margin-top: 0.5rem;
|
10 |
-
margin-bottom: 0.5rem;
|
11 |
-
}
|
12 |
-
|
13 |
-
.leaderboard-title {
|
14 |
-
font-size: 1.5rem;
|
15 |
-
font-weight: bold;
|
16 |
-
margin-bottom: 0.25rem;
|
17 |
-
color: #f0f0f0;
|
18 |
-
}
|
19 |
-
|
20 |
-
.leaderboard-subtitle {
|
21 |
-
font-size: 0.9rem;
|
22 |
-
margin-bottom: 1rem;
|
23 |
-
color: #a0a0a0;
|
24 |
-
}
|
25 |
-
|
26 |
-
.filters-container fieldset {
|
27 |
-
display: flex;
|
28 |
-
flex-direction: row;
|
29 |
-
justify-content: center;
|
30 |
-
align-items: center;
|
31 |
-
gap: 0.5rem;
|
32 |
-
}
|
33 |
-
|
34 |
-
.refresh-btn {
|
35 |
-
margin-top: 0.5rem;
|
36 |
-
}
|
37 |
-
|
38 |
-
.status-container {
|
39 |
-
display: flex;
|
40 |
-
justify-content: flex-end;
|
41 |
-
font-size: 0.75rem;
|
42 |
-
color: #a0a0a0;
|
43 |
-
}
|
44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|