Commit bccaf50
Parent: 4b45492
Commit message: update

Files changed:
- app.py (+37 -43)
- src/about.py (+68 -34)
- src/display/formatting.py (+45 -0)
- src/display/utils.py (+51 -52)
- src/envs.py (+7 -7)
- src/leaderboard/read_evals.py (+123 -111)
- src/populate.py (+93 -51)
- src/submission/check_validity.py (+94 -80)
- src/submission/submit.py (+86 -111)
app.py
CHANGED
@@ -19,10 +19,10 @@ from src.display.utils import (
    EVAL_COLS,
    EVAL_TYPES,
    AutoEvalColumn,
    LibraryType,
    fields,
    Language,
    AssessmentStatus
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_evaluation_queue_df, get_leaderboard_df

@@ -68,23 +68,23 @@ def init_leaderboard(dataframe):
            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
            label="Select Columns to Display:",
        ),
        search_columns=[AutoEvalColumn.library.name, AutoEvalColumn.license.name],
        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
        filter_columns=[
            ColumnFilter(AutoEvalColumn.library_type.name, type="checkboxgroup", label="Library types"),
            ColumnFilter(AutoEvalColumn.language.name, type="checkboxgroup", label="Programming Language"),
            ColumnFilter(
                AutoEvalColumn.stars.name,
                type="slider",
                min=0,
                max=50000,
                label="GitHub Stars",
            ),
            ColumnFilter(
                AutoEvalColumn.availability.name, type="boolean", label="Show only active libraries", default=True
            ),
        ],
        bool_checkboxgroup_label="Filter libraries",
        interactive=False,
    )

@@ -95,20 +95,20 @@ with demo:
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏆 Vulnerability Leaderboard", elem_id="vulnerability-leaderboard-tab", id=0):
            leaderboard = init_leaderboard(LEADERBOARD_DF)

        with gr.TabItem("📝 About", elem_id="about-tab", id=2):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

        with gr.TabItem("🚀 Submit Library", elem_id="submit-library-tab", id=3):
            with gr.Column():
                with gr.Row():
                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

                with gr.Column():
                    with gr.Accordion(
                        f"✅ Completed Assessments ({len(finished_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():

@@ -119,7 +119,7 @@ with demo:
                            row_count=5,
                        )
                    with gr.Accordion(
                        f"🔄 In Progress Assessments ({len(running_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():

@@ -131,7 +131,7 @@ with demo:
                        )

                    with gr.Accordion(
                        f"⏳ Pending Assessment Queue ({len(pending_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():

@@ -142,48 +142,42 @@ with demo:
                            row_count=5,
                        )
            with gr.Row():
                gr.Markdown("# ✉️✨ Submit a library for vulnerability assessment", elem_classes="markdown-text")

            with gr.Row():
                with gr.Column():
                    library_name_textbox = gr.Textbox(label="Library name (org/repo format)")
                    library_version_textbox = gr.Textbox(label="Version", placeholder="v1.0.0")
                    library_type = gr.Dropdown(
                        choices=[t.to_str(" : ") for t in LibraryType if t != LibraryType.Unknown],
                        label="Library type",
                        multiselect=False,
                        value=None,
                        interactive=True,
                    )

                with gr.Column():
                    language = gr.Dropdown(
                        choices=[i.value.name for i in Language if i != Language.Other],
                        label="Programming Language",
                        multiselect=False,
                        value="Python",
                        interactive=True,
                    )
                    framework = gr.Textbox(label="Framework/Ecosystem (e.g., PyTorch, React)")
                    repository_url = gr.Textbox(label="Repository URL")

            submit_button = gr.Button("Submit for Assessment")
            submission_result = gr.Markdown()
            submit_button.click(
                add_new_eval,
                [
                    library_name_textbox,
                    library_version_textbox,
                    repository_url,
                    language,
                    framework,
                    library_type,
                ],
                submission_result,
            )
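The "Library type" dropdown builds its choices from the LibraryType enum defined in src/display/utils.py (shown further down in this commit). A minimal sketch of the round trip from choice string back to enum member; the printed value is inferred from that enum definition:

```python
from src.display.utils import LibraryType

# Choices offered in the "Library type" dropdown (Unknown is excluded)
choices = [t.to_str(" : ") for t in LibraryType if t != LibraryType.Unknown]
print(choices[0])  # e.g. "🟢 : machine learning"

# On submission, the selected string is mapped back to the enum member
selected = choices[0]
assert LibraryType.from_str(selected) == LibraryType.ML
```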
src/about.py
CHANGED
@@ -11,62 +11,96 @@ class Task:
# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    # Risk domains from LibVulnWatch paper
    license = Task("license_validation", "score", "License Risk")
    security = Task("security_assessment", "score", "Security Risk")
    maintenance = Task("maintenance_health", "score", "Maintenance Risk")
    dependency = Task("dependency_management", "score", "Dependency Risk")
    regulatory = Task("regulatory_compliance", "score", "Regulatory Risk")

NUM_FEWSHOT = 0  # Not relevant for vulnerability assessment
# ---------------------------------------------------


# Your leaderboard name
TITLE = """<h1 align="center" id="space-title">LibVulnWatch: Vulnerability Assessment Leaderboard</h1>"""

# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
## Systematic Vulnerability Assessment and Leaderboard Tracking for Open-Source AI Libraries

This leaderboard provides continuous vulnerability assessment for open-source AI libraries across five critical risk domains:
- **License Validation**: Legal risks based on license type, compatibility, and requirements
- **Security Assessment**: Vulnerability severity and patch responsiveness
- **Maintenance Health**: Sustainability and governance practices
- **Dependency Management**: Vulnerability inheritance and supply chain security
- **Regulatory Compliance**: Compliance readiness for various frameworks

Lower scores indicate fewer vulnerabilities and lower risk. The overall risk score is a weighted average of all domains, with security given higher priority.
"""

# Which evaluations are you running? how can people reproduce what you have?
LLM_BENCHMARKS_TEXT = f"""
## How LibVulnWatch Works

Our assessment methodology evaluates libraries through:
1. **Static Analysis**: Code review, license parsing, and documentation examination
2. **Dynamic Analysis**: Vulnerability scanning, dependency checking, and API testing
3. **Metadata Analysis**: Repository metrics, contributor patterns, and release cadence

Each library receives a risk score (0-10) in each domain, with lower scores indicating lower risk.

## Reproducibility
To reproduce our assessment for a specific library:
```python
from libvulnwatch import VulnerabilityAssessor

# Initialize the assessor
assessor = VulnerabilityAssessor()

# Run assessment on a library
results = assessor.assess_library("organization/library_name")

# View detailed results
print(results.risk_scores)
print(results.detailed_findings)
```
"""

EVALUATION_QUEUE_TEXT = """
## Before submitting a library for assessment

### 1) Ensure your library is publicly accessible
LibVulnWatch can only assess libraries that are publicly available on GitHub or another accessible repository.

### 2) Verify complete metadata is available
Our assessment relies on metadata including:
- License information
- Dependency specifications
- Maintenance history and contributor information
- Security policies and vulnerability handling processes

### 3) Make sure your repository has an open license
This leaderboard is designed for open-source AI libraries, which should have clear licensing terms.

### 4) Add security documentation
Libraries with comprehensive security documentation tend to receive better assessments.

## If your assessment fails
If your library shows as "FAILED" in the assessment queue, check that:
- The repository is publicly accessible
- All required metadata files are present
- Dependencies can be resolved
- The repository doesn't employ obfuscation techniques that interfere with analysis
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""
@article{LibVulnWatch2025,
  title={LibVulnWatch: Systematic Vulnerability Assessment and Leaderboard Tracking for Open-Source AI Libraries},
  author={First Author and Second Author},
  journal={ICML 2025 Technical AI Governance Workshop},
  year={2025}
}
"""
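A quick worked example of the weighted overall risk described in INTRODUCTION_TEXT, using the weights applied in src/leaderboard/read_evals.py (security 0.3, regulatory 0.1, all other domains 0.2); the per-domain scores here are made up for illustration:

```python
# Hypothetical per-domain risk scores (0-10, lower is better) for one library
scores = {
    "license_validation": 2.0,
    "security_assessment": 5.0,
    "maintenance_health": 3.0,
    "dependency_management": 4.0,
    "regulatory_compliance": 1.0,
}
# Weights from AssessmentResult.to_dict() in src/leaderboard/read_evals.py
weights = {
    "license_validation": 0.2,
    "security_assessment": 0.3,
    "maintenance_health": 0.2,
    "dependency_management": 0.2,
    "regulatory_compliance": 0.1,
}
overall = sum(scores[d] * w for d, w in weights.items()) / sum(weights.values())
print(overall)  # 0.4 + 1.5 + 0.6 + 0.8 + 0.1 = 3.4 -> falls in the "low risk" band
```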
src/display/formatting.py
CHANGED
@@ -1,3 +1,7 @@
"""Helper functions to style our gradio elements"""

import re

def model_hyperlink(link, model_name):
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'

@@ -25,3 +29,44 @@ def has_no_nan_values(df, columns):

def has_nan_values(df, columns):
    return df[columns].isna().any(axis=1)


def make_clickable_library(library_name: str) -> str:
    """Link to the GitHub repository"""
    library_path = library_name.replace(" ", "-").lower()

    # If this is a GitHub repository, link directly
    github_url = f"https://github.com/{library_path}"

    return f'<a href="{github_url}" target="_blank">{library_name}</a>'


def styled_message(message) -> str:
    """Format a message with a green header"""
    return f'<span style="color: green">✅ Success:</span> {message}'


def styled_warning(message) -> str:
    """Format a warning message with an orange header"""
    return f'<span style="color: orange">⚠️ Warning:</span> {message}'


def styled_error(message) -> str:
    """Format an error message with a red header"""
    return f'<span style="color: red">❌ Error:</span> {message}'


# Risk severity coloring for risk scores
def colorize_risk_score(score):
    """
    Apply color coding to risk scores:
    0-3.9: Green (Low risk)
    4-6.9: Orange (Medium risk)
    7-10: Red (High risk)
    """
    if score < 4:
        return f'<span style="color: green">{score:.1f}</span>'
    elif score < 7:
        return f'<span style="color: orange">{score:.1f}</span>'
    else:
        return f'<span style="color: red">{score:.1f}</span>'
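A short usage sketch for the helpers above; the HTML strings shown in the comments are what the functions return as written:

```python
from src.display.formatting import make_clickable_library, colorize_risk_score, styled_warning

print(make_clickable_library("huggingface/transformers"))
# '<a href="https://github.com/huggingface/transformers" target="_blank">huggingface/transformers</a>'

print(colorize_risk_score(3.2))  # green span  (< 4: low risk)
print(colorize_risk_score(6.5))  # orange span (< 7: medium risk)
print(colorize_risk_score(8.4))  # red span    (high risk)

print(styled_warning("Repository not found"))  # orange "⚠️ Warning:" prefix
```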
src/display/utils.py
CHANGED
@@ -23,22 +23,22 @@ class ColumnContent:
## Leaderboard columns
auto_eval_column_dict = []
# Init
auto_eval_column_dict.append(["library_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
auto_eval_column_dict.append(["library", ColumnContent, ColumnContent("Library", "markdown", True, never_hidden=True)])
# Scores
auto_eval_column_dict.append(["overall_risk", ColumnContent, ColumnContent("Overall Risk ⬇️", "number", True)])
for task in Tasks:
    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
# Library information
auto_eval_column_dict.append(["library_type", ColumnContent, ColumnContent("Type", "str", False)])
auto_eval_column_dict.append(["framework", ColumnContent, ColumnContent("Framework", "str", False)])
auto_eval_column_dict.append(["version", ColumnContent, ColumnContent("Version", "str", False, False)])
auto_eval_column_dict.append(["language", ColumnContent, ColumnContent("Language", "str", False)])
auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("License", "str", True)])
auto_eval_column_dict.append(["stars", ColumnContent, ColumnContent("GitHub ⭐", "number", False)])
auto_eval_column_dict.append(["last_update", ColumnContent, ColumnContent("Last Updated", "str", False)])
auto_eval_column_dict.append(["verified", ColumnContent, ColumnContent("Independently Verified", "bool", False)])
auto_eval_column_dict.append(["availability", ColumnContent, ColumnContent("Active Maintenance", "bool", True)])

# We use make dataclass to dynamically fill the scores from Tasks
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

@@ -46,59 +46,58 @@ AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
## For the queue columns in the submission tab
@dataclass(frozen=True)
class EvalQueueColumn:  # Queue column
    library = ColumnContent("library", "markdown", True)
    version = ColumnContent("version", "str", True)
    language = ColumnContent("language", "str", True)
    framework = ColumnContent("framework", "str", True)
    library_type = ColumnContent("library_type", "str", True)
    status = ColumnContent("status", "str", True)

## All the library information that we might need
@dataclass
class LibraryDetails:
    name: str
    display_name: str = ""
    symbol: str = ""  # emoji


class LibraryType(Enum):
    ML = LibraryDetails(name="machine learning", symbol="🟢")
    LLM = LibraryDetails(name="llm framework", symbol="🔶")
    AGENT = LibraryDetails(name="agent framework", symbol="⭕")
    VIS = LibraryDetails(name="visualization", symbol="🟦")
    GENERAL = LibraryDetails(name="general ai", symbol="🟣")
    Unknown = LibraryDetails(name="", symbol="?")

    def to_str(self, separator=" "):
        return f"{self.value.symbol}{separator}{self.value.name}"

    @staticmethod
    def from_str(type):
        if "machine learning" in type or "🟢" in type:
            return LibraryType.ML
        if "llm framework" in type or "🔶" in type:
            return LibraryType.LLM
        if "agent framework" in type or "⭕" in type:
            return LibraryType.AGENT
        if "visualization" in type or "🟦" in type:
            return LibraryType.VIS
        if "general ai" in type or "🟣" in type:
            return LibraryType.GENERAL
        return LibraryType.Unknown


class Language(Enum):
    Python = LibraryDetails("Python")
    JavaScript = LibraryDetails("JavaScript")
    TypeScript = LibraryDetails("TypeScript")
    Java = LibraryDetails("Java")
    CPP = LibraryDetails("C++")
    Other = LibraryDetails("Other")


class AssessmentStatus(Enum):
    Verified = LibraryDetails("Verified")
    Unverified = LibraryDetails("Unverified")
    Disputed = LibraryDetails("Disputed")

# Column selection
COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
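Because AutoEvalColumn is built with make_dataclass, each attribute's default value is a ColumnContent whose .name holds the display label; that is how app.py and read_evals.py look up column headers. A small sketch, with the printed values following from the definitions above:

```python
from src.display.utils import AutoEvalColumn, COLS

# Attribute access returns the ColumnContent default; .name is the display label
print(AutoEvalColumn.overall_risk.name)  # "Overall Risk ⬇️"
print(AutoEvalColumn.stars.name)         # "GitHub ⭐"

# COLS collects the display names of every non-hidden leaderboard column
print(COLS)
```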
src/envs.py
CHANGED
@@ -6,20 +6,20 @@ from huggingface_hub import HfApi
# ----------------------------------
TOKEN = os.environ.get("HF_TOKEN")  # A read/write token for your org

OWNER = "libvulnwatch"  # Change to your org - don't forget to create a results and request dataset, with the correct format!
# ----------------------------------

REPO_ID = f"{OWNER}/leaderboard"
QUEUE_REPO = f"{OWNER}/vulnerability-requests"
RESULTS_REPO = f"{OWNER}/vulnerability-assessments"

# If you setup a cache later, just change HF_HOME
CACHE_PATH = os.getenv("HF_HOME", ".")

# Local caches
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "assessment-queue")
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "assessment-results")
EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "assessment-queue-bk")
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "assessment-results-bk")

API = HfApi(token=TOKEN)
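With OWNER set to "libvulnwatch", the settings above resolve as follows (assuming HF_HOME is unset, so CACHE_PATH is "."); listed only to make the wiring explicit:

```python
# Derived values under the assumptions stated above
REPO_ID = "libvulnwatch/leaderboard"                   # the Space itself
QUEUE_REPO = "libvulnwatch/vulnerability-requests"     # assessment request dataset
RESULTS_REPO = "libvulnwatch/vulnerability-assessments"  # assessment result dataset

EVAL_REQUESTS_PATH = "./assessment-queue"    # local copy of the request queue
EVAL_RESULTS_PATH = "./assessment-results"   # local copy of the results
```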
src/leaderboard/read_evals.py
CHANGED
@@ -3,144 +3,156 @@ import json
import math
import os
from dataclasses import dataclass
from datetime import datetime

import numpy as np

from src.display.formatting import make_clickable_library
from src.display.utils import AutoEvalColumn, LibraryType, Tasks, Language, AssessmentStatus


@dataclass
class AssessmentResult:
    """Represents one full vulnerability assessment. Built from a combination of the result and request file for a given library.
    """
    assessment_id: str  # Unique identifier
    library_name: str  # org/repo
    org: str
    repo: str
    version: str
    results: dict  # Risk scores
    framework: str = ""
    language: Language = Language.Other
    library_type: LibraryType = LibraryType.Unknown
    license: str = "?"
    stars: int = 0
    last_update: str = ""
    availability: bool = True
    verified: bool = False

    @classmethod
    def init_from_json_file(self, json_filepath):
        """Initializes the assessment result from a JSON file"""
        with open(json_filepath) as fp:
            data = json.load(fp)

        assessment = data.get("assessment", {})
        # Get library and org
        library_name = assessment.get("library_name", "")
        org_and_repo = library_name.split("/", 1)

        if len(org_and_repo) == 1:
            org = None
            repo = org_and_repo[0]
            assessment_id = f"{repo}_{assessment.get('version', '')}"
        else:
            org = org_and_repo[0]
            repo = org_and_repo[1]
            assessment_id = f"{org}_{repo}_{assessment.get('version', '')}"

        # Extract risk scores
        risk_scores = {}
        for task in Tasks:
            domain = task.value
            score = assessment.get("scores", {}).get(domain.benchmark, None)
            if score is not None:
                risk_scores[domain.benchmark] = score

        # Library metadata
        framework = assessment.get("framework", "")
        language_str = assessment.get("language", "Other")
        language = next((lang for lang in Language if lang.value.name == language_str), Language.Other)

        # Availability and verification
        last_update = assessment.get("last_updated", "")
        if last_update:
            try:
                # Format date for display
                dt = datetime.fromisoformat(last_update)
                last_update = dt.strftime("%Y-%m-%d")
            except:
                pass

        return self(
            assessment_id=assessment_id,
            library_name=library_name,
            org=org,
            repo=repo,
            version=assessment.get("version", ""),
            results=risk_scores,
            framework=framework,
            language=language,
            license=assessment.get("license", "?"),
            availability=assessment.get("active_maintenance", True),
            verified=assessment.get("independently_verified", False),
            last_update=last_update,
        )

    def update_with_request_file(self, requests_path):
        """Finds the relevant request file for the current library and updates info with it"""
        request_file = get_request_file_for_library(requests_path, self.library_name, self.version)

        try:
            with open(request_file, "r") as f:
                request = json.load(f)
            self.library_type = LibraryType.from_str(request.get("library_type", ""))
            self.stars = request.get("stars", 0)
        except Exception:
            print(f"Could not find request file for {self.library_name} version {self.version}")

    def to_dict(self):
        """Converts the Assessment Result to a dict compatible with our dataframe display"""
        # Calculate overall risk as weighted average
        weights = {
            "license_validation": 0.2,
            "security_assessment": 0.3,
            "maintenance_health": 0.2,
            "dependency_management": 0.2,
            "regulatory_compliance": 0.1
        }

        # Calculate overall risk - if domain is missing, use highest risk score (10)
        risk_sum = 0
        weight_sum = 0

        for domain, weight in weights.items():
            score = self.results.get(domain, 10)  # Default to highest risk if missing
            risk_sum += score * weight
            weight_sum += weight

        overall_risk = risk_sum / weight_sum if weight_sum > 0 else 10

        data_dict = {
            "assessment_id": self.assessment_id,  # not a column, just a save name
            AutoEvalColumn.library_type.name: self.library_type.value.name,
            AutoEvalColumn.library_type_symbol.name: self.library_type.value.symbol,
            AutoEvalColumn.language.name: self.language.value.name,
            AutoEvalColumn.framework.name: self.framework,
            AutoEvalColumn.library.name: make_clickable_library(self.library_name),
            AutoEvalColumn.version.name: self.version,
            AutoEvalColumn.overall_risk.name: overall_risk,
            AutoEvalColumn.license.name: self.license,
            AutoEvalColumn.stars.name: self.stars,
            AutoEvalColumn.last_update.name: self.last_update,
            AutoEvalColumn.verified.name: self.verified,
            AutoEvalColumn.availability.name: self.availability,
        }

        for task in Tasks:
            data_dict[task.name] = self.results.get(task.value.benchmark, 10)  # Default to highest risk

        return data_dict


def get_request_file_for_library(requests_path, library_name, version):
    """Selects the correct request file for a given library. Only keeps runs tagged as FINISHED"""
    request_files = os.path.join(
        requests_path,
        f"{library_name.replace('/', '_')}_eval_request_*.json",
    )
    request_files = glob.glob(request_files)

    # Select correct request file (version)
    request_file = ""
    request_files = sorted(request_files, reverse=True)
    for tmp_request_file in request_files:

@@ -148,45 +160,45 @@ def get_request_file_for_model(requests_path, model_name, precision):
            req_content = json.load(f)
            if (
                req_content["status"] in ["FINISHED"]
                and req_content["version"] == version
            ):
                request_file = tmp_request_file
    return request_file


def get_raw_assessment_results(results_path: str, requests_path: str) -> list[AssessmentResult]:
    """From the path of the results folder root, extract all needed info for assessments"""
    assessment_filepaths = []

    for root, _, files in os.walk(results_path):
        # We should only have json files in assessment results
        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
            continue

        # Sort the files by date if they have date info
        try:
            files.sort(key=lambda x: datetime.fromisoformat(json.loads(open(os.path.join(root, x)).read())["assessment"]["completed_time"]), reverse=True)
        except:
            pass

        for file in files:
            assessment_filepaths.append(os.path.join(root, file))

    assessment_results = {}
    for assessment_filepath in assessment_filepaths:
        # Creation of result
        assessment_result = AssessmentResult.init_from_json_file(assessment_filepath)
        assessment_result.update_with_request_file(requests_path)

        # Store results of same eval together
        assessment_id = assessment_result.assessment_id
        if assessment_id in assessment_results.keys():
            assessment_results[assessment_id].results.update({k: v for k, v in assessment_result.results.items() if v is not None})
        else:
            assessment_results[assessment_id] = assessment_result

    results = []
    for v in assessment_results.values():
        try:
            v.to_dict()  # we test if the dict version is complete
            results.append(v)
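AssessmentResult.init_from_json_file only reads a handful of keys from the "assessment" object, so a result file along the following lines would parse; this is a hypothetical example assembled from the accessors in the code above, not a documented schema:

```python
import json

# Hypothetical result file matching the keys read by AssessmentResult.init_from_json_file
example = {
    "assessment": {
        "library_name": "example-org/example-lib",
        "version": "1.2.3",
        "completed_time": "2025-05-01T12:00:00",  # used only to sort result files
        "scores": {
            "license_validation": 2.0,
            "security_assessment": 5.5,
            "maintenance_health": 3.0,
            "dependency_management": 4.0,
            "regulatory_compliance": 1.5,
        },
        "framework": "PyTorch",
        "language": "Python",
        "license": "Apache-2.0",
        "active_maintenance": True,
        "independently_verified": False,
        "last_updated": "2025-04-28T09:30:00",
    }
}

with open("example_assessment.json", "w") as fp:
    json.dump(example, fp, indent=2)

# AssessmentResult.init_from_json_file("example_assessment.json") would yield
# results keyed by benchmark name and assessment_id "example-org_example-lib_1.2.3".
```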
src/populate.py
CHANGED
@@ -1,58 +1,100 @@
"""Functions to populate the leaderboard"""

import pandas as pd

from src.display.utils import AutoEvalColumn
from src.leaderboard.read_evals import get_raw_assessment_results


def get_leaderboard_df(eval_results_path, eval_requests_path, cols, benchmark_cols):
    """Read all the runs in the folder and return a dataframe

    Args:
        eval_results_path: Path to the assessment result files
        eval_requests_path: Path to the assessment request files
        cols: Columns names to include in the dataframe
        benchmark_cols: Risk categories column names

    Returns:
        Pandas dataframe for the leaderboard
    """
    try:
        assessment_results = get_raw_assessment_results(eval_results_path, eval_requests_path)

        # If we get results, convert to dataframe
        if len(assessment_results) > 0:
            # Create dataframe from assessment results
            all_df = pd.DataFrame.from_records([r.to_dict() for r in assessment_results])

            # Sort by overall risk score (ascending - lower is better)
            if AutoEvalColumn.overall_risk.name in all_df.columns:
                all_df = all_df.sort_values(by=[AutoEvalColumn.overall_risk.name])

            return all_df

        return pd.DataFrame(columns=cols)  # Empty dataframe with columns
    except Exception as e:
        print(f"Error reading evaluation results: {e}")
        return pd.DataFrame(columns=cols)  # Return empty dataframe


def get_evaluation_queue_df(eval_requests_path, eval_cols):
    """Read from the evaluation queue directory and return dataframes for each status

    Args:
        eval_requests_path: Path to the assessment request files
        eval_cols: Columns for the queue dataframes

    Returns:
        Tuple of dataframes (finished, running, pending)
    """
    try:
        import glob
        import json
        import os

        # Find all request files
        request_files = glob.glob(os.path.join(eval_requests_path, "*.json"))

        finished_data = []
        running_data = []
        pending_data = []

        for file_path in request_files:
            try:
                with open(file_path, "r") as f:
                    data = json.load(f)

                # Extract relevant fields
                row = {
                    "library": data.get("library", ""),
                    "version": data.get("version", ""),
                    "language": data.get("language", ""),
                    "framework": data.get("framework", ""),
                    "library_type": data.get("library_type", ""),
                    "status": data.get("status", "UNKNOWN")
                }

                # Add to appropriate dataframe based on status
                if row["status"] == "FINISHED":
                    finished_data.append(row)
                elif row["status"] == "RUNNING":
                    running_data.append(row)
                elif row["status"] == "PENDING":
                    pending_data.append(row)
            except Exception as e:
                print(f"Error reading request file {file_path}: {e}")
                continue

        # Convert to dataframes
        finished_df = pd.DataFrame(finished_data, columns=eval_cols)
        running_df = pd.DataFrame(running_data, columns=eval_cols)
        pending_df = pd.DataFrame(pending_data, columns=eval_cols)

        return finished_df, running_df, pending_df

    except Exception as e:
        print(f"Error reading evaluation queue: {e}")
        # Return empty dataframes
        empty_df = pd.DataFrame(columns=eval_cols)
        return empty_df.copy(), empty_df.copy(), empty_df.copy()
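A minimal sketch of how app.py is expected to call these helpers with the paths from src/envs.py (those call sites are unchanged in this commit and therefore not shown above); BENCHMARK_COLS is assumed to exist alongside COLS and EVAL_COLS in src/display/utils.py, as in the leaderboard template:

```python
from src.display.utils import COLS, BENCHMARK_COLS, EVAL_COLS
from src.envs import EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH
from src.populate import get_evaluation_queue_df, get_leaderboard_df

# Leaderboard table, sorted by overall risk (lower is better)
LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)

# Queue tables split by request status
(
    finished_eval_queue_df,
    running_eval_queue_df,
    pending_eval_queue_df,
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
```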
src/submission/check_validity.py
CHANGED
@@ -1,99 +1,113 @@
import json
import os
import re
import requests
from collections import defaultdict
from datetime import datetime, timedelta, timezone
from typing import Dict, Tuple, Any, List, Set


def is_repository_valid(repo_name: str, repo_url: str) -> Tuple[bool, str, Dict[str, Any]]:
    """
    Checks if a GitHub repository is valid and accessible.

    Args:
        repo_name: The name of the repository (org/repo format)
        repo_url: URL to the repository

    Returns:
        Tuple of (is_valid, error_message, library_info)
    """
    # Basic format validation
    if not repo_name or "/" not in repo_name:
        return False, "Repository name must be in the format 'organization/repository'", {}

    # Check if GitHub URL
    if repo_url and "github.com" in repo_url:
        # Extract org and repo from URL if provided
        try:
            parts = repo_url.split("github.com/")[1].split("/")
            org = parts[0]
            repo = parts[1].split(".")[0] if "." in parts[1] else parts[1]
            url_repo_name = f"{org}/{repo}"

            # Check if URL matches repo_name
            if url_repo_name != repo_name:
                return False, f"Repository name ({repo_name}) doesn't match the URL ({url_repo_name})", {}
        except:
            pass  # Fall back to using repo_name

    # Get repository information from GitHub API
    org, repo = repo_name.split("/")
    api_url = f"https://api.github.com/repos/{org}/{repo}"

    try:
        response = requests.get(api_url)
        if response.status_code != 200:
            return False, f"Repository not found or not accessible: {response.json().get('message', 'Unknown error')}", {}

        # Parse repository data
        repo_data = response.json()
        library_info = get_library_info(repo_data)

        return True, "", library_info

    except Exception as e:
        return False, f"Error accessing repository: {str(e)}", {}


def get_library_info(repo_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Extracts relevant information from GitHub repository data.

    Args:
        repo_data: GitHub API response for a repository

    Returns:
        Dictionary with library metadata
    """
    library_info = {
        "name": repo_data.get("name", ""),
        "full_name": repo_data.get("full_name", ""),
        "description": repo_data.get("description", ""),
        "stars": repo_data.get("stargazers_count", 0),
        "forks": repo_data.get("forks_count", 0),
        "license": repo_data.get("license", {}).get("name", "Unknown"),
        "created_at": repo_data.get("created_at", ""),
        "updated_at": repo_data.get("updated_at", ""),
        "open_issues": repo_data.get("open_issues_count", 0),
        "default_branch": repo_data.get("default_branch", "main"),
        "is_archived": repo_data.get("archived", False),
    }

    return library_info


def already_submitted_libraries(requested_libraries_dir: str) -> Tuple[Set[str], Dict[str, List[str]]]:
    """
    Gathers a list of already submitted libraries to avoid duplicates.

    Args:
        requested_libraries_dir: Directory with library assessment requests

    Returns:
        Tuple of (set of library identifiers, dict mapping orgs to submission dates)
    """
    depth = 1
    library_ids = []
    orgs_to_submission_dates = defaultdict(list)

    for root, _, files in os.walk(requested_libraries_dir):
        current_depth = root.count(os.sep) - requested_libraries_dir.count(os.sep)
        if current_depth == depth:
            for file in files:
                if not file.endswith(".json"):
                    continue
                with open(os.path.join(root, file), "r") as f:
                    info = json.load(f)
                    library_ids.append(f"{info['library']}_{info['version']}")

                    # Select organisation
                    if info["library"].count("/") == 0 or "submitted_time" not in info:
                        continue
                    organisation, _ = info["library"].split("/")
                    orgs_to_submission_dates[organisation].append(info["submitted_time"])

    return set(library_ids), orgs_to_submission_dates
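A short usage sketch for the validation helper above; note that unauthenticated GitHub API calls are rate-limited, so treat the output as illustrative:

```python
from src.submission.check_validity import is_repository_valid

is_valid, error_message, library_info = is_repository_valid(
    "huggingface/transformers",
    "https://github.com/huggingface/transformers",
)

if is_valid:
    # Metadata pulled straight from the GitHub API response by get_library_info
    print(library_info["stars"], library_info["license"], library_info["is_archived"])
else:
    print(f"Rejected: {error_message}")
```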
src/submission/submit.py
CHANGED
@@ -1,119 +1,94 @@
import json
import os
import re
import time
import uuid
from datetime import datetime
from pathlib import Path

import huggingface_hub
import requests
from huggingface_hub import HfApi

from src.display.utils import LibraryType, Language, AssessmentStatus
from src.envs import API, EVAL_REQUESTS_PATH, QUEUE_REPO, TOKEN
from src.submission.check_validity import is_repository_valid, get_library_info


def add_new_eval(
    library_name,
    library_version,
    repository_url,
    language,
    framework,
    library_type_str,
) -> str:
    """
    Adds a new library to the assessment queue.

    Args:
        library_name: Name of the library (org/repo format)
        library_version: Version of the library
        repository_url: URL to the repository
        language: Programming language
        framework: Related framework/ecosystem
        library_type_str: Type of AI library

    Returns:
        A message indicating the status of the submission
    """
    # Check if valid repository
    is_valid, validity_message, library_info = is_repository_valid(library_name, repository_url)

    if not is_valid:
        return f"⚠️ Invalid submission: {validity_message}"

    # Parse library type
    library_type = LibraryType.from_str(library_type_str)
    if library_type == LibraryType.Unknown:
        return "⚠️ Please select a valid library type."

    # Create a unique identifier for the submission
    uid = uuid.uuid4().hex[:6]
    timestamp = datetime.now().isoformat()
    request_filename = f"{library_name.replace('/', '_')}_eval_request_{timestamp}_{uid}.json"

    # Stars count and license info from library_info if available
    stars = library_info.get("stars", 0)
    license_name = library_info.get("license", "unknown")

    # Create the assessment request JSON
    assessment_request = {
        "library": library_name,
        "version": library_version,
        "repository_url": repository_url,
        "language": language,
        "framework": framework,
        "library_type": library_type.value.name,
        "license": license_name,
        "stars": stars,
        "status": "PENDING",
        "submitted_time": timestamp,
        "last_updated": timestamp,
        "assessment_id": uid
    }

    # Save the request
    os.makedirs(EVAL_REQUESTS_PATH, exist_ok=True)
    with open(os.path.join(EVAL_REQUESTS_PATH, request_filename), "w") as f:
        json.dump(assessment_request, f, indent=2)

    try:
        # Push the file to the HF repo
        path = Path(os.path.join(EVAL_REQUESTS_PATH, request_filename))
        API.upload_file(
            path_or_fileobj=path,
            path_in_repo=request_filename,
            repo_id=QUEUE_REPO,
            repo_type="dataset",
        )

        return f"✅ Library '{library_name}' (version {library_version}) has been added to the assessment queue! Assessment ID: {uid}"

    except Exception as e:
        return f"Error uploading assessment request: {str(e)}"
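add_new_eval is wired to the Gradio submit button in app.py, but it can also be exercised directly; a hedged sketch with hypothetical argument values (the library type string must match one produced by LibraryType.to_str, and the call will hit the GitHub API and the Hugging Face queue dataset):

```python
from src.submission.submit import add_new_eval

message = add_new_eval(
    library_name="example-org/example-lib",   # org/repo, validated against the GitHub API
    library_version="v1.0.0",
    repository_url="https://github.com/example-org/example-lib",
    language="Python",
    framework="PyTorch",
    library_type_str="🟢 : machine learning",  # parsed back with LibraryType.from_str
)
print(message)  # success or warning text produced by the function above
```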