wu981526092 committed · Commit bccaf50 · 1 Parent(s): 4b45492
app.py CHANGED
@@ -19,10 +19,10 @@ from src.display.utils import (
19
  EVAL_COLS,
20
  EVAL_TYPES,
21
  AutoEvalColumn,
22
- ModelType,
23
  fields,
24
- WeightType,
25
- Precision
26
  )
27
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
28
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
@@ -68,23 +68,23 @@ def init_leaderboard(dataframe):
68
  cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
69
  label="Select Columns to Display:",
70
  ),
71
- search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
72
  hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
73
  filter_columns=[
74
- ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
75
- ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
76
  ColumnFilter(
77
- AutoEvalColumn.params.name,
78
  type="slider",
79
- min=0.01,
80
- max=150,
81
- label="Select the number of parameters (B)",
82
  ),
83
  ColumnFilter(
84
- AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
85
  ),
86
  ],
87
- bool_checkboxgroup_label="Hide models",
88
  interactive=False,
89
  )
90
 
@@ -95,20 +95,20 @@ with demo:
95
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
96
 
97
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
98
- with gr.TabItem("πŸ… LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
99
  leaderboard = init_leaderboard(LEADERBOARD_DF)
100
 
101
- with gr.TabItem("πŸ“ About", elem_id="llm-benchmark-tab-table", id=2):
102
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
103
 
104
- with gr.TabItem("πŸš€ Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
105
  with gr.Column():
106
  with gr.Row():
107
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
108
 
109
  with gr.Column():
110
  with gr.Accordion(
111
- f"βœ… Finished Evaluations ({len(finished_eval_queue_df)})",
112
  open=False,
113
  ):
114
  with gr.Row():
@@ -119,7 +119,7 @@ with demo:
119
  row_count=5,
120
  )
121
  with gr.Accordion(
122
- f"πŸ”„ Running Evaluation Queue ({len(running_eval_queue_df)})",
123
  open=False,
124
  ):
125
  with gr.Row():
@@ -131,7 +131,7 @@ with demo:
131
  )
132
 
133
  with gr.Accordion(
134
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
135
  open=False,
136
  ):
137
  with gr.Row():
@@ -142,48 +142,42 @@ with demo:
142
  row_count=5,
143
  )
144
  with gr.Row():
145
- gr.Markdown("# βœ‰οΈβœ¨ Submit your model here!", elem_classes="markdown-text")
146
 
147
  with gr.Row():
148
  with gr.Column():
149
- model_name_textbox = gr.Textbox(label="Model name")
150
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
151
- model_type = gr.Dropdown(
152
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
153
- label="Model type",
154
  multiselect=False,
155
  value=None,
156
  interactive=True,
157
  )
158
 
159
  with gr.Column():
160
- precision = gr.Dropdown(
161
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
162
- label="Precision",
163
  multiselect=False,
164
- value="float16",
165
  interactive=True,
166
  )
167
- weight_type = gr.Dropdown(
168
- choices=[i.value.name for i in WeightType],
169
- label="Weights type",
170
- multiselect=False,
171
- value="Original",
172
- interactive=True,
173
- )
174
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
175
 
176
- submit_button = gr.Button("Submit Eval")
177
  submission_result = gr.Markdown()
178
  submit_button.click(
179
  add_new_eval,
180
  [
181
- model_name_textbox,
182
- base_model_name_textbox,
183
- revision_name_textbox,
184
- precision,
185
- weight_type,
186
- model_type,
187
  ],
188
  submission_result,
189
  )
 
19
  EVAL_COLS,
20
  EVAL_TYPES,
21
  AutoEvalColumn,
22
+ LibraryType,
23
  fields,
24
+ Language,
25
+ AssessmentStatus
26
  )
27
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
28
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
 
68
  cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
69
  label="Select Columns to Display:",
70
  ),
71
+ search_columns=[AutoEvalColumn.library.name, AutoEvalColumn.license.name],
72
  hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
73
  filter_columns=[
74
+ ColumnFilter(AutoEvalColumn.library_type.name, type="checkboxgroup", label="Library types"),
75
+ ColumnFilter(AutoEvalColumn.language.name, type="checkboxgroup", label="Programming Language"),
76
  ColumnFilter(
77
+ AutoEvalColumn.stars.name,
78
  type="slider",
79
+ min=0,
80
+ max=50000,
81
+ label="GitHub Stars",
82
  ),
83
  ColumnFilter(
84
+ AutoEvalColumn.availability.name, type="boolean", label="Show only active libraries", default=True
85
  ),
86
  ],
87
+ bool_checkboxgroup_label="Filter libraries",
88
  interactive=False,
89
  )
90
 
 
95
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
96
 
97
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
98
+ with gr.TabItem("🏅 Vulnerability Leaderboard", elem_id="vulnerability-leaderboard-tab", id=0):
99
  leaderboard = init_leaderboard(LEADERBOARD_DF)
100
 
101
+ with gr.TabItem("📝 About", elem_id="about-tab", id=2):
102
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
103
 
104
+ with gr.TabItem("🚀 Submit Library", elem_id="submit-library-tab", id=3):
105
  with gr.Column():
106
  with gr.Row():
107
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
108
 
109
  with gr.Column():
110
  with gr.Accordion(
111
+ f"✅ Completed Assessments ({len(finished_eval_queue_df)})",
112
  open=False,
113
  ):
114
  with gr.Row():
 
119
  row_count=5,
120
  )
121
  with gr.Accordion(
122
+ f"🔄 In Progress Assessments ({len(running_eval_queue_df)})",
123
  open=False,
124
  ):
125
  with gr.Row():
 
131
  )
132
 
133
  with gr.Accordion(
134
+ f"⏳ Pending Assessment Queue ({len(pending_eval_queue_df)})",
135
  open=False,
136
  ):
137
  with gr.Row():
 
142
  row_count=5,
143
  )
144
  with gr.Row():
145
+ gr.Markdown("# ✉️✨ Submit a library for vulnerability assessment", elem_classes="markdown-text")
146
 
147
  with gr.Row():
148
  with gr.Column():
149
+ library_name_textbox = gr.Textbox(label="Library name (org/repo format)")
150
+ library_version_textbox = gr.Textbox(label="Version", placeholder="v1.0.0")
151
+ library_type = gr.Dropdown(
152
+ choices=[t.to_str(" : ") for t in LibraryType if t != LibraryType.Unknown],
153
+ label="Library type",
154
  multiselect=False,
155
  value=None,
156
  interactive=True,
157
  )
158
 
159
  with gr.Column():
160
+ language = gr.Dropdown(
161
+ choices=[i.value.name for i in Language if i != Language.Other],
162
+ label="Programming Language",
163
  multiselect=False,
164
+ value="Python",
165
  interactive=True,
166
  )
167
+ framework = gr.Textbox(label="Framework/Ecosystem (e.g., PyTorch, React)")
168
+ repository_url = gr.Textbox(label="Repository URL")
 
 
169
 
170
+ submit_button = gr.Button("Submit for Assessment")
171
  submission_result = gr.Markdown()
172
  submit_button.click(
173
  add_new_eval,
174
  [
175
+ library_name_textbox,
176
+ library_version_textbox,
177
+ repository_url,
178
+ language,
179
+ framework,
180
+ library_type,
181
  ],
182
  submission_result,
183
  )
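Note on the new submission wiring above: the click handler passes the six Gradio components to `add_new_eval` positionally, so the `inputs` list must stay in the same order as the function's parameters. A minimal standalone sketch of that contract (not the Space's actual code; the stub callback and dropdown choices are illustrative):

```python
# Minimal sketch: Button.click passes component values positionally, so the
# inputs list must mirror the callback signature
# (name, version, url, language, framework, library_type).
import gradio as gr

def add_new_eval(library_name, library_version, repository_url, language, framework, library_type_str):
    # Stand-in for src.submission.submit.add_new_eval
    return f"Received {library_name} {library_version} ({language}, {library_type_str})"

with gr.Blocks() as demo:
    name = gr.Textbox(label="Library name (org/repo format)")
    version = gr.Textbox(label="Version", placeholder="v1.0.0")
    url = gr.Textbox(label="Repository URL")
    language = gr.Dropdown(choices=["Python", "JavaScript"], value="Python", label="Programming Language")
    framework = gr.Textbox(label="Framework/Ecosystem (e.g., PyTorch, React)")
    library_type = gr.Dropdown(choices=["machine learning", "llm framework"], label="Library type")
    result = gr.Markdown()
    gr.Button("Submit for Assessment").click(
        add_new_eval,
        [name, version, url, language, framework, library_type],  # order mirrors the signature
        result,
    )

if __name__ == "__main__":
    demo.launch()
```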
src/about.py CHANGED
@@ -11,62 +11,96 @@ class Task:
11
  # Select your tasks here
12
  # ---------------------------------------------------
13
  class Tasks(Enum):
14
- # task_key in the json file, metric_key in the json file, name to display in the leaderboard
15
- task0 = Task("anli_r1", "acc", "ANLI")
16
- task1 = Task("logiqa", "acc_norm", "LogiQA")
17
-
18
- NUM_FEWSHOT = 0 # Change with your few shot
 
 
 
19
  # ---------------------------------------------------
20
 
21
 
22
 
23
  # Your leaderboard name
24
- TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
25
 
26
  # What does your leaderboard evaluate?
27
  INTRODUCTION_TEXT = """
28
- Intro text
 
 
 
29
  """
30
 
31
  # Which evaluations are you running? how can people reproduce what you have?
32
  LLM_BENCHMARKS_TEXT = f"""
33
- ## How it works
34
 
35
- ## Reproducibility
36
- To reproduce our results, here is the commands you can run:
 
 
37
 
38
- """
39
-
40
- EVALUATION_QUEUE_TEXT = """
41
- ## Some good practices before submitting a model
42
 
43
- ### 1) Make sure you can load your model and tokenizer using AutoClasses:
 
44
  ```python
45
- from transformers import AutoConfig, AutoModel, AutoTokenizer
46
- config = AutoConfig.from_pretrained("your model name", revision=revision)
47
- model = AutoModel.from_pretrained("your model name", revision=revision)
48
- tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
49
- ```
50
- If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
51
-
52
- Note: make sure your model is public!
53
- Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
54
 
55
- ### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
56
- It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
57
 
58
- ### 3) Make sure your model has an open license!
59
- This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model πŸ€—
60
 
61
- ### 4) Fill up your model card
62
- When we add extra information about models to the leaderboard, it will be automatically taken from the model card
 
 
 
63
 
64
- ## In case of model failure
65
- If your model is displayed in the `FAILED` category, its execution stopped.
66
- Make sure you have followed the above steps first.
67
- If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
 
 
 
 
 
 
68
  """
69
 
70
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
71
  CITATION_BUTTON_TEXT = r"""
 
 
 
72
  """
 
11
  # Select your tasks here
12
  # ---------------------------------------------------
13
  class Tasks(Enum):
14
+ # Risk domains from LibVulnWatch paper
15
+ license = Task("license_validation", "score", "License Risk")
16
+ security = Task("security_assessment", "score", "Security Risk")
17
+ maintenance = Task("maintenance_health", "score", "Maintenance Risk")
18
+ dependency = Task("dependency_management", "score", "Dependency Risk")
19
+ regulatory = Task("regulatory_compliance", "score", "Regulatory Risk")
20
+
21
+ NUM_FEWSHOT = 0 # Not relevant for vulnerability assessment
22
  # ---------------------------------------------------
23
 
24
 
25
 
26
  # Your leaderboard name
27
+ TITLE = """<h1 align="center" id="space-title">LibVulnWatch: Vulnerability Assessment Leaderboard</h1>"""
28
 
29
  # What does your leaderboard evaluate?
30
  INTRODUCTION_TEXT = """
31
+ ## Systematic Vulnerability Assessment and Leaderboard Tracking for Open-Source AI Libraries
32
+
33
+ This leaderboard provides continuous vulnerability assessment for open-source AI libraries across five critical risk domains:
34
+ - **License Validation**: Legal risks based on license type, compatibility, and requirements
35
+ - **Security Assessment**: Vulnerability severity and patch responsiveness
36
+ - **Maintenance Health**: Sustainability and governance practices
37
+ - **Dependency Management**: Vulnerability inheritance and supply chain security
38
+ - **Regulatory Compliance**: Compliance readiness for various frameworks
39
+
40
+ Lower scores indicate fewer vulnerabilities and lower risk. The overall risk score is a weighted average of all domains, with security given higher priority.
41
  """
42
 
43
  # Which evaluations are you running? how can people reproduce what you have?
44
  LLM_BENCHMARKS_TEXT = f"""
45
+ ## How LibVulnWatch Works
46
 
47
+ Our assessment methodology evaluates libraries through:
48
+ 1. **Static Analysis**: Code review, license parsing, and documentation examination
49
+ 2. **Dynamic Analysis**: Vulnerability scanning, dependency checking, and API testing
50
+ 3. **Metadata Analysis**: Repository metrics, contributor patterns, and release cadence
51
 
52
+ Each library receives a risk score (0-10) in each domain, with lower scores indicating lower risk.
 
 
 
53
 
54
+ ## Reproducibility
55
+ To reproduce our assessment for a specific library:
56
  ```python
57
+ from libvulnwatch import VulnerabilityAssessor
 
 
 
58
 
59
+ # Initialize the assessor
60
+ assessor = VulnerabilityAssessor()
61
 
62
+ # Run assessment on a library
63
+ results = assessor.assess_library("organization/library_name")
64
 
65
+ # View detailed results
66
+ print(results.risk_scores)
67
+ print(results.detailed_findings)
68
+ ```
69
+ """
70
 
71
+ EVALUATION_QUEUE_TEXT = """
72
+ ## Before submitting a library for assessment
73
+
74
+ ### 1) Ensure your library is publicly accessible
75
+ LibVulnWatch can only assess libraries that are publicly available on GitHub or another accessible repository.
76
+
77
+ ### 2) Verify complete metadata is available
78
+ Our assessment relies on metadata including:
79
+ - License information
80
+ - Dependency specifications
81
+ - Maintenance history and contributor information
82
+ - Security policies and vulnerability handling processes
83
+
84
+ ### 3) Make sure your repository has an open license
85
+ This leaderboard is designed for open-source AI libraries, which should have clear licensing terms.
86
+
87
+ ### 4) Add security documentation
88
+ Libraries with comprehensive security documentation tend to receive better assessments.
89
+
90
+ ## If your assessment fails
91
+ If your library shows as "FAILED" in the assessment queue, check that:
92
+ - The repository is publicly accessible
93
+ - All required metadata files are present
94
+ - Dependencies can be resolved
95
+ - The repository doesn't employ obfuscation techniques that interfere with analysis
96
  """
97
 
98
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
99
  CITATION_BUTTON_TEXT = r"""
100
+ @article{LibVulnWatch2025,
101
+ title={LibVulnWatch: Systematic Vulnerability Assessment and Leaderboard Tracking for Open-Source AI Libraries},
102
+ author={First Author and Second Author},
103
+ journal={ICML 2025 Technical AI Governance Workshop},
104
+ year={2025}
105
+ }
106
  """
src/display/formatting.py CHANGED
@@ -1,3 +1,7 @@
 
 
1
  def model_hyperlink(link, model_name):
2
  return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
3
 
@@ -25,3 +29,44 @@ def has_no_nan_values(df, columns):
25
 
26
  def has_nan_values(df, columns):
27
  return df[columns].isna().any(axis=1)
 
 
 
 
 
 
 
 
 
 
1
+ """Helper functions to style our gradio elements"""
2
+
3
+ import re
4
+
5
  def model_hyperlink(link, model_name):
6
  return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
7
 
 
29
 
30
  def has_nan_values(df, columns):
31
  return df[columns].isna().any(axis=1)
32
+
33
+
34
+ def make_clickable_library(library_name: str) -> str:
35
+ """Link to the GitHub repository"""
36
+ library_path = library_name.replace(" ", "-").lower()
37
+
38
+ # If this is a GitHub repository, link directly
39
+ github_url = f"https://github.com/{library_path}"
40
+
41
+ return f'<a href="{github_url}" target="_blank">{library_name}</a>'
42
+
43
+
44
+ def styled_message(message) -> str:
45
+ """Format a message with a green header"""
46
+ return f'<span style="color: green">✅ Success:</span> {message}'
47
+
48
+
49
+ def styled_warning(message) -> str:
50
+ """Format a warning message with an orange header"""
51
+ return f'<span style="color: orange">⚠️ Warning:</span> {message}'
52
+
53
+
54
+ def styled_error(message) -> str:
55
+ """Format an error message with a red header"""
56
+ return f'<span style="color: red">❌ Error:</span> {message}'
57
+
58
+
59
+ # Risk severity coloring for risk scores
60
+ def colorize_risk_score(score):
61
+ """
62
+ Apply color coding to risk scores:
63
+ 0-3.9: Green (Low risk)
64
+ 4-6.9: Orange (Medium risk)
65
+ 7-10: Red (High risk)
66
+ """
67
+ if score < 4:
68
+ return f'<span style="color: green">{score:.1f}</span>'
69
+ elif score < 7:
70
+ return f'<span style="color: orange">{score:.1f}</span>'
71
+ else:
72
+ return f'<span style="color: red">{score:.1f}</span>'
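Quick usage sketch for the display helpers added above (assumes you run it from the Space's repository root so that `src` is importable):

```python
# Usage sketch for the new formatting helpers.
from src.display.formatting import (
    colorize_risk_score,
    make_clickable_library,
    styled_warning,
)

print(make_clickable_library("huggingface/transformers"))
# -> '<a href="https://github.com/huggingface/transformers" target="_blank">huggingface/transformers</a>'

print(colorize_risk_score(3.2))   # green span  (low risk, < 4)
print(colorize_risk_score(5.5))   # orange span (medium risk, 4-6.9)
print(colorize_risk_score(8.1))   # red span    (high risk, >= 7)

print(styled_warning("Repository metadata is incomplete."))
```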
src/display/utils.py CHANGED
@@ -23,22 +23,22 @@ class ColumnContent:
23
  ## Leaderboard columns
24
  auto_eval_column_dict = []
25
  # Init
26
- auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
27
- auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
28
  #Scores
29
- auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
30
  for task in Tasks:
31
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
32
- # Model information
33
- auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
34
- auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
35
- auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
36
- auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
37
- auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
38
- auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
39
- auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❀️", "number", False)])
40
- auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
41
- auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
42
 
43
  # We use make dataclass to dynamically fill the scores from Tasks
44
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
@@ -46,59 +46,58 @@ AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=
46
  ## For the queue columns in the submission tab
47
  @dataclass(frozen=True)
48
  class EvalQueueColumn: # Queue column
49
- model = ColumnContent("model", "markdown", True)
50
- revision = ColumnContent("revision", "str", True)
51
- private = ColumnContent("private", "bool", True)
52
- precision = ColumnContent("precision", "str", True)
53
- weight_type = ColumnContent("weight_type", "str", "Original")
54
  status = ColumnContent("status", "str", True)
55
 
56
- ## All the model information that we might need
57
  @dataclass
58
- class ModelDetails:
59
  name: str
60
  display_name: str = ""
61
  symbol: str = "" # emoji
62
 
63
 
64
- class ModelType(Enum):
65
- PT = ModelDetails(name="pretrained", symbol="🟒")
66
- FT = ModelDetails(name="fine-tuned", symbol="πŸ”Ά")
67
- IFT = ModelDetails(name="instruction-tuned", symbol="β­•")
68
- RL = ModelDetails(name="RL-tuned", symbol="🟦")
69
- Unknown = ModelDetails(name="", symbol="?")
 
70
 
71
  def to_str(self, separator=" "):
72
  return f"{self.value.symbol}{separator}{self.value.name}"
73
 
74
  @staticmethod
75
  def from_str(type):
76
- if "fine-tuned" in type or "πŸ”Ά" in type:
77
- return ModelType.FT
78
- if "pretrained" in type or "🟒" in type:
79
- return ModelType.PT
80
- if "RL-tuned" in type or "🟦" in type:
81
- return ModelType.RL
82
- if "instruction-tuned" in type or "β­•" in type:
83
- return ModelType.IFT
84
- return ModelType.Unknown
85
-
86
- class WeightType(Enum):
87
- Adapter = ModelDetails("Adapter")
88
- Original = ModelDetails("Original")
89
- Delta = ModelDetails("Delta")
90
-
91
- class Precision(Enum):
92
- float16 = ModelDetails("float16")
93
- bfloat16 = ModelDetails("bfloat16")
94
- Unknown = ModelDetails("?")
95
-
96
- def from_str(precision):
97
- if precision in ["torch.float16", "float16"]:
98
- return Precision.float16
99
- if precision in ["torch.bfloat16", "bfloat16"]:
100
- return Precision.bfloat16
101
- return Precision.Unknown
102
 
103
  # Column selection
104
  COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 
23
  ## Leaderboard columns
24
  auto_eval_column_dict = []
25
  # Init
26
+ auto_eval_column_dict.append(["library_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
27
+ auto_eval_column_dict.append(["library", ColumnContent, ColumnContent("Library", "markdown", True, never_hidden=True)])
28
  #Scores
29
+ auto_eval_column_dict.append(["overall_risk", ColumnContent, ColumnContent("Overall Risk ⬇️", "number", True)])
30
  for task in Tasks:
31
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
32
+ # Library information
33
+ auto_eval_column_dict.append(["library_type", ColumnContent, ColumnContent("Type", "str", False)])
34
+ auto_eval_column_dict.append(["framework", ColumnContent, ColumnContent("Framework", "str", False)])
35
+ auto_eval_column_dict.append(["version", ColumnContent, ColumnContent("Version", "str", False, False)])
36
+ auto_eval_column_dict.append(["language", ColumnContent, ColumnContent("Language", "str", False)])
37
+ auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("License", "str", True)])
38
+ auto_eval_column_dict.append(["stars", ColumnContent, ColumnContent("GitHub ⭐", "number", False)])
39
+ auto_eval_column_dict.append(["last_update", ColumnContent, ColumnContent("Last Updated", "str", False)])
40
+ auto_eval_column_dict.append(["verified", ColumnContent, ColumnContent("Independently Verified", "bool", False)])
41
+ auto_eval_column_dict.append(["availability", ColumnContent, ColumnContent("Active Maintenance", "bool", True)])
42
 
43
  # We use make dataclass to dynamically fill the scores from Tasks
44
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
46
  ## For the queue columns in the submission tab
47
  @dataclass(frozen=True)
48
  class EvalQueueColumn: # Queue column
49
+ library = ColumnContent("library", "markdown", True)
50
+ version = ColumnContent("version", "str", True)
51
+ language = ColumnContent("language", "str", True)
52
+ framework = ColumnContent("framework", "str", True)
53
+ library_type = ColumnContent("library_type", "str", True)
54
  status = ColumnContent("status", "str", True)
55
 
56
+ ## All the library information that we might need
57
  @dataclass
58
+ class LibraryDetails:
59
  name: str
60
  display_name: str = ""
61
  symbol: str = "" # emoji
62
 
63
 
64
+ class LibraryType(Enum):
65
+ ML = LibraryDetails(name="machine learning", symbol="🟢")
66
+ LLM = LibraryDetails(name="llm framework", symbol="🔶")
67
+ AGENT = LibraryDetails(name="agent framework", symbol="⭕")
68
+ VIS = LibraryDetails(name="visualization", symbol="🟦")
69
+ GENERAL = LibraryDetails(name="general ai", symbol="🟣")
70
+ Unknown = LibraryDetails(name="", symbol="?")
71
 
72
  def to_str(self, separator=" "):
73
  return f"{self.value.symbol}{separator}{self.value.name}"
74
 
75
  @staticmethod
76
  def from_str(type):
77
+ if "machine learning" in type or "🟢" in type:
78
+ return LibraryType.ML
79
+ if "llm framework" in type or "🔶" in type:
80
+ return LibraryType.LLM
81
+ if "agent framework" in type or "⭕" in type:
82
+ return LibraryType.AGENT
83
+ if "visualization" in type or "🟦" in type:
84
+ return LibraryType.VIS
85
+ if "general ai" in type or "🟣" in type:
86
+ return LibraryType.GENERAL
87
+ return LibraryType.Unknown
88
+
89
+ class Language(Enum):
90
+ Python = LibraryDetails("Python")
91
+ JavaScript = LibraryDetails("JavaScript")
92
+ TypeScript = LibraryDetails("TypeScript")
93
+ Java = LibraryDetails("Java")
94
+ CPP = LibraryDetails("C++")
95
+ Other = LibraryDetails("Other")
96
+
97
+ class AssessmentStatus(Enum):
98
+ Verified = LibraryDetails("Verified")
99
+ Unverified = LibraryDetails("Unverified")
100
+ Disputed = LibraryDetails("Disputed")
 
 
101
 
102
  # Column selection
103
  COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
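The submission form renders library types with `to_str(" : ")` and the submit flow parses them back with `from_str`, so the two have to round-trip. A small sketch of that behaviour, plus the `Language` lookup used later in `read_evals.py` (run from the repository root so `src` is importable):

```python
# Round-trip sketch for the new enums.
from src.display.utils import Language, LibraryType

label = LibraryType.LLM.to_str(" : ")      # e.g. "🔶 : llm framework", as shown in the dropdown
assert LibraryType.from_str(label) is LibraryType.LLM

# Unrecognized strings fall back to Unknown, which the submit flow rejects.
assert LibraryType.from_str("something else") is LibraryType.Unknown

# Language lookup mirrors read_evals.py: match on the enum value's name.
language_str = "Python"
language = next((lang for lang in Language if lang.value.name == language_str), Language.Other)
assert language is Language.Python
```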
src/envs.py CHANGED
@@ -6,20 +6,20 @@ from huggingface_hub import HfApi
6
  # ----------------------------------
7
  TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
8
 
9
- OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
10
  # ----------------------------------
11
 
12
  REPO_ID = f"{OWNER}/leaderboard"
13
- QUEUE_REPO = f"{OWNER}/requests"
14
- RESULTS_REPO = f"{OWNER}/results"
15
 
16
  # If you setup a cache later, just change HF_HOME
17
  CACHE_PATH=os.getenv("HF_HOME", ".")
18
 
19
  # Local caches
20
- EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
21
- EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
22
- EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
23
- EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
24
 
25
  API = HfApi(token=TOKEN)
 
6
  # ----------------------------------
7
  TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
8
 
9
+ OWNER = "libvulnwatch" # Change to your org - don't forget to create a results and request dataset, with the correct format!
10
  # ----------------------------------
11
 
12
  REPO_ID = f"{OWNER}/leaderboard"
13
+ QUEUE_REPO = f"{OWNER}/vulnerability-requests"
14
+ RESULTS_REPO = f"{OWNER}/vulnerability-assessments"
15
 
16
  # If you setup a cache later, just change HF_HOME
17
  CACHE_PATH=os.getenv("HF_HOME", ".")
18
 
19
  # Local caches
20
+ EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "assessment-queue")
21
+ EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "assessment-results")
22
+ EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "assessment-queue-bk")
23
+ EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "assessment-results-bk")
24
 
25
  API = HfApi(token=TOKEN)
src/leaderboard/read_evals.py CHANGED
@@ -3,144 +3,156 @@ import json
3
  import math
4
  import os
5
  from dataclasses import dataclass
 
6
 
7
- import dateutil
8
  import numpy as np
9
 
10
- from src.display.formatting import make_clickable_model
11
- from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
12
- from src.submission.check_validity import is_model_on_hub
13
 
14
 
15
  @dataclass
16
- class EvalResult:
17
- """Represents one full evaluation. Built from a combination of the result and request file for a given run.
18
  """
19
- eval_name: str # org_model_precision (uid)
20
- full_model: str # org/model (path on hub)
21
  org: str
22
- model: str
23
- revision: str # commit hash, "" if main
24
- results: dict
25
- precision: Precision = Precision.Unknown
26
- model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
27
- weight_type: WeightType = WeightType.Original # Original or Adapter
28
- architecture: str = "Unknown"
29
  license: str = "?"
30
- likes: int = 0
31
- num_params: int = 0
32
- date: str = "" # submission date of request file
33
- still_on_hub: bool = False
34
 
35
  @classmethod
36
  def init_from_json_file(self, json_filepath):
37
- """Inits the result from the specific model result file"""
38
  with open(json_filepath) as fp:
39
  data = json.load(fp)
40
 
41
- config = data.get("config")
 
 
 
42
 
43
- # Precision
44
- precision = Precision.from_str(config.get("model_dtype"))
45
-
46
- # Get model and org
47
- org_and_model = config.get("model_name", config.get("model_args", None))
48
- org_and_model = org_and_model.split("/", 1)
49
-
50
- if len(org_and_model) == 1:
51
  org = None
52
- model = org_and_model[0]
53
- result_key = f"{model}_{precision.value.name}"
54
  else:
55
- org = org_and_model[0]
56
- model = org_and_model[1]
57
- result_key = f"{org}_{model}_{precision.value.name}"
58
- full_model = "/".join(org_and_model)
59
 
60
- still_on_hub, _, model_config = is_model_on_hub(
61
- full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
62
- )
63
- architecture = "?"
64
- if model_config is not None:
65
- architectures = getattr(model_config, "architectures", None)
66
- if architectures:
67
- architecture = ";".join(architectures)
68
-
69
- # Extract results available in this file (some results are split in several files)
70
- results = {}
71
  for task in Tasks:
72
- task = task.value
73
-
74
- # We average all scores of a given metric (not all metrics are present in all files)
75
- accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
76
- if accs.size == 0 or any([acc is None for acc in accs]):
77
- continue
78
-
79
- mean_acc = np.mean(accs) * 100.0
80
- results[task.benchmark] = mean_acc
81
-
 
 
 
 
 
 
 
 
 
 
82
  return self(
83
- eval_name=result_key,
84
- full_model=full_model,
85
  org=org,
86
- model=model,
87
- results=results,
88
- precision=precision,
89
- revision= config.get("model_sha", ""),
90
- still_on_hub=still_on_hub,
91
- architecture=architecture
 
 
 
92
  )
93
 
94
  def update_with_request_file(self, requests_path):
95
- """Finds the relevant request file for the current model and updates info with it"""
96
- request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
97
 
98
  try:
99
  with open(request_file, "r") as f:
100
  request = json.load(f)
101
- self.model_type = ModelType.from_str(request.get("model_type", ""))
102
- self.weight_type = WeightType[request.get("weight_type", "Original")]
103
- self.license = request.get("license", "?")
104
- self.likes = request.get("likes", 0)
105
- self.num_params = request.get("params", 0)
106
- self.date = request.get("submitted_time", "")
107
  except Exception:
108
- print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
109
 
110
  def to_dict(self):
111
- """Converts the Eval Result to a dict compatible with our dataframe display"""
112
- average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
 
 
 
 
 
 
 
 
113
  data_dict = {
114
- "eval_name": self.eval_name, # not a column, just a save name,
115
- AutoEvalColumn.precision.name: self.precision.value.name,
116
- AutoEvalColumn.model_type.name: self.model_type.value.name,
117
- AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
118
- AutoEvalColumn.weight_type.name: self.weight_type.value.name,
119
- AutoEvalColumn.architecture.name: self.architecture,
120
- AutoEvalColumn.model.name: make_clickable_model(self.full_model),
121
- AutoEvalColumn.revision.name: self.revision,
122
- AutoEvalColumn.average.name: average,
123
  AutoEvalColumn.license.name: self.license,
124
- AutoEvalColumn.likes.name: self.likes,
125
- AutoEvalColumn.params.name: self.num_params,
126
- AutoEvalColumn.still_on_hub.name: self.still_on_hub,
 
127
  }
128
 
129
  for task in Tasks:
130
- data_dict[task.value.col_name] = self.results[task.value.benchmark]
131
 
132
  return data_dict
133
 
134
 
135
- def get_request_file_for_model(requests_path, model_name, precision):
136
- """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
137
  request_files = os.path.join(
138
  requests_path,
139
- f"{model_name}_eval_request_*.json",
140
  )
141
  request_files = glob.glob(request_files)
142
 
143
- # Select correct request file (precision)
144
  request_file = ""
145
  request_files = sorted(request_files, reverse=True)
146
  for tmp_request_file in request_files:
@@ -148,45 +160,45 @@ def get_request_file_for_model(requests_path, model_name, precision):
148
  req_content = json.load(f)
149
  if (
150
  req_content["status"] in ["FINISHED"]
151
- and req_content["precision"] == precision.split(".")[-1]
152
  ):
153
  request_file = tmp_request_file
154
  return request_file
155
 
156
 
157
- def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
158
- """From the path of the results folder root, extract all needed info for results"""
159
- model_result_filepaths = []
160
 
161
  for root, _, files in os.walk(results_path):
162
- # We should only have json files in model results
163
  if len(files) == 0 or any([not f.endswith(".json") for f in files]):
164
  continue
165
 
166
- # Sort the files by date
167
  try:
168
- files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
169
- except dateutil.parser._parser.ParserError:
170
- files = [files[-1]]
171
 
172
  for file in files:
173
- model_result_filepaths.append(os.path.join(root, file))
174
 
175
- eval_results = {}
176
- for model_result_filepath in model_result_filepaths:
177
  # Creation of result
178
- eval_result = EvalResult.init_from_json_file(model_result_filepath)
179
- eval_result.update_with_request_file(requests_path)
180
 
181
  # Store results of same eval together
182
- eval_name = eval_result.eval_name
183
- if eval_name in eval_results.keys():
184
- eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
185
  else:
186
- eval_results[eval_name] = eval_result
187
 
188
  results = []
189
- for v in eval_results.values():
190
  try:
191
  v.to_dict() # we test if the dict version is complete
192
  results.append(v)
 
3
  import math
4
  import os
5
  from dataclasses import dataclass
6
+ from datetime import datetime
7
 
 
8
  import numpy as np
9
 
10
+ from src.display.formatting import make_clickable_library
11
+ from src.display.utils import AutoEvalColumn, LibraryType, Tasks, Language, AssessmentStatus
 
12
 
13
 
14
  @dataclass
15
+ class AssessmentResult:
16
+ """Represents one full vulnerability assessment. Built from a combination of the result and request file for a given library.
17
  """
18
+ assessment_id: str # Unique identifier
19
+ library_name: str # org/repo
20
  org: str
21
+ repo: str
22
+ version: str
23
+ results: dict # Risk scores
24
+ framework: str = ""
25
+ language: Language = Language.Other
26
+ library_type: LibraryType = LibraryType.Unknown
 
27
  license: str = "?"
28
+ stars: int = 0
29
+ last_update: str = ""
30
+ availability: bool = True
31
+ verified: bool = False
32
 
33
  @classmethod
34
  def init_from_json_file(self, json_filepath):
35
+ """Initializes the assessment result from a JSON file"""
36
  with open(json_filepath) as fp:
37
  data = json.load(fp)
38
 
39
+ assessment = data.get("assessment", {})
40
+ # Get library and org
41
+ library_name = assessment.get("library_name", "")
42
+ org_and_repo = library_name.split("/", 1)
43
 
44
+ if len(org_and_repo) == 1:
 
 
 
 
 
 
 
45
  org = None
46
+ repo = org_and_repo[0]
47
+ assessment_id = f"{repo}_{assessment.get('version', '')}"
48
  else:
49
+ org = org_and_repo[0]
50
+ repo = org_and_repo[1]
51
+ assessment_id = f"{org}_{repo}_{assessment.get('version', '')}"
 
52
 
53
+ # Extract risk scores
54
+ risk_scores = {}
 
 
 
 
 
 
 
 
 
55
  for task in Tasks:
56
+ domain = task.value
57
+ score = assessment.get("scores", {}).get(domain.benchmark, None)
58
+ if score is not None:
59
+ risk_scores[domain.benchmark] = score
60
+
61
+ # Library metadata
62
+ framework = assessment.get("framework", "")
63
+ language_str = assessment.get("language", "Other")
64
+ language = next((lang for lang in Language if lang.value.name == language_str), Language.Other)
65
+
66
+ # Availability and verification
67
+ last_update = assessment.get("last_updated", "")
68
+ if last_update:
69
+ try:
70
+ # Format date for display
71
+ dt = datetime.fromisoformat(last_update)
72
+ last_update = dt.strftime("%Y-%m-%d")
73
+ except:
74
+ pass
75
+
76
  return self(
77
+ assessment_id=assessment_id,
78
+ library_name=library_name,
79
  org=org,
80
+ repo=repo,
81
+ version=assessment.get("version", ""),
82
+ results=risk_scores,
83
+ framework=framework,
84
+ language=language,
85
+ license=assessment.get("license", "?"),
86
+ availability=assessment.get("active_maintenance", True),
87
+ verified=assessment.get("independently_verified", False),
88
+ last_update=last_update,
89
  )
90
 
91
  def update_with_request_file(self, requests_path):
92
+ """Finds the relevant request file for the current library and updates info with it"""
93
+ request_file = get_request_file_for_library(requests_path, self.library_name, self.version)
94
 
95
  try:
96
  with open(request_file, "r") as f:
97
  request = json.load(f)
98
+ self.library_type = LibraryType.from_str(request.get("library_type", ""))
99
+ self.stars = request.get("stars", 0)
 
 
 
 
100
  except Exception:
101
+ print(f"Could not find request file for {self.library_name} version {self.version}")
102
 
103
  def to_dict(self):
104
+ """Converts the Assessment Result to a dict compatible with our dataframe display"""
105
+ # Calculate overall risk as weighted average
106
+ weights = {
107
+ "license_validation": 0.2,
108
+ "security_assessment": 0.3,
109
+ "maintenance_health": 0.2,
110
+ "dependency_management": 0.2,
111
+ "regulatory_compliance": 0.1
112
+ }
113
+
114
+ # Calculate overall risk - if domain is missing, use highest risk score (10)
115
+ risk_sum = 0
116
+ weight_sum = 0
117
+
118
+ for domain, weight in weights.items():
119
+ score = self.results.get(domain, 10) # Default to highest risk if missing
120
+ risk_sum += score * weight
121
+ weight_sum += weight
122
+
123
+ overall_risk = risk_sum / weight_sum if weight_sum > 0 else 10
124
+
125
  data_dict = {
126
+ "assessment_id": self.assessment_id, # not a column, just a save name
127
+ AutoEvalColumn.library_type.name: self.library_type.value.name,
128
+ AutoEvalColumn.library_type_symbol.name: self.library_type.value.symbol,
129
+ AutoEvalColumn.language.name: self.language.value.name,
130
+ AutoEvalColumn.framework.name: self.framework,
131
+ AutoEvalColumn.library.name: make_clickable_library(self.library_name),
132
+ AutoEvalColumn.version.name: self.version,
133
+ AutoEvalColumn.overall_risk.name: overall_risk,
 
134
  AutoEvalColumn.license.name: self.license,
135
+ AutoEvalColumn.stars.name: self.stars,
136
+ AutoEvalColumn.last_update.name: self.last_update,
137
+ AutoEvalColumn.verified.name: self.verified,
138
+ AutoEvalColumn.availability.name: self.availability,
139
  }
140
 
141
  for task in Tasks:
142
+ data_dict[task.name] = self.results.get(task.value.benchmark, 10) # Default to highest risk
143
 
144
  return data_dict
145
 
146
 
147
+ def get_request_file_for_library(requests_path, library_name, version):
148
+ """Selects the correct request file for a given library. Only keeps runs tagged as FINISHED"""
149
  request_files = os.path.join(
150
  requests_path,
151
+ f"{library_name.replace('/', '_')}_eval_request_*.json",
152
  )
153
  request_files = glob.glob(request_files)
154
 
155
+ # Select correct request file (version)
156
  request_file = ""
157
  request_files = sorted(request_files, reverse=True)
158
  for tmp_request_file in request_files:
 
160
  req_content = json.load(f)
161
  if (
162
  req_content["status"] in ["FINISHED"]
163
+ and req_content["version"] == version
164
  ):
165
  request_file = tmp_request_file
166
  return request_file
167
 
168
 
169
+ def get_raw_assessment_results(results_path: str, requests_path: str) -> list[AssessmentResult]:
170
+ """From the path of the results folder root, extract all needed info for assessments"""
171
+ assessment_filepaths = []
172
 
173
  for root, _, files in os.walk(results_path):
174
+ # We should only have json files in assessment results
175
  if len(files) == 0 or any([not f.endswith(".json") for f in files]):
176
  continue
177
 
178
+ # Sort the files by date if they have date info
179
  try:
180
+ files.sort(key=lambda x: datetime.fromisoformat(json.loads(open(os.path.join(root, x)).read())["assessment"]["completed_time"]), reverse=True)
181
+ except:
182
+ pass
183
 
184
  for file in files:
185
+ assessment_filepaths.append(os.path.join(root, file))
186
 
187
+ assessment_results = {}
188
+ for assessment_filepath in assessment_filepaths:
189
  # Creation of result
190
+ assessment_result = AssessmentResult.init_from_json_file(assessment_filepath)
191
+ assessment_result.update_with_request_file(requests_path)
192
 
193
  # Store results of same eval together
194
+ assessment_id = assessment_result.assessment_id
195
+ if assessment_id in assessment_results.keys():
196
+ assessment_results[assessment_id].results.update({k: v for k, v in assessment_result.results.items() if v is not None})
197
  else:
198
+ assessment_results[assessment_id] = assessment_result
199
 
200
  results = []
201
+ for v in assessment_results.values():
202
  try:
203
  v.to_dict() # we test if the dict version is complete
204
  results.append(v)
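For reference, `AssessmentResult.init_from_json_file` above expects a result file with a top-level `"assessment"` object; the keys below are the ones this commit's parser reads, and the values are purely illustrative. The snippet also shows the "weighted average with security prioritized" overall-risk score described in the About text (4.25 for these sample scores):

```python
# Sketch: minimal result file that init_from_json_file can parse.
# Key names come from the parser above; library and scores are made up.
import json
import tempfile

from src.display.utils import AutoEvalColumn
from src.leaderboard.read_evals import AssessmentResult

sample = {
    "assessment": {
        "library_name": "example-org/example-lib",   # hypothetical library
        "version": "v1.2.3",
        "language": "Python",
        "framework": "PyTorch",
        "license": "Apache-2.0",
        "active_maintenance": True,
        "independently_verified": False,
        "last_updated": "2025-01-15T12:00:00",
        "scores": {
            "license_validation": 2.0,
            "security_assessment": 6.5,
            "maintenance_health": 3.0,
            "dependency_management": 4.0,
            "regulatory_compliance": 5.0,
        },
    }
}

with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as fp:
    json.dump(sample, fp)

result = AssessmentResult.init_from_json_file(fp.name)
print(result.results)                               # per-domain risk scores
row = result.to_dict()
print(row[AutoEvalColumn.overall_risk.name])        # 4.25: weighted average, security weighted 0.3
```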
src/populate.py CHANGED
@@ -1,58 +1,100 @@
1
- import json
2
- import os
3
 
4
  import pandas as pd
5
 
6
- from src.display.formatting import has_no_nan_values, make_clickable_model
7
- from src.display.utils import AutoEvalColumn, EvalQueueColumn
8
- from src.leaderboard.read_evals import get_raw_eval_results
9
 
10
 
11
- def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
12
- """Creates a dataframe from all the individual experiment results"""
13
- raw_data = get_raw_eval_results(results_path, requests_path)
14
- all_data_json = [v.to_dict() for v in raw_data]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
- df = pd.DataFrame.from_records(all_data_json)
17
- df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
18
- df = df[cols].round(decimals=2)
19
 
20
- # filter out if any of the benchmarks have not been produced
21
- df = df[has_no_nan_values(df, benchmark_cols)]
22
- return df
23
-
24
-
25
- def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
26
- """Creates the different dataframes for the evaluation queues requestes"""
27
- entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
28
- all_evals = []
29
-
30
- for entry in entries:
31
- if ".json" in entry:
32
- file_path = os.path.join(save_path, entry)
33
- with open(file_path) as fp:
34
- data = json.load(fp)
35
-
36
- data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
37
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")
38
-
39
- all_evals.append(data)
40
- elif ".md" not in entry:
41
- # this is a folder
42
- sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
43
- for sub_entry in sub_entries:
44
- file_path = os.path.join(save_path, entry, sub_entry)
45
- with open(file_path) as fp:
46
- data = json.load(fp)
47
-
48
- data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
49
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")
50
- all_evals.append(data)
51
-
52
- pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
53
- running_list = [e for e in all_evals if e["status"] == "RUNNING"]
54
- finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
55
- df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
56
- df_running = pd.DataFrame.from_records(running_list, columns=cols)
57
- df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
58
- return df_finished[cols], df_running[cols], df_pending[cols]
 
 
 
 
 
 
 
 
 
1
+ """Functions to populate the leaderboard"""
 
2
 
3
  import pandas as pd
4
 
5
+ from src.display.utils import AutoEvalColumn
6
+ from src.leaderboard.read_evals import get_raw_assessment_results
 
7
 
8
 
9
+ def get_leaderboard_df(eval_results_path, eval_requests_path, cols, benchmark_cols):
10
+ """Read all the runs in the folder and return a dataframe
11
+
12
+ Args:
13
+ eval_results_path: Path to the assessment result files
14
+ eval_requests_path: Path to the assessment request files
15
+ cols: Columns names to include in the dataframe
16
+ benchmark_cols: Risk categories column names
17
+
18
+ Returns:
19
+ Pandas dataframe for the leaderboard
20
+ """
21
+ try:
22
+ assessment_results = get_raw_assessment_results(eval_results_path, eval_requests_path)
23
+
24
+ # If we get results, convert to dataframe
25
+ if len(assessment_results) > 0:
26
+ # Create dataframe from assessment results
27
+ all_df = pd.DataFrame.from_records([r.to_dict() for r in assessment_results])
28
+
29
+ # Sort by overall risk score (ascending - lower is better)
30
+ if AutoEvalColumn.overall_risk.name in all_df.columns:
31
+ all_df = all_df.sort_values(by=[AutoEvalColumn.overall_risk.name])
32
+
33
+ return all_df
34
+
35
+ return pd.DataFrame(columns=cols) # Empty dataframe with columns
36
+ except Exception as e:
37
+ print(f"Error reading evaluation results: {e}")
38
+ return pd.DataFrame(columns=cols) # Return empty dataframe
39
 
 
 
 
40
 
41
+ def get_evaluation_queue_df(eval_requests_path, eval_cols):
42
+ """Read from the evaluation queue directory and return dataframes for each status
43
+
44
+ Args:
45
+ eval_requests_path: Path to the assessment request files
46
+ eval_cols: Columns for the queue dataframes
47
+
48
+ Returns:
49
+ Tuple of dataframes (finished, running, pending)
50
+ """
51
+ try:
52
+ import glob
53
+ import json
54
+ import os
55
+
56
+ # Find all request files
57
+ request_files = glob.glob(os.path.join(eval_requests_path, "*.json"))
58
+
59
+ finished_data = []
60
+ running_data = []
61
+ pending_data = []
62
+
63
+ for file_path in request_files:
64
+ try:
65
+ with open(file_path, "r") as f:
66
+ data = json.load(f)
67
+
68
+ # Extract relevant fields
69
+ row = {
70
+ "library": data.get("library", ""),
71
+ "version": data.get("version", ""),
72
+ "language": data.get("language", ""),
73
+ "framework": data.get("framework", ""),
74
+ "library_type": data.get("library_type", ""),
75
+ "status": data.get("status", "UNKNOWN")
76
+ }
77
+
78
+ # Add to appropriate dataframe based on status
79
+ if row["status"] == "FINISHED":
80
+ finished_data.append(row)
81
+ elif row["status"] == "RUNNING":
82
+ running_data.append(row)
83
+ elif row["status"] == "PENDING":
84
+ pending_data.append(row)
85
+ except Exception as e:
86
+ print(f"Error reading request file {file_path}: {e}")
87
+ continue
88
+
89
+ # Convert to dataframes
90
+ finished_df = pd.DataFrame(finished_data, columns=eval_cols)
91
+ running_df = pd.DataFrame(running_data, columns=eval_cols)
92
+ pending_df = pd.DataFrame(pending_data, columns=eval_cols)
93
+
94
+ return finished_df, running_df, pending_df
95
+
96
+ except Exception as e:
97
+ print(f"Error reading evaluation queue: {e}")
98
+ # Return empty dataframes
99
+ empty_df = pd.DataFrame(columns=eval_cols)
100
+ return empty_df.copy(), empty_df.copy(), empty_df.copy()
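The queue reader above expects flat JSON request files and buckets them by their `"status"` field. A throwaway sketch of feeding it a temporary queue directory (field names come from the reader above; the submission values are illustrative, and `EVAL_COLS` is assumed to be the queue-column list that app.py imports from `src.display.utils`):

```python
# Sketch: build a one-file assessment queue and read it back.
import json
import os
import tempfile

from src.display.utils import EVAL_COLS
from src.populate import get_evaluation_queue_df

queue_dir = tempfile.mkdtemp()
request = {
    "library": "example-org/example-lib",   # hypothetical submission
    "version": "v1.2.3",
    "language": "Python",
    "framework": "PyTorch",
    "library_type": "llm framework",
    "status": "PENDING",
}
with open(os.path.join(queue_dir, "example-org_example-lib_eval_request.json"), "w") as f:
    json.dump(request, f)

finished_df, running_df, pending_df = get_evaluation_queue_df(queue_dir, EVAL_COLS)
print(len(finished_df), len(running_df), len(pending_df))  # 0 0 1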
src/submission/check_validity.py CHANGED
@@ -1,99 +1,113 @@
1
  import json
2
  import os
3
  import re
 
4
  from collections import defaultdict
5
  from datetime import datetime, timedelta, timezone
6
-
7
- import huggingface_hub
8
- from huggingface_hub import ModelCard
9
- from huggingface_hub.hf_api import ModelInfo
10
- from transformers import AutoConfig
11
- from transformers.models.auto.tokenization_auto import AutoTokenizer
12
-
13
- def check_model_card(repo_id: str) -> tuple[bool, str]:
14
- """Checks if the model card and license exist and have been filled"""
15
- try:
16
- card = ModelCard.load(repo_id)
17
- except huggingface_hub.utils.EntryNotFoundError:
18
- return False, "Please add a model card to your model to explain how you trained/fine-tuned it."
19
-
20
- # Enforce license metadata
21
- if card.data.license is None:
22
- if not ("license_name" in card.data and "license_link" in card.data):
23
- return False, (
24
- "License not found. Please add a license to your model card using the `license` metadata or a"
25
- " `license_name`/`license_link` pair."
26
- )
27
-
28
- # Enforce card content
29
- if len(card.text) < 200:
30
- return False, "Please add a description to your model card, it is too short."
31
-
32
- return True, ""
33
-
34
- def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
35
- """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
 
 
 
 
 
 
36
  try:
37
- config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
38
- if test_tokenizer:
39
- try:
40
- tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
41
- except ValueError as e:
42
- return (
43
- False,
44
- f"uses a tokenizer which is not in a transformers release: {e}",
45
- None
46
- )
47
- except Exception as e:
48
- return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
49
- return True, None, config
50
-
51
- except ValueError:
52
- return (
53
- False,
54
- "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
55
- None
56
- )
57
-
58
  except Exception as e:
59
- return False, "was not found on hub!", None
60
-
61
-
62
- def get_model_size(model_info: ModelInfo, precision: str):
63
- """Gets the model size from the configuration, or the model name if the configuration does not contain the information."""
64
- try:
65
- model_size = round(model_info.safetensors["total"] / 1e9, 3)
66
- except (AttributeError, TypeError):
67
- return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
68
-
69
- size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
70
- model_size = size_factor * model_size
71
- return model_size
72
-
73
- def get_model_arch(model_info: ModelInfo):
74
- """Gets the model architecture from the configuration"""
75
- return model_info.config.get("architectures", "Unknown")
76
-
77
- def already_submitted_models(requested_models_dir: str) -> set[str]:
78
- """Gather a list of already submitted models to avoid duplicates"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  depth = 1
80
- file_names = []
81
- users_to_submission_dates = defaultdict(list)
82
 
83
- for root, _, files in os.walk(requested_models_dir):
84
- current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
85
  if current_depth == depth:
86
  for file in files:
87
  if not file.endswith(".json"):
88
  continue
89
  with open(os.path.join(root, file), "r") as f:
90
  info = json.load(f)
91
- file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")
92
 
93
  # Select organisation
94
- if info["model"].count("/") == 0 or "submitted_time" not in info:
95
  continue
96
- organisation, _ = info["model"].split("/")
97
- users_to_submission_dates[organisation].append(info["submitted_time"])
98
 
99
- return set(file_names), users_to_submission_dates
 
1
  import json
2
  import os
3
  import re
4
+ import requests
5
  from collections import defaultdict
6
  from datetime import datetime, timedelta, timezone
7
+ from typing import Dict, Tuple, Any, List, Set
8
+
9
+ def is_repository_valid(repo_name: str, repo_url: str) -> Tuple[bool, str, Dict[str, Any]]:
10
+ """
11
+ Checks if a GitHub repository is valid and accessible.
12
+
13
+ Args:
14
+ repo_name: The name of the repository (org/repo format)
15
+ repo_url: URL to the repository
16
+
17
+ Returns:
18
+ Tuple of (is_valid, error_message, library_info)
19
+ """
20
+ # Basic format validation
21
+ if not repo_name or "/" not in repo_name:
22
+ return False, "Repository name must be in the format 'organization/repository'", {}
23
+
24
+ # Check if GitHub URL
25
+ if repo_url and "github.com" in repo_url:
26
+ # Extract org and repo from URL if provided
27
+ try:
28
+ parts = repo_url.split("github.com/")[1].split("/")
29
+ org = parts[0]
30
+ repo = parts[1].split(".")[0] if "." in parts[1] else parts[1]
31
+ url_repo_name = f"{org}/{repo}"
32
+
33
+ # Check if URL matches repo_name
34
+ if url_repo_name != repo_name:
35
+ return False, f"Repository name ({repo_name}) doesn't match the URL ({url_repo_name})", {}
36
+ except:
37
+ pass # Fall back to using repo_name
38
+
39
+ # Get repository information from GitHub API
40
+ org, repo = repo_name.split("/")
41
+ api_url = f"https://api.github.com/repos/{org}/{repo}"
42
+
43
  try:
44
+ response = requests.get(api_url)
45
+ if response.status_code != 200:
46
+ return False, f"Repository not found or not accessible: {response.json().get('message', 'Unknown error')}", {}
47
+
48
+ # Parse repository data
49
+ repo_data = response.json()
50
+ library_info = get_library_info(repo_data)
51
+
52
+ return True, "", library_info
+
 
 
 
 
 
 
54
  except Exception as e:
55
+ return False, f"Error accessing repository: {str(e)}", {}
56
+
57
+ def get_library_info(repo_data: Dict[str, Any]) -> Dict[str, Any]:
58
+ """
59
+ Extracts relevant information from GitHub repository data.
60
+
61
+ Args:
62
+ repo_data: GitHub API response for a repository
63
+
64
+ Returns:
65
+ Dictionary with library metadata
66
+ """
67
+ library_info = {
68
+ "name": repo_data.get("name", ""),
69
+ "full_name": repo_data.get("full_name", ""),
70
+ "description": repo_data.get("description", ""),
71
+ "stars": repo_data.get("stargazers_count", 0),
72
+ "forks": repo_data.get("forks_count", 0),
73
+ "license": repo_data.get("license", {}).get("name", "Unknown"),
74
+ "created_at": repo_data.get("created_at", ""),
75
+ "updated_at": repo_data.get("updated_at", ""),
76
+ "open_issues": repo_data.get("open_issues_count", 0),
77
+ "default_branch": repo_data.get("default_branch", "main"),
78
+ "is_archived": repo_data.get("archived", False),
79
+ }
80
+
81
+ return library_info
82
+
83
+ def already_submitted_libraries(requested_libraries_dir: str) -> Tuple[Set[str], Dict[str, List[str]]]:
84
+ """
85
+ Gathers a list of already submitted libraries to avoid duplicates.
86
+
87
+ Args:
88
+ requested_libraries_dir: Directory with library assessment requests
89
+
90
+ Returns:
91
+ Tuple of (set of library identifiers, dict mapping orgs to submission dates)
92
+ """
93
  depth = 1
94
+ library_ids = []
95
+ orgs_to_submission_dates = defaultdict(list)
96
 
97
+ for root, _, files in os.walk(requested_libraries_dir):
98
+ current_depth = root.count(os.sep) - requested_libraries_dir.count(os.sep)
99
  if current_depth == depth:
100
  for file in files:
101
  if not file.endswith(".json"):
102
  continue
103
  with open(os.path.join(root, file), "r") as f:
104
  info = json.load(f)
105
+ library_ids.append(f"{info['library']}_{info['version']}")
106
 
107
  # Select organisation
108
+ if info["library"].count("/") == 0 or "submitted_time" not in info:
109
  continue
110
+ organisation, _ = info["library"].split("/")
111
+ orgs_to_submission_dates[organisation].append(info["submitted_time"])
112
 
113
+ return set(library_ids), orgs_to_submission_dates
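Usage sketch for the validation helper above. Note that `is_repository_valid` calls the public GitHub REST API without authentication, so repeated use is subject to GitHub's unauthenticated rate limits; the repository below is just an example target:

```python
# Validate a submission before queueing it.
from src.submission.check_validity import is_repository_valid

is_valid, error_message, library_info = is_repository_valid(
    "huggingface/transformers",                      # org/repo format
    "https://github.com/huggingface/transformers",   # must match the name
)

if is_valid:
    print(library_info["stars"], library_info["license"])
else:
    print(f"Rejected: {error_message}")
```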
src/submission/submit.py CHANGED
@@ -1,119 +1,94 @@
1
  import json
2
  import os
3
- from datetime import datetime, timezone
 
 
 
 
4
 
5
- from src.display.formatting import styled_error, styled_message, styled_warning
6
- from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
7
- from src.submission.check_validity import (
8
- already_submitted_models,
9
- check_model_card,
10
- get_model_size,
11
- is_model_on_hub,
12
- )
13
 
14
- REQUESTED_MODELS = None
15
- USERS_TO_SUBMISSION_DATES = None
 
16
 
17
- def add_new_eval(
18
- model: str,
19
- base_model: str,
20
- revision: str,
21
- precision: str,
22
- weight_type: str,
23
- model_type: str,
24
- ):
25
- global REQUESTED_MODELS
26
- global USERS_TO_SUBMISSION_DATES
27
- if not REQUESTED_MODELS:
28
- REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
29
-
30
- user_name = ""
31
- model_path = model
32
- if "/" in model:
33
- user_name = model.split("/")[0]
34
- model_path = model.split("/")[1]
35
-
36
- precision = precision.split(" ")[0]
37
- current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
38
-
39
- if model_type is None or model_type == "":
40
- return styled_error("Please select a model type.")
41
-
42
- # Does the model actually exist?
43
- if revision == "":
44
- revision = "main"
45
-
46
- # Is the model on the hub?
47
- if weight_type in ["Delta", "Adapter"]:
48
- base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
49
- if not base_model_on_hub:
50
- return styled_error(f'Base model "{base_model}" {error}')
51
-
52
- if not weight_type == "Adapter":
53
- model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
54
- if not model_on_hub:
55
- return styled_error(f'Model "{model}" {error}')
56
-
57
- # Is the model info correctly filled?
58
- try:
59
- model_info = API.model_info(repo_id=model, revision=revision)
60
- except Exception:
61
- return styled_error("Could not get your model information. Please fill it up properly.")
62
-
63
- model_size = get_model_size(model_info=model_info, precision=precision)
64
-
65
- # Were the model card and license filled?
66
- try:
67
- license = model_info.cardData["license"]
68
- except Exception:
69
- return styled_error("Please select a license for your model")
70
-
71
- modelcard_OK, error_msg = check_model_card(model)
72
- if not modelcard_OK:
73
- return styled_error(error_msg)
74
-
75
- # Seems good, creating the eval
76
- print("Adding new eval")
77
 
78
- eval_entry = {
79
- "model": model,
80
- "base_model": base_model,
81
- "revision": revision,
82
- "precision": precision,
83
- "weight_type": weight_type,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  "status": "PENDING",
85
- "submitted_time": current_time,
86
- "model_type": model_type,
87
- "likes": model_info.likes,
88
- "params": model_size,
89
- "license": license,
90
- "private": False,
91
  }
92
-
93
- # Check for duplicate submission
94
- if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
95
- return styled_warning("This model has been already submitted.")
96
-
97
- print("Creating eval file")
98
- OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
99
- os.makedirs(OUT_DIR, exist_ok=True)
100
- out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
101
-
102
- with open(out_path, "w") as f:
103
- f.write(json.dumps(eval_entry))
104
-
105
- print("Uploading eval file")
106
- API.upload_file(
107
- path_or_fileobj=out_path,
108
- path_in_repo=out_path.split("eval-queue/")[1],
109
- repo_id=QUEUE_REPO,
110
- repo_type="dataset",
111
- commit_message=f"Add {model} to eval queue",
112
- )
113
-
114
- # Remove the local file
115
- os.remove(out_path)
116
-
117
- return styled_message(
118
- "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
119
- )
 
1
  import json
2
  import os
3
+ import re
4
+ import time
5
+ import uuid
6
+ from datetime import datetime
7
+ from pathlib import Path
8
 
9
+ import huggingface_hub
10
+ import requests
11
+ from huggingface_hub import HfApi
 
 
 
12
 
13
+ from src.display.utils import LibraryType, Language, AssessmentStatus
14
+ from src.envs import API, EVAL_REQUESTS_PATH, QUEUE_REPO, TOKEN
15
+ from src.submission.check_validity import is_repository_valid, get_library_info
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
+ def add_new_eval(
19
+ library_name,
20
+ library_version,
21
+ repository_url,
22
+ language,
23
+ framework,
24
+ library_type_str,
25
+ ) -> str:
26
+ """
27
+ Adds a new library to the assessment queue.
28
+
29
+ Args:
30
+ library_name: Name of the library (org/repo format)
31
+ library_version: Version of the library
32
+ repository_url: URL to the repository
33
+ language: Programming language
34
+ framework: Related framework/ecosystem
35
+ library_type_str: Type of AI library
36
+
37
+ Returns:
38
+ A message indicating the status of the submission
39
+ """
40
+ # Check if valid repository
41
+ is_valid, validity_message, library_info = is_repository_valid(library_name, repository_url)
42
+
43
+ if not is_valid:
44
+ return f"⚠️ Invalid submission: {validity_message}"
45
+
46
+ # Parse library type
47
+ library_type = LibraryType.from_str(library_type_str)
48
+ if library_type == LibraryType.Unknown:
49
+ return "⚠️ Please select a valid library type."
50
+
51
+ # Create a unique identifier for the submission
52
+ uid = uuid.uuid4().hex[:6]
53
+ timestamp = datetime.now().isoformat()
54
+ request_filename = f"{library_name.replace('/', '_')}_eval_request_{timestamp}_{uid}.json"
55
+
56
+ # Stars count and license info from library_info if available
57
+ stars = library_info.get("stars", 0)
58
+ license_name = library_info.get("license", "unknown")
59
+
60
+ # Create the assessment request JSON
61
+ assessment_request = {
62
+ "library": library_name,
63
+ "version": library_version,
64
+ "repository_url": repository_url,
65
+ "language": language,
66
+ "framework": framework,
67
+ "library_type": library_type.value.name,
68
+ "license": license_name,
69
+ "stars": stars,
70
  "status": "PENDING",
71
+ "submitted_time": timestamp,
72
+ "last_updated": timestamp,
73
+ "assessment_id": uid
 
 
 
74
  }
75
+
76
+ # Save the request
77
+ os.makedirs(EVAL_REQUESTS_PATH, exist_ok=True)
78
+ with open(os.path.join(EVAL_REQUESTS_PATH, request_filename), "w") as f:
79
+ json.dump(assessment_request, f, indent=2)
80
+
81
+ try:
82
+ # Push the file to the HF repo
83
+ path = Path(os.path.join(EVAL_REQUESTS_PATH, request_filename))
84
+ API.upload_file(
85
+ path_or_fileobj=path,
86
+ path_in_repo=request_filename,
87
+ repo_id=QUEUE_REPO,
88
+ repo_type="dataset",
89
+ )
90
+
91
+ return f"✅ Library '{library_name}' (version {library_version}) has been added to the assessment queue! Assessment ID: {uid}"
92
+
93
+ except Exception as e:
94
+ return f"Error uploading assessment request: {str(e)}"
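`add_new_eval` can also be exercised outside Gradio. A hedged sketch of a direct call: it validates the repository against GitHub, writes the request JSON under `EVAL_REQUESTS_PATH`, and then uploads it to `QUEUE_REPO`, so running it for real requires an `HF_TOKEN` with write access to that dataset repo. The arguments below are only an example target:

```python
# Direct call sketch (outside Gradio). This will attempt a real upload to
# QUEUE_REPO, so it needs a valid HF_TOKEN with write access.
from src.submission.submit import add_new_eval

message = add_new_eval(
    library_name="huggingface/transformers",                  # example target
    library_version="v4.40.0",                                 # illustrative version
    repository_url="https://github.com/huggingface/transformers",
    language="Python",
    framework="PyTorch",
    library_type_str="🔶 : llm framework",                     # as rendered by LibraryType.to_str(" : ")
)
print(message)  # success message with the assessment ID, or a validation error
```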