from dataclasses import dataclass
from enum import Enum
@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str
# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    # Risk domains from LibVulnWatch paper
    license = Task("license_validation", "score", "License Rating")
    security = Task("security_assessment", "score", "Security Rating")
    maintenance = Task("maintenance_health", "score", "Maintenance Rating")
    dependency = Task("dependency_management", "score", "Dependency Rating")
    regulatory = Task("regulatory_compliance", "score", "Regulatory Rating")
NUM_FEWSHOT = 0  # Not relevant for vulnerability assessment
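# Minimal usage sketch (assumption: the leaderboard app consumes these Task entries
# roughly like this; `domain_columns` is illustrative and not part of the original API).
def domain_columns() -> list[str]:
    """Return the display column name for each risk domain, e.g. "License Rating"."""
    return [task.value.col_name for task in Tasks]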
# ---------------------------------------------------
# Your leaderboard name
TITLE = """
# LibVulnWatch: Vulnerability Assessment Leaderboard
"""
# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
## LibVulnWatch – Continuous, Multi-Domain Risk Scoring for AI Libraries
_As presented at the **ACL 2025 Student Research Workshop** and the **ICML 2025 Technical AI Governance (TAIG) workshop**_, LibVulnWatch provides an evidence-based, end-to-end pipeline that uncovers **hidden vulnerabilities** in open-source AI libraries across five governance-aligned domains:
- **License Validation** – compatibility, provenance, obligations
- **Security Assessment** – CVEs, patch latency, exploit primitives
- **Maintenance Health** – bus factor, release cadence, contributor diversity
- **Dependency Management** – transitive risk, SBOM completeness
- **Regulatory Compliance** – privacy/export controls, policy documentation
In the paper we apply the framework to **20 popular libraries**, achieving **88 % coverage of OpenSSF Scorecard checks** and surfacing **up to 19 previously unreported risks per library**.
Lower domain scores indicate lower risk, and the **Trust Score** is the equal-weight average of the five domain scores.
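As a concrete illustration, here is a minimal sketch of the equal-weight aggregation (the numbers are made up and the variable names are ours, not the exact implementation):

```python
# Illustrative domain scores on the leaderboard's 0-10 scale (lower = lower risk).
domain_scores = {
    "license": 2.1,
    "security": 4.7,
    "maintenance": 3.0,
    "dependency": 5.2,
    "regulatory": 1.8,
}

# Trust Score: equal-weight average of the five domain scores.
trust_score = sum(domain_scores.values()) / len(domain_scores)
print(round(trust_score, 2))  # 3.36
```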
"""
# Which evaluations are you running? How can people reproduce what you have?
LLM_BENCHMARKS_TEXT = """
## Methodology at a Glance
LibVulnWatch orchestrates a **graph of specialized agents** powered by large language models. Each agent contributes one evidence layer and writes structured findings to a shared memory:
1. **Static agents** – license parsing, secret scanning, call-graph reachability
2. **Dynamic agents** – fuzzing harnesses, dependency-confusion probes, CVE replay
3. **Metadata agents** – GitHub mining, release-cadence modeling, community health
4. **Policy agents** – mapping evidence to NIST SSDF, the EU AI Act, and related frameworks
The aggregator agent converts raw findings into 0–10 scores per domain, producing a reproducible JSON result that is **88 % compatible with OpenSSF Scorecard checks**. All artifacts (SBOMs, logs, annotated evidence) are archived and linked in the public report.
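For intuition, a toy sketch of this aggregation step (the structure below is illustrative only, not the exact schema of the published reports):

```python
import json

# Illustrative agent findings: (domain, severity on the 0-10 scale) pairs.
findings = [
    ("security_assessment", 7.5),
    ("security_assessment", 4.0),
    ("license_validation", 2.0),
]

# Toy aggregation: average severity per domain; the real aggregator also weighs evidence quality.
per_domain = {}
for domain, severity in findings:
    per_domain.setdefault(domain, []).append(severity)
result = {domain: round(sum(vals) / len(vals), 2) for domain, vals in per_domain.items()}

print(json.dumps(result, indent=2))
```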
"""
EVALUATION_QUEUE_TEXT = """
## Before submitting a library for assessment
### 1) Ensure your library is publicly accessible
LibVulnWatch can only assess libraries that are publicly available on GitHub or another accessible repository.
### 2) Verify complete metadata is available
Our assessment relies on metadata including (a quick self-check sketch follows this list):
- License information
- Dependency specifications
- Maintenance history and contributor information
- Security policies and vulnerability handling processes
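A minimal local self-check before submitting (the file names below are common conventions, not a strict requirement of the assessment):

```python
from pathlib import Path

# Metadata files the assessment commonly draws on (illustrative list).
expected = ["LICENSE", "SECURITY.md", "requirements.txt", "pyproject.toml"]

repo = Path(".")  # run from your repository root
for name in expected:
    print(f"{name}: {'found' if (repo / name).exists() else 'missing'}")
```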
### 3) Make sure your repository has an open license
This leaderboard is designed for open-source AI libraries, which should have clear licensing terms.
### 4) Add security documentation
Libraries with comprehensive security documentation (for example, a security policy and a documented vulnerability disclosure process) tend to receive lower-risk assessments.
## If your assessment fails
If your library shows as "FAILED" in the assessment queue, check that:
- The repository is publicly accessible
- All required metadata files are present
- Dependencies can be resolved
- The repository doesn't employ obfuscation techniques that interfere with analysis
"""
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""@inproceedings{wu2025libvulnwatch,
  title={LibVulnWatch: A Deep Assessment Agent System and Leaderboard for Uncovering Hidden Vulnerabilities in Open-Source {AI} Libraries},
  author={Zekun Wu and Seonglae Cho and Umar Mohammed and Cristian Enrique Munoz Villalobos and Kleyton Da Costa and Xin Guan and Theo King and Ze Wang and Emre Kazim and Adriano Koshiyama},
  booktitle={ACL 2025 Student Research Workshop},
  year={2025},
  url={https://openreview.net/forum?id=yQzYEAL0BT}
}
@inproceedings{wu2025libvulnwatchtaig,
  title={LibVulnWatch: A Deep Assessment Agent System and Leaderboard for Uncovering Hidden Vulnerabilities in Open-Source {AI} Libraries},
  author={Zekun Wu and Seonglae Cho and Umar Mohammed and Cristian Enrique Munoz Villalobos and Kleyton Da Costa and Xin Guan and Theo King and Ze Wang and Emre Kazim and Adriano Koshiyama},
  booktitle={ICML Workshop on Technical AI Governance (TAIG)},
  year={2025},
  url={https://openreview.net/forum?id=MHhrr8QHgR}
}"""