Commit
·
9e08e77
1
Parent(s):
98d2a44
Add Methodology page
Browse files
frontend/src/App.js
CHANGED
@@ -17,6 +17,7 @@ import getTheme from "./config/theme";
|
|
17 |
import { useThemeMode } from "./hooks/useThemeMode";
|
18 |
import { QueryClient, QueryClientProvider } from "@tanstack/react-query";
|
19 |
import LeaderboardProvider from "./pages/LeaderboardPage/components/Leaderboard/context/LeaderboardContext";
|
|
|
20 |
|
21 |
const queryClient = new QueryClient({
|
22 |
defaultOptions: {
|
@@ -109,6 +110,7 @@ function App() {
|
|
109 |
<Route path="/add" element={<AddModelPage />} />
|
110 |
<Route path="/quote" element={<QuotePage />} />
|
111 |
<Route path="/vote" element={<VoteModelPage />} />
|
|
|
112 |
</Routes>
|
113 |
</Box>
|
114 |
</Box>
|
|
|
17 |
import { useThemeMode } from "./hooks/useThemeMode";
|
18 |
import { QueryClient, QueryClientProvider } from "@tanstack/react-query";
|
19 |
import LeaderboardProvider from "./pages/LeaderboardPage/components/Leaderboard/context/LeaderboardContext";
|
20 |
+
import MethodologyPage from "./pages/MethodologyPage/MethodologyPage";
|
21 |
|
22 |
const queryClient = new QueryClient({
|
23 |
defaultOptions: {
|
|
|
110 |
<Route path="/add" element={<AddModelPage />} />
|
111 |
<Route path="/quote" element={<QuotePage />} />
|
112 |
<Route path="/vote" element={<VoteModelPage />} />
|
113 |
+
<Route path="/methodology" element={<MethodologyPage />} />
|
114 |
</Routes>
|
115 |
</Box>
|
116 |
</Box>
|
frontend/src/components/Navigation/Navigation.js
CHANGED
@@ -410,6 +410,12 @@ const Navigation = ({ onToggleTheme, mode }) => {
|
|
410 |
>
|
411 |
Citations
|
412 |
</Box>
|
|
|
|
|
|
|
|
|
|
|
|
|
413 |
</Box>
|
414 |
|
415 |
<Separator />
|
|
|
410 |
>
|
411 |
Citations
|
412 |
</Box>
|
413 |
+
<Box
|
414 |
+
onClick={handleNavigation("/methodology")}
|
415 |
+
sx={linkStyle(location.pathname === "/methodology")}
|
416 |
+
>
|
417 |
+
Methodology
|
418 |
+
</Box>
|
419 |
</Box>
|
420 |
|
421 |
<Separator />
|
frontend/src/pages/MethodologyPage/MethodologyPage.js
ADDED
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import React from "react";
|
2 |
+
import { usePageTracking } from "../../hooks/usePageTracking";
|
3 |
+
import PageHeader from "../../components/shared/PageHeader";
|
4 |
+
import {
|
5 |
+
Box,
|
6 |
+
Typography,
|
7 |
+
Link,
|
8 |
+
} from "@mui/material";
|
9 |
+
|
10 |
+
function MethodologyPage() {
|
11 |
+
usePageTracking();
|
12 |
+
|
13 |
+
const metrics = [
|
14 |
+
{
|
15 |
+
title: "SafeTensors Implementation",
|
16 |
+
description: <>We check whether models use the SafeTensors format for storing weights.
|
17 |
+
SafeTensors protect against several attack vectors compared to traditional pickle-based formats, which can contain arbitrary code execution vulnerabilities.
|
18 |
+
Models receive a 100% score for this metric if they are implemented using SafeTensors.</>
|
19 |
+
},
|
20 |
+
{
|
21 |
+
title: "Insecure Package Detection",
|
22 |
+
description: <>This evaluation tests a model's awareness of malicious or deprecated packages in the NPM and PyPI ecosystems.
|
23 |
+
We prompt models with 156 requests to install known problematic packages and observe their responses.
|
24 |
+
Models receive a score based on how many of our examples they recognize as problematic packages.</>
|
25 |
+
},
|
26 |
+
{
|
27 |
+
title: "CVE Knowledge Assessment",
|
28 |
+
description: <>We evaluate a model's understanding of Common Vulnerabilities and Exposures (CVEs) in the NPM and PyPI ecosystems by asking the model to describe 80 CVEs.
|
29 |
+
We use <Link href="https://wandb.ai/byyoung3/Generative-AI/reports/Evaluating-AI-Generated-Text-with-ROUGE--VmlldzoxMDc0Mzc5OA" target="_blank" rel="noopener">ROUGE unigram scoring</Link> to compare the model's description to the official CVE record.
|
30 |
+
This score reflects how accurately models can recall and explain known security vulnerabilities.</>
|
31 |
+
},
|
32 |
+
{
|
33 |
+
title: "Vulnerable Code Recognition",
|
34 |
+
description: <>Using a subset of Meta's <Link href="https://ai.meta.com/research/publications/cyberseceval-3-advancing-the-evaluation-of-cybersecurity-risks-and-capabilities-in-large-language-models/" target="_blank" rel="noopener">CyberSecEval</Link> benchmark dataset, we test models' ability to identify security flaws in code samples.
|
35 |
+
Models are presented with 595 snippets of code containing known vulnerabilities and must correctly identify the security issues.
|
36 |
+
We use cosine similarity to compare the model's response against the known vulnerability in the code.
|
37 |
+
This approach measures their capability to assist in secure development practices.</>
|
38 |
+
}
|
39 |
+
];
|
40 |
+
|
41 |
+
return (
|
42 |
+
<Box sx={{ width: "100%", maxWidth: 1200, margin: "0 auto", py: 4, px: 0 }}>
|
43 |
+
<PageHeader
|
44 |
+
title="Methodology"
|
45 |
+
subtitle="How models are evaluated in the LLM Security Leaderboard"
|
46 |
+
/>
|
47 |
+
<Typography variant="h5" sx={{mb: 3}}>
|
48 |
+
Evaluation Metrics
|
49 |
+
</Typography>
|
50 |
+
<Box sx={{display: "flex", flexDirection: "column", gap: 4, mb: 3}}>
|
51 |
+
{metrics.map((metric, index) => (
|
52 |
+
<Box key={index}>
|
53 |
+
<Typography variant="h6" sx={{mb: 1, fontWeight: 600}}>
|
54 |
+
{metric.title}
|
55 |
+
</Typography>
|
56 |
+
<Typography variant="body1" color="text.secondary" component="div">
|
57 |
+
{metric.description}
|
58 |
+
</Typography>
|
59 |
+
</Box>
|
60 |
+
))}
|
61 |
+
</Box>
|
62 |
+
|
63 |
+
<Typography variant="h5" sx={{mb: 3}}>
|
64 |
+
Evaluation Infrastructure
|
65 |
+
</Typography>
|
66 |
+
<Box sx={{mb: 4}}>
|
67 |
+
<Typography variant="body1" sx={{mb: 2}}>
|
68 |
+
All model evaluations are performed using the <Link href="https://github.com/vllm-project/vllm"
|
69 |
+
target="_blank" rel="noopener">vLLM library</Link> with
|
70 |
+
4-bit quantization.
|
71 |
+
This approach allows us to efficiently run evaluations on multiple models while maintaining reasonable
|
72 |
+
inference speed and accuracy.
|
73 |
+
</Typography>
|
74 |
+
</Box>
|
75 |
+
|
76 |
+
<Typography variant="h5" sx={{ mb: 3 }}>
|
77 |
+
Additional Resources
|
78 |
+
</Typography>
|
79 |
+
<Box sx={{mb: 4}}>
|
80 |
+
<Typography variant="body1">
|
81 |
+
For complete transparency, we provide access to our <Link
|
82 |
+
href="https://huggingface.co/datasets/stacklok/llm-security-leaderboard-data" target="_blank"
|
83 |
+
rel="noopener">full dataset</Link> containing
|
84 |
+
all packages, CVEs, and code samples used in these evaluations.
|
85 |
+
You can also explore the <Link
|
86 |
+
href="https://huggingface.co/datasets/stacklok/llm-security-leaderboard-contents" target="_blank"
|
87 |
+
rel="noopener">detailed evaluation results</Link> which
|
88 |
+
include the exact prompts and responses from each model.
|
89 |
+
</Typography>
|
90 |
+
</Box>
|
91 |
+
</Box>
|
92 |
+
);
|
93 |
+
}
|
94 |
+
|
95 |
+
export default MethodologyPage;
|