llm-security-leaderboard

Running

App Files Community

eleftherias committed on Apr 22

Commit

9e08e77

1 Parent(s): 98d2a44

Add Methodology page

Browse files

Files changed (3) hide show

frontend/src/App.js +2 -0
frontend/src/components/Navigation/Navigation.js +6 -0
frontend/src/pages/MethodologyPage/MethodologyPage.js +95 -0

frontend/src/App.js CHANGED Viewed

@@ -17,6 +17,7 @@ import getTheme from "./config/theme";
 import { useThemeMode } from "./hooks/useThemeMode";
 import { QueryClient, QueryClientProvider } from "@tanstack/react-query";
 import LeaderboardProvider from "./pages/LeaderboardPage/components/Leaderboard/context/LeaderboardContext";
 const queryClient = new QueryClient({
   defaultOptions: {
@@ -109,6 +110,7 @@ function App() {
                     <Route path="/add" element={<AddModelPage />} />
                     <Route path="/quote" element={<QuotePage />} />
                     <Route path="/vote" element={<VoteModelPage />} />
                   </Routes>
                 </Box>
               </Box>

 import { useThemeMode } from "./hooks/useThemeMode";
 import { QueryClient, QueryClientProvider } from "@tanstack/react-query";
 import LeaderboardProvider from "./pages/LeaderboardPage/components/Leaderboard/context/LeaderboardContext";
+import MethodologyPage from "./pages/MethodologyPage/MethodologyPage";
 const queryClient = new QueryClient({
   defaultOptions: {
                     <Route path="/add" element={<AddModelPage />} />
                     <Route path="/quote" element={<QuotePage />} />
                     <Route path="/vote" element={<VoteModelPage />} />
+                    <Route path="/methodology" element={<MethodologyPage />} />
                   </Routes>
                 </Box>
               </Box>

frontend/src/components/Navigation/Navigation.js CHANGED Viewed

@@ -410,6 +410,12 @@ const Navigation = ({ onToggleTheme, mode }) => {
               >
                 Citations
               </Box>
             </Box>
             <Separator />

               >
                 Citations
               </Box>
+              <Box
+                onClick={handleNavigation("/methodology")}
+                sx={linkStyle(location.pathname === "/methodology")}
+              >
+                Methodology
+              </Box>
             </Box>
             <Separator />

frontend/src/pages/MethodologyPage/MethodologyPage.js ADDED Viewed

	@@ -0,0 +1,95 @@

+import React from "react";
+import { usePageTracking } from "../../hooks/usePageTracking";
+import PageHeader from "../../components/shared/PageHeader";
+import {
+  Box,
+  Typography,
+  Link,
+} from "@mui/material";
+function MethodologyPage() { // React component for the "/methodology" route: renders a page header, the list of evaluation metrics, an infrastructure note, and links to the published datasets. Takes no props.
+  usePageTracking(); // Project hook from hooks/usePageTracking, invoked on every render; its behaviour is not visible here — presumably page-view analytics, TODO confirm.
+  const metrics = [ // One { title, description } entry per metric; description is a JSX fragment (it may embed <Link> elements), not a plain string.
+    {
+      title: "SafeTensors Implementation",
+      description: <>We check whether models use the SafeTensors format for storing weights.
+        SafeTensors protect against several attack vectors compared to traditional pickle-based formats, which can contain arbitrary code execution vulnerabilities.
+        Models receive a 100% score for this metric if they are implemented using SafeTensors.</>
+    },
+    {
+      title: "Insecure Package Detection",
+      description: <>This evaluation tests a model's awareness of malicious or deprecated packages in the NPM and PyPI ecosystems.
+        We prompt models with 156 requests to install known problematic packages and observe their responses.
+        Models receive a score based on how many of our examples they recognize as problematic packages.</>
+    },
+    {
+      title: "CVE Knowledge Assessment",
+      description: <>We evaluate a model's understanding of Common Vulnerabilities and Exposures (CVEs) in the NPM and PyPI ecosystems by asking the model to describe 80 CVEs.
+        We use <Link href="https://wandb.ai/byyoung3/Generative-AI/reports/Evaluating-AI-Generated-Text-with-ROUGE--VmlldzoxMDc0Mzc5OA" target="_blank" rel="noopener">ROUGE unigram scoring</Link> to compare the model's description to the official CVE record.
+        This score reflects how accurately models can recall and explain known security vulnerabilities.</>
+    },
+    {
+      title: "Vulnerable Code Recognition",
+      description: <>Using a subset of Meta's <Link href="https://ai.meta.com/research/publications/cyberseceval-3-advancing-the-evaluation-of-cybersecurity-risks-and-capabilities-in-large-language-models/" target="_blank" rel="noopener">CyberSecEval</Link> benchmark dataset, we test models' ability to identify security flaws in code samples.
+        Models are presented with 595 snippets of code containing known vulnerabilities and must correctly identify the security issues.
+        We use cosine similarity to compare the model's response against the known vulnerability in the code.
+        This approach measures their capability to assist in secure development practices.</>
+    }
+  ];
+  return ( // Layout, top to bottom: PageHeader, "Evaluation Metrics" list, "Evaluation Infrastructure" note, "Additional Resources" links.
+    <Box sx={{ width: "100%", maxWidth: 1200, margin: "0 auto", py: 4, px: 0 }}>
+      <PageHeader
+        title="Methodology"
+        subtitle="How models are evaluated in the LLM Security Leaderboard"
+      />
+      <Typography variant="h5" sx={{mb: 3}}>
+        Evaluation Metrics
+      </Typography>
+      <Box sx={{display: "flex", flexDirection: "column", gap: 4, mb: 3}}>
+        {metrics.map((metric, index) => ( // metrics is the fixed literal defined above and is never reordered, so the array index is used as the React key.
+          <Box key={index}>
+            <Typography variant="h6" sx={{mb: 1, fontWeight: 600}}>
+              {metric.title}
+            </Typography>
+            <Typography variant="body1" color="text.secondary" component="div">
+              {metric.description}
+            </Typography>
+          </Box>
+        ))}
+      </Box>
+      <Typography variant="h5" sx={{mb: 3}}>
+        Evaluation Infrastructure
+      </Typography>
+      <Box sx={{mb: 4}}>
+        <Typography variant="body1" sx={{mb: 2}}>
+          All model evaluations are performed using the <Link href="https://github.com/vllm-project/vllm"
+                                                              target="_blank" rel="noopener">vLLM library</Link> with
+          4-bit quantization.
+          This approach allows us to efficiently run evaluations on multiple models while maintaining reasonable
+          inference speed and accuracy.
+        </Typography>
+      </Box>
+      <Typography variant="h5" sx={{ mb: 3 }}>
+        Additional Resources
+      </Typography>
+      <Box sx={{mb: 4}}>
+        <Typography variant="body1">
+          For complete transparency, we provide access to our <Link
+            href="https://huggingface.co/datasets/stacklok/llm-security-leaderboard-data" target="_blank"
+            rel="noopener">full dataset</Link> containing
+          all packages, CVEs, and code samples used in these evaluations.
+          You can also explore the <Link
+            href="https://huggingface.co/datasets/stacklok/llm-security-leaderboard-contents" target="_blank"
+            rel="noopener">detailed evaluation results</Link> which
+          include the exact prompts and responses from each model.
+        </Typography>
+      </Box>
+    </Box>
+  );
+}
+export default MethodologyPage;