diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000000000000000000000000000000000000..08d8d7a02ff4b4ba320292c8ac8841ce1eefcf97 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,11 @@ +node_modules +.next +.git +.gitignore +Dockerfile +README.md +out +.env +.vscode +.idea +*.log diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..5caf9aca7086aa7a1a7b34d328b1ebf6c1021be3 --- /dev/null +++ b/.gitignore @@ -0,0 +1,30 @@ +# See https://help.github.com/articles/ignoring-files/ for more about ignoring files. + +*.DS_Store + +# dependencies +/node_modules + +# next.js +/.next/ +/out/ + +# production +/build +/out + +# debug +npm-debug.log* +yarn-debug.log* +yarn-error.log* +.pnpm-debug.log* + +# env files +.env* + +# vercel +.vercel + +# typescript +*.tsbuildinfo +next-env.d.ts diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..b2a62b080b8f4fb4ff36cbca9f4adde59efec858 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,38 @@ +## Multi-stage Dockerfile for Next.js app (suitable for Hugging Face Spaces Docker runtime) +# - Builder stage installs deps and builds the Next app +# - Runner stage copies build artifacts and runs `npm run start` on $PORT (default 3000) + +FROM node:18-bullseye-slim AS builder +WORKDIR /app + +# install build deps and copy package files first for caching +COPY package*.json ./ +RUN npm ci --silent + +# copy source and build +COPY . 
./ +RUN npm run build + +FROM node:18-bullseye-slim AS runner +WORKDIR /app + +ENV NODE_ENV=production +ENV PORT=3000 + +# minimal packages for certificates (if needed by model download / https) +RUN apt-get update && apt-get install -y ca-certificates --no-install-recommends && rm -rf /var/lib/apt/lists/* + +# copy runtime artifacts from builder +COPY --from=builder /app/package*.json ./ +COPY --from=builder /app/node_modules ./node_modules +COPY --from=builder /app/.next ./.next +COPY --from=builder /app/public ./public +COPY --from=builder /app/next.config.js ./next.config.js + +# Expose the port the app will run on (Spaces expects the app to listen on this port) +EXPOSE 3000 + +# If you use private/gated HF models, set HF_TOKEN in the Space secrets and expose here +# e.g. in Space settings: add secret HF_TOKEN with your token + +CMD ["npm", "run", "start"] diff --git a/README.md b/README.md index bbed83a677923ed2bc2b3ad68e90da940c5113b9..de6d2054db07f077bb158de6516395083d27c86a 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,95 @@ --- -title: General Eval Card -emoji: 📚 -colorFrom: gray -colorTo: gray +title: AI Evaluation Dashboard +emoji: 📊 +colorFrom: blue +colorTo: indigo sdk: docker pinned: false +app_port: 3000 --- -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference +# AI Evaluation Dashboard + +This repository is a Next.js application for viewing and authoring AI evaluations. It includes demo evaluation fixtures under `public/evaluations/` and a dynamic details page that performs server-side rendering and route-handler based inference. 
+ +## Run locally + +Install dependencies and run the dev server: + +```bash +npm ci +npm run dev +``` + +Build for production and run: + +```bash +npm ci +npm run build +NODE_ENV=production PORT=3000 npm run start +``` + +## Docker (recommended for Hugging Face Spaces) + +A `Dockerfile` is included for deploying this app as a dynamic service on Hugging Face Spaces (Docker runtime). + +Build the image locally: + +```bash +docker build -t ai-eval-dashboard . +``` + +Run the container (expose port 3000): + +```bash +docker run -p 3000:3000 -e HF_TOKEN="$HF_TOKEN" ai-eval-dashboard +``` + +Visit `http://localhost:3000` to verify. + +### Deploy to Hugging Face Spaces + +1. Create a new Space at https://huggingface.co/new-space and choose **Docker** as the runtime. +2. Add a secret named `HF_TOKEN` (if you plan to access private or gated models or the Inference API) in the Space settings. +3. Push this repository to the Space Git (or upload files through the UI). The Space will build the Docker image using the included `Dockerfile` and serve your app on port 3000. + +Notes: +- The app's server may attempt to construct ML pipelines server-side if you use Transformers.js and large models; prefer small/quantized models or use the Hugging Face Inference API instead (see below). +- If your build needs native dependencies (e.g. `sharp`), the Docker image may require extra apt packages; update the Dockerfile accordingly. + +## Alternative: Use Hugging Face Inference API (avoid hosting model weights) + +If downloading and running model weights inside the Space is impractical (memory/disk limits), modify the server route to proxy requests to the Hugging Face Inference API. 
+ +Example server-side call (Route Handler): + +```js +const resp = await fetch('https://api-inference.huggingface.co/models/<MODEL_ID>', { + method: 'POST', + headers: { Authorization: `Bearer ${process.env.HF_TOKEN}`, 'Content-Type': 'application/json' }, + body: JSON.stringify({ inputs: text }) +}) +const json = await resp.json() +``` + +Replace `<MODEL_ID>` with the Hub id of the model you want to query. Store `HF_TOKEN` in the Space secrets and your route will be able to call the API. + +## Troubleshooting + +- Build fails in Spaces: check the build logs; you may need extra apt packages or to pin the Node version. +- Runtime OOM / killed: the model is too large for Spaces; use the Inference API or a smaller model. + +## What this change adds + +- `Dockerfile` — multi-stage build for production +- `.dockerignore` — to reduce image size +- Updated `README.md` with Spaces frontmatter and deployment instructions + +Possible follow-ups: +- Modify the Dockerfile to use Next.js standalone mode for a smaller runtime image. +- Add a small health-check route and a simple `docker-compose.yml` for local testing. + +Feedback on which of these to prioritize is welcome. 
import fs from "fs";
import path from "path";

/**
 * Builds the list of `[id]` route params from the evaluation fixtures stored
 * in `public/evaluations` (resolved against the current working directory).
 *
 * Only regular files with a `.json` extension are read, so stray entries such
 * as `.DS_Store` or sub-directories no longer abort the build. Fixtures that
 * do not declare an `id` are skipped, and every id is normalised to a string
 * because route params are strings.
 *
 * @returns {Promise<Array<{ id: string }>>} One entry per fixture that declares an id.
 * @throws {Error} If the fixtures directory cannot be listed, or if a `.json`
 *   fixture cannot be read or parsed (the original error is attached as `cause`).
 */
export async function generateStaticParams() {
  const evaluationsDir = path.join(process.cwd(), "public/evaluations");

  // withFileTypes lets us drop directories without an extra stat() per entry.
  const jsonFiles = fs
    .readdirSync(evaluationsDir, { withFileTypes: true })
    .filter((entry) => entry.isFile() && path.extname(entry.name).toLowerCase() === ".json")
    .map((entry) => entry.name);

  const params = [];
  for (const file of jsonFiles) {
    const filePath = path.join(evaluationsDir, file);

    let data;
    try {
      data = JSON.parse(fs.readFileSync(filePath, "utf-8"));
    } catch (err) {
      // A broken fixture should still fail the build, but say which file it was.
      throw new Error(`Failed to load evaluation fixture ${filePath}`, { cause: err });
    }

    // `== null` intentionally matches both null and undefined (and a JSON `null` document).
    if (data?.id == null) continue;

    params.push({ id: String(data.id) });
  }

  return params;
}
if (data.id === id) { + return data + } + } catch (error) { + console.error(`Failed to load evaluation data from ${file}:`, error) + } + } + + return null +} + +export default function EvaluationDetailsPage() { + const params = useParams() + const router = useRouter() + const evaluationId = params.id as string + + const [evaluation, setEvaluation] = useState(null) + const [loading, setLoading] = useState(true) + const [expandedAreas, setExpandedAreas] = useState>({}) + const toggleArea = (area: string) => setExpandedAreas((p) => ({ ...p, [area]: !p[area] })) + const [expandedNegatives, setExpandedNegatives] = useState>({}) + const toggleNegatives = (key: string) => setExpandedNegatives((p) => ({ ...p, [key]: !p[key] })) + const [visibleCategories, setVisibleCategories] = useState>({}) + const toggleCategoryVisibility = (id: string) => setVisibleCategories((p) => ({ ...p, [id]: !p[id] })) + const selectAll = () => { + const map: Record = {} + ;(evaluation.selectedCategories || []).forEach((id: string) => (map[id] = true)) + setVisibleCategories(map) + } + const deselectAll = () => { + const map: Record = {} + ;(evaluation.selectedCategories || []).forEach((id: string) => (map[id] = false)) + setVisibleCategories(map) + } + + // Persist visibility in localStorage per evaluation + const STORAGE_KEY = `eval:${evaluationId}:visibleCategories` + useEffect(() => { + try { + const raw = localStorage.getItem(STORAGE_KEY) + if (raw) { + const parsed = JSON.parse(raw) + setVisibleCategories(parsed) + return + } + } catch (e) { + // ignore + } + + // if nothing saved, initialize defaults (visible) + if (evaluation?.selectedCategories) { + const init: Record = {} + evaluation.selectedCategories.forEach((id: string) => { + init[id] = true + }) + setVisibleCategories((p) => ({ ...init, ...p })) + } + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [evaluationId, evaluation?.selectedCategories]) + + useEffect(() => { + try { + localStorage.setItem(STORAGE_KEY, 
JSON.stringify(visibleCategories)) + } catch (e) { + // ignore + } + }, [visibleCategories, evaluationId]) + + useEffect(() => { + const loadData = async () => { + const data = await loadEvaluationDetails(evaluationId) + setEvaluation(data) + setLoading(false) + } + loadData() + }, [evaluationId]) + + if (loading) { + return ( +
+
+
+

Loading evaluation details...

+
+
+ ) + } + + if (!evaluation) { + return ( +
+
+

Evaluation Not Found

+ +
+
+ ) + } + + return ( +
+
+
+ + +
+ +
+

{evaluation.systemName}

+

{evaluation.provider}

+
+
+ + {/* System Information */} + + + System Information + + +
+

System Version

+

{evaluation.version}

+
+
+

Deployment Context

+

{evaluation.deploymentContext}

+
+
+

Evaluation Date

+

{evaluation.evaluationDate}

+
+
+

Evaluator

+

{evaluation.evaluator}

+
+
+

Modality

+

{evaluation.modality}

+
+
+

Completeness Score

+

{evaluation.overallStats?.completenessScore || "N/A"}%

+
+
+
+ + {/* Applicable Categories - split into Capabilities & Risks with visibility toggles */} + + +
+ Applicable Categories ({evaluation.selectedCategories?.length || 0}) +
+ + +
+
+
+ +
+
+
Capabilities
+
+ {evaluation.selectedCategories + ?.map((id: string) => CATEGORIES.find((c) => c.id === id)) + .filter(Boolean) + .filter((c: any) => c.type === "capability") + .map((category: any) => ( + + ))} +
+
+ +
+
Risks
+
+ {evaluation.selectedCategories + ?.map((id: string) => CATEGORIES.find((c) => c.id === id)) + .filter(Boolean) + .filter((c: any) => c.type === "risk") + .map((category: any) => ( + + ))} +
+
+
+
+
+ + {/* Overall Statistics */} + + + Overall Statistics + + +
+
+
+ {evaluation.overallStats?.strongCategories?.length || 0} +
+
Strong
+
+
+
+ {evaluation.overallStats?.adequateCategories?.length || 0} +
+
Adequate
+
+
+
+ {evaluation.overallStats?.weakCategories?.length || 0} +
+
Weak
+
+
+
+ {evaluation.overallStats?.insufficientCategories?.length || 0} +
+
Insufficient
+
+
+
+
+ + {/* Priority Areas (show only weak/insufficient like results) */} + {((evaluation.overallStats?.weakCategories || []).length > 0 || (evaluation.overallStats?.insufficientCategories || []).length > 0) && ( + + + Priority Areas + + +
+ {[...(evaluation.overallStats?.insufficientCategories || []), ...(evaluation.overallStats?.weakCategories || [])] + .filter(Boolean) + .map((catId: string) => { + const category = CATEGORIES.find((c) => c.id === catId) + return ( +
+
+
{category?.name || catId}
+
{category?.description}
+
+ + {evaluation.overallStats?.insufficientCategories?.includes(catId) ? "insufficient" : "weak"} + +
+ ) + })} +
+
+
+ )} + + {/* Evaluation Details */} + {evaluation.categoryEvaluations && + Object.entries(evaluation.categoryEvaluations) + .filter(([categoryId]) => visibleCategories[categoryId] ?? true) + .map(([categoryId, data]: [string, any]) => { + const category = CATEGORIES.find((c) => c.id === categoryId) + + // compute per-category score (yes out of applicable (yes+no)) across A & B + const benchmarkQs = BENCHMARK_QUESTIONS.map((q) => q.id) + const processQs = PROCESS_QUESTIONS.map((q) => q.id) + let yesCount = 0 + let noCount = 0 + let naCount = 0 + + for (const qid of benchmarkQs) { + const raw = data.benchmarkAnswers?.[qid] + const answers = Array.isArray(raw) ? raw : raw ? [raw] : [] + const hasYes = answers.some((a: string) => String(a).toLowerCase() === "yes") + const hasNo = answers.some((a: string) => String(a).toLowerCase() === "no") + const hasNA = answers.length === 0 || answers.some((a: string) => String(a).toLowerCase().includes("not applicable") || String(a).toLowerCase() === "n/a") + if (hasYes) yesCount++ + else if (hasNo) noCount++ + else if (hasNA) naCount++ + else naCount++ + } + + for (const qid of processQs) { + const raw = data.processAnswers?.[qid] + const answers = Array.isArray(raw) ? raw : raw ? [raw] : [] + const hasYes = answers.some((a: string) => String(a).toLowerCase() === "yes") + const hasNo = answers.some((a: string) => String(a).toLowerCase() === "no") + const hasNA = answers.length === 0 || answers.some((a: string) => String(a).toLowerCase().includes("not applicable") || String(a).toLowerCase() === "n/a") + if (hasYes) yesCount++ + else if (hasNo) noCount++ + else if (hasNA) naCount++ + else naCount++ + } + + const totalApplicable = yesCount + noCount + const scoreText = totalApplicable > 0 ? 
`${yesCount}/${totalApplicable}` : "N/A" + let rating = "Unknown" + if (evaluation.overallStats?.strongCategories?.includes(categoryId)) rating = "Strong" + else if (evaluation.overallStats?.adequateCategories?.includes(categoryId)) rating = "Adequate" + else if (evaluation.overallStats?.weakCategories?.includes(categoryId)) rating = "Weak" + else if (evaluation.overallStats?.insufficientCategories?.includes(categoryId)) rating = "Insufficient" + + const ratingClass = + rating === "Strong" + ? "inline-flex items-center rounded-md px-2 py-0.5 text-xs font-medium bg-green-100 text-green-700" + : rating === "Adequate" + ? "inline-flex items-center rounded-md px-2 py-0.5 text-xs font-medium bg-blue-100 text-blue-700" + : rating === "Weak" + ? "inline-flex items-center rounded-md px-2 py-0.5 text-xs font-medium bg-yellow-100 text-yellow-800" + : "inline-flex items-center rounded-md px-2 py-0.5 text-xs font-medium bg-red-100 text-red-700" + + return ( + + +
+
+ + {category?.name || categoryId} + + {category?.type || "unknown"} + + +

{category?.description}

+
+ +
+
Score
+
{scoreText}
+
+ {rating} +
+
+
+
+ + {/* Benchmark Questions */} + {data.benchmarkSources && ( +
+

Part A: Benchmark & Testing

+
+ {(() => { + const entries = Object.entries(data.benchmarkSources || {}) as [string, any][] + const yesItems: any[] = [] + const noItems: any[] = [] + const naItems: any[] = [] + + // iterate the union of known source keys and answer keys so we show questions + const canonicalKeys = BENCHMARK_QUESTIONS.map((q) => q.id) + const answerKeys = Object.keys(data.benchmarkAnswers || {}) + const sourceKeys = Object.keys(data.benchmarkSources || {}) + const keySet = new Set([...canonicalKeys, ...answerKeys, ...sourceKeys]) + for (const questionId of Array.from(keySet)) { + const sources = data.benchmarkSources?.[questionId] || [] + const qText = BENCHMARK_QUESTIONS.find((x) => x.id === questionId)?.text || questionId + const rawAnswer = data.benchmarkAnswers?.[questionId] + const answers = Array.isArray(rawAnswer) ? rawAnswer : rawAnswer ? [rawAnswer] : [] + const hasYes = answers.some((a: string) => String(a).toLowerCase() === "yes") + const hasNo = answers.some((a: string) => String(a).toLowerCase() === "no") + const hasNA = answers.length === 0 || answers.some((a: string) => String(a).toLowerCase().includes("not applicable") || String(a).toLowerCase() === "n/a") + + const reason = + sources?.[0]?.scope || sources?.[0]?.description || data.additionalAspects || (hasNA ? "Not applicable" : undefined) + + if (hasYes) yesItems.push({ questionId, qText, sources }) + else if (hasNo) noItems.push({ questionId, qText }) + else if (hasNA) naItems.push({ questionId, qText, reason }) + else naItems.push({ questionId, qText, reason: reason || "Not applicable" }) + } + + return ( + <> + {yesItems.map((it) => { + const key = `bench-${categoryId}-${it.questionId}` + return ( +
+
toggleNegatives(key)} + className="flex items-center gap-2 mb-2 justify-between cursor-pointer" + > +
+ {it.questionId}: +
{it.qText}
+
+
+ yes +
+
+ + {expandedNegatives[key] && (() => { + const cards = (it.sources || []).flatMap((src: any) => { + const names = String(src.benchmarkName || '') + .split(',') + .map((s: string) => s.trim()) + .filter(Boolean) + + const scoreParts = String(src.score || '') + .split(',') + .map((s: string) => s.trim()) + .filter(Boolean) + + return (names.length > 0 ? names : ['Benchmark']).map((name: string, idx: number) => { + // determine score for this benchmark (positional or by name) or fallback to any numeric + let scoreNum: number | undefined + if (scoreParts.length === names.length && scoreParts[idx]) { + const m = scoreParts[idx].match(/(\d+(?:\.\d+)?)/) + if (m) scoreNum = parseFloat(m[1]) + } else if (scoreParts.length > 0) { + const byName = scoreParts.find((p: string) => p.toLowerCase().includes(name.toLowerCase())) + const m = (byName || scoreParts[0]).match(/(\d+(?:\.\d+)?)/) + if (m) scoreNum = parseFloat(m[1]) + } else if (src?.score) { + const m = String(src.score).match(/(\d+(?:\.\d+)?)/) + if (m) scoreNum = parseFloat(m[1]) + } + + return ( +
+
+
Percentage
+
{scoreNum != null ? `${scoreNum}%` : '—'}
+
+ +
{name}
+ + {scoreNum != null && ( +
+
+
+
+
+ )} + +
+
+ Source:{' '} + {src.url ? ( + + {src.url} + + ) : ( + '—' + )} +
+
+ Type: {src.sourceType || src.documentType || '—'} +
+ {src.metrics && ( +
+ Metric: {src.metrics} +
+ )} + {src.confidenceInterval && ( +
+ Confidence Interval: {src.confidenceInterval} +
+ )} + {src.description && ( +
{src.description}
+ )} +
+
+ ) + }) + }) + + if (cards.length === 0) return
No benchmark details available.
+ + return
{cards}
+ })()} +
+ ) + })} + + {noItems.map((it) => ( +
+
+
+ {it.questionId}: +
{it.qText}
+
+
+ no +
+
+
+ ))} + + {naItems.length > 0 && ( +
+
+
Not applicable ({naItems.length})
+ +
+ + {expandedNegatives[`bench-na-${categoryId}`] && ( +
+ {naItems.map((it) => ( +
+
+
+ {it.questionId}: {it.qText} +
+
Reason: {it.reason}
+
+
+ ))} +
+ )} +
+ )} + + ) + })()} +
+
+ )} + + {/* Process Questions */} + {data.processSources && ( +
+

Part B: Documentation & Process

+
+ {(() => { + const entries = Object.entries(data.processSources || {}) as [string, any][] + const yesItems: any[] = [] + const noItems: any[] = [] + const naItems: any[] = [] + + const canonicalKeys = PROCESS_QUESTIONS.map((q) => q.id) + const answerKeys = Object.keys(data.processAnswers || {}) + const sourceKeys = Object.keys(data.processSources || {}) + const keySet = new Set([...canonicalKeys, ...answerKeys, ...sourceKeys]) + for (const questionId of Array.from(keySet)) { + const sources = data.processSources?.[questionId] || [] + const qText = PROCESS_QUESTIONS.find((x) => x.id === questionId)?.text || questionId + const rawAnswer = data.processAnswers?.[questionId] + const answers = Array.isArray(rawAnswer) ? rawAnswer : rawAnswer ? [rawAnswer] : [] + const hasYes = answers.some((a: string) => String(a).toLowerCase() === "yes") + const hasNo = answers.some((a: string) => String(a).toLowerCase() === "no") + const hasNA = answers.length === 0 || answers.some((a: string) => String(a).toLowerCase().includes("not applicable") || String(a).toLowerCase() === "n/a") + + const reason = sources?.[0]?.scope || sources?.[0]?.description || data.additionalAspects || (hasNA ? "Not applicable" : undefined) + + if (hasYes) yesItems.push({ questionId, qText, sources }) + else if (hasNo) noItems.push({ questionId, qText }) + else if (hasNA) naItems.push({ questionId, qText, reason }) + else naItems.push({ questionId, qText, reason: reason || "Not applicable" }) + } + + return ( + <> + {yesItems.map((it) => { + const key = `proc-${categoryId}-${it.questionId}` + return ( +
+
toggleNegatives(key)} + className="flex items-center gap-2 mb-2 justify-between cursor-pointer" + > +
+ {it.questionId}: +
{it.qText}
+
+
+ yes +
+
+ + {expandedNegatives[key] && ( +
+ {(it.sources || []).map((src: any, i: number) => ( +
+
+
+ URL: {src?.url || '—'} +
+
+ Document Type: {src?.documentType || src?.sourceType || '—'} +
+
+ {src?.description && ( +
+ Description: {src.description} +
+ )} +
+ ))} +
+ )} +
+ ) + })} + + {noItems.map((it) => ( +
+
+
+ {it.questionId}: +
{it.qText}
+
+
+ no +
+
+
+ ))} + + {naItems.length > 0 && ( +
+
+
Not applicable ({naItems.length})
+ +
+ + {expandedNegatives[`proc-na-${categoryId}`] && ( +
+ {naItems.map((it) => ( +
+
+
+ {it.questionId}: {it.qText} +
+
Reason: {it.reason}
+
+
+ ))} +
+ )} +
+ )} + + ) + })()} +
+
+ )} + + {/* Additional Aspects */} + {data.additionalAspects && ( +
+

Part C: Additional Aspects

+
+

{data.additionalAspects}

+
+
+ )} + + + ) + })} +
+ ) +} diff --git a/app/evaluation/[id]/page.tsx b/app/evaluation/[id]/page.tsx new file mode 100644 index 0000000000000000000000000000000000000000..8876d97b0554264793c9a398b9bcbb7d4fde6368 --- /dev/null +++ b/app/evaluation/[id]/page.tsx @@ -0,0 +1,9 @@ +export { generateStaticParams } from "./generateStaticParams" + +import ClientPage from "./page.client" + +export default function PageWrapper() { + return +} + + diff --git a/app/evaluation/[id]/server.ts b/app/evaluation/[id]/server.ts new file mode 100644 index 0000000000000000000000000000000000000000..3ce12956be4f5f5df2169a36b987e762cda34ede --- /dev/null +++ b/app/evaluation/[id]/server.ts @@ -0,0 +1 @@ +export { generateStaticParams } from "./generateStaticParams"; diff --git a/app/globals.css b/app/globals.css new file mode 100644 index 0000000000000000000000000000000000000000..fed21fe766883d88e99495cf7e469ad0ef3d398c --- /dev/null +++ b/app/globals.css @@ -0,0 +1,127 @@ +@import "tailwindcss"; +@import "tw-animate-css"; + +@custom-variant dark (&:is(.dark *)); + +:root { + --background: oklch(1 0 0); /* #ffffff - Clean white background */ + --foreground: oklch(0.205 0 0); /* #1f2937 - Dark gray for main text */ + --card: oklch(0.97 0 0); /* #f1f5f9 - Light gray for cards */ + --card-foreground: oklch(0.439 0 0); /* #6b7280 - Mid-tone gray for card text */ + --popover: oklch(1 0 0); /* #ffffff - White for popovers */ + --popover-foreground: oklch(0.205 0 0); /* #1f2937 - Dark gray for popover text */ + --primary: oklch(0.205 0 0); /* #1f2937 - Primary dark gray */ + --primary-foreground: oklch(1 0 0); /* #ffffff - White text on primary */ + --secondary: oklch(0.646 0.222 280.116); /* #8b5cf6 - Purple accent */ + --secondary-foreground: oklch(1 0 0); /* #ffffff - White text on accent */ + --muted: oklch(0.97 0 0); /* #f1f5f9 - Muted light gray */ + --muted-foreground: oklch(0.439 0 0); /* #6b7280 - Muted text color */ + --accent: oklch(0.646 0.222 280.116); /* #8b5cf6 - Purple for interactive elements */ + 
--accent-foreground: oklch(1 0 0); /* #ffffff - White text on accent */ + --destructive: oklch(0.577 0.245 27.325); /* #dc2626 - Red for errors */ + --destructive-foreground: oklch(1 0 0); /* #ffffff - White text on destructive */ + --border: oklch(0.922 0 0); /* #e5e7eb - Light gray borders */ + --input: oklch(0.985 0 0); /* #f9fafb - Very light gray for inputs */ + --ring: oklch(0.646 0.222 280.116 / 0.5); /* Purple focus ring with opacity */ + --chart-1: oklch(0.488 0.243 264.376); /* #4f46e5 - Indigo for charts */ + --chart-2: oklch(0.6 0.118 184.704); /* #3b82f6 - Blue for charts */ + --chart-3: oklch(0.696 0.17 162.48); /* #22c55e - Green for charts */ + --chart-4: oklch(0.828 0.189 84.429); /* #fbbf24 - Yellow for charts */ + --chart-5: oklch(0.627 0.265 303.9); /* #ef4444 - Red for charts */ + --radius: 0.5rem; /* Consistent border radius */ + --sidebar: oklch(0.97 0 0); /* #f1f5f9 - Light gray sidebar */ + --sidebar-foreground: oklch(0.205 0 0); /* #1f2937 - Dark gray sidebar text */ + --sidebar-primary: oklch(0.205 0 0); /* #1f2937 - Primary sidebar color */ + --sidebar-primary-foreground: oklch(1 0 0); /* #ffffff - White text on sidebar primary */ + --sidebar-accent: oklch(0.646 0.222 280.116); /* #8b5cf6 - Purple sidebar accent */ + --sidebar-accent-foreground: oklch(1 0 0); /* #ffffff - White text on sidebar accent */ + --sidebar-border: oklch(0.922 0 0); /* #e5e7eb - Light gray sidebar borders */ + --sidebar-ring: oklch(0.646 0.222 280.116 / 0.5); /* Purple sidebar focus ring */ + --font-heading: var(--font-space-grotesk); + --font-sans: var(--font-dm-sans); +} + +.dark { + --background: oklch(0.145 0 0); + --foreground: oklch(0.985 0 0); + --card: oklch(0.145 0 0); + --card-foreground: oklch(0.985 0 0); + --popover: oklch(0.145 0 0); + --popover-foreground: oklch(0.985 0 0); + --primary: oklch(0.985 0 0); + --primary-foreground: oklch(0.205 0 0); + --secondary: oklch(0.269 0 0); + --secondary-foreground: oklch(0.985 0 0); + --muted: oklch(0.269 0 0); 
+ --muted-foreground: oklch(0.708 0 0); + --accent: oklch(0.269 0 0); + --accent-foreground: oklch(0.985 0 0); + --destructive: oklch(0.396 0.141 25.723); + --destructive-foreground: oklch(0.637 0.237 25.331); + --border: oklch(0.269 0 0); + --input: oklch(0.269 0 0); + --ring: oklch(0.439 0 0); + --chart-1: oklch(0.488 0.243 264.376); + --chart-2: oklch(0.696 0.17 162.48); + --chart-3: oklch(0.769 0.188 70.08); + --chart-4: oklch(0.627 0.265 303.9); + --chart-5: oklch(0.645 0.246 16.439); + --sidebar: oklch(0.205 0 0); + --sidebar-foreground: oklch(0.985 0 0); + --sidebar-primary: oklch(0.488 0.243 264.376); + --sidebar-primary-foreground: oklch(0.985 0 0); + --sidebar-accent: oklch(0.269 0 0); + --sidebar-accent-foreground: oklch(0.985 0 0); + --sidebar-border: oklch(0.269 0 0); + --sidebar-ring: oklch(0.439 0 0); +} + +@theme inline { + --color-background: var(--background); + --color-foreground: var(--foreground); + --color-card: var(--card); + --color-card-foreground: var(--card-foreground); + --color-popover: var(--popover); + --color-popover-foreground: var(--popover-foreground); + --color-primary: var(--primary); + --color-primary-foreground: var(--primary-foreground); + --color-secondary: var(--secondary); + --color-secondary-foreground: var(--secondary-foreground); + --color-muted: var(--muted); + --color-muted-foreground: var(--muted-foreground); + --color-accent: var(--accent); + --color-accent-foreground: var(--accent-foreground); + --color-destructive: var(--destructive); + --color-destructive-foreground: var(--destructive-foreground); + --color-border: var(--border); + --color-input: var(--input); + --color-ring: var(--ring); + --color-chart-1: var(--chart-1); + --color-chart-2: var(--chart-2); + --color-chart-3: var(--chart-3); + --color-chart-4: var(--chart-4); + --color-chart-5: var(--chart-5); + --radius-sm: calc(var(--radius) - 4px); + --radius-md: calc(var(--radius) - 2px); + --radius-lg: var(--radius); + --radius-xl: calc(var(--radius) + 4px); 
+ --color-sidebar: var(--sidebar); + --color-sidebar-foreground: var(--sidebar-foreground); + --color-sidebar-primary: var(--sidebar-primary); + --color-sidebar-primary-foreground: var(--sidebar-primary-foreground); + --color-sidebar-accent: var(--sidebar-accent); + --color-sidebar-accent-foreground: var(--sidebar-accent-foreground); + --color-sidebar-border: var(--sidebar-border); + --color-sidebar-ring: var(--sidebar-ring); + --font-heading: var(--font-space-grotesk); + --font-sans: var(--font-dm-sans); +} + +@layer base { + * { + @apply border-border outline-ring/50; + } + body { + @apply bg-background text-foreground; + } +} diff --git a/app/layout.tsx b/app/layout.tsx new file mode 100644 index 0000000000000000000000000000000000000000..ddd3e4bda713343c3baf0906208bd4f4770e07d2 --- /dev/null +++ b/app/layout.tsx @@ -0,0 +1,39 @@ +import type React from "react" +import type { Metadata } from "next" +import { Space_Grotesk, DM_Sans } from "next/font/google" +import "./globals.css" +import { ThemeProvider } from "@/components/theme-provider" + +const spaceGrotesk = Space_Grotesk({ + subsets: ["latin"], + display: "swap", + variable: "--font-space-grotesk", +}) + +const dmSans = DM_Sans({ + subsets: ["latin"], + display: "swap", + variable: "--font-dm-sans", +}) + +export const metadata: Metadata = { + title: "AI Evaluation Dashboard", + description: "Professional AI system evaluation and assessment tool", + generator: "v0.app", +} + +export default function RootLayout({ + children, +}: Readonly<{ + children: React.ReactNode +}>) { + return ( + + + + {children} + + + + ) +} diff --git a/app/page.tsx b/app/page.tsx new file mode 100644 index 0000000000000000000000000000000000000000..1f04a5ff0541bd3f16af9ed89fd3f2ce8ed00c2b --- /dev/null +++ b/app/page.tsx @@ -0,0 +1,566 @@ +"use client" + +import { useState, useMemo, useEffect } from "react" +import { Button } from "@/components/ui/button" +import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from 
"@/components/ui/select" +import { Plus, Moon, Sun, Filter, ArrowUpDown } from "lucide-react" +import { useTheme } from "next-themes" +import { EvaluationCard, type EvaluationCardData } from "@/components/evaluation-card" +import { BENCHMARK_QUESTIONS, PROCESS_QUESTIONS } from "@/lib/category-data" +import { AIEvaluationDashboard } from "@/components/ai-evaluation-dashboard" + +const loadEvaluationData = async (): Promise => { + const evaluationFiles = [ + "/evaluations/gpt-4-turbo.json", + "/evaluations/claude-3-sonnet.json", + "/evaluations/gemini-pro.json", + "/evaluations/fraud-detector.json", + ] + + const additionalFiles = [] + for (let i = 1; i <= 10; i++) { + additionalFiles.push(`/evaluations/eval-${Date.now() - i * 86400000}.json`) // Check for files from last 10 days + } + + const allFiles = [...evaluationFiles, ...additionalFiles] + const evaluations: EvaluationCardData[] = [] + + for (const file of allFiles) { + try { + const response = await fetch(file) + if (!response.ok) continue // Skip files that don't exist + + const data = await response.json() + + const cardData: EvaluationCardData = { + id: data.id || `eval-${Date.now()}`, + systemName: data.systemName || "Unknown System", + provider: data.provider || "Unknown Provider", + modality: data.modality || "Unknown", + completedDate: data.evaluationDate || new Date().toISOString().split("T")[0], + applicableCategories: data.overallStats?.totalApplicable || 0, + completedCategories: data.overallStats?.totalApplicable || 0, + status: + data.overallStats?.strongCategories?.length >= (data.overallStats?.adequateCategories?.length || 0) + ? "strong" + : data.overallStats?.adequateCategories?.length >= (data.overallStats?.weakCategories?.length || 0) + ? 
"adequate" + : "weak", + capabilityEval: { + strong: (data.overallStats?.strongCategories || []).filter((cat: string) => + [ + "language-communication", + "social-intelligence", + "problem-solving", + "creativity-innovation", + "learning-memory", + "perception-vision", + "physical-manipulation", + "metacognition", + "robotic-intelligence", + ].includes(cat), + ).length, + adequate: (data.overallStats?.adequateCategories || []).filter((cat: string) => + [ + "language-communication", + "social-intelligence", + "problem-solving", + "creativity-innovation", + "learning-memory", + "perception-vision", + "physical-manipulation", + "metacognition", + "robotic-intelligence", + ].includes(cat), + ).length, + weak: (data.overallStats?.weakCategories || []).filter((cat: string) => + [ + "language-communication", + "social-intelligence", + "problem-solving", + "creativity-innovation", + "learning-memory", + "perception-vision", + "physical-manipulation", + "metacognition", + "robotic-intelligence", + ].includes(cat), + ).length, + insufficient: (data.overallStats?.insufficientCategories || []).filter((cat: string) => + [ + "language-communication", + "social-intelligence", + "problem-solving", + "creativity-innovation", + "learning-memory", + "perception-vision", + "physical-manipulation", + "metacognition", + "robotic-intelligence", + ].includes(cat), + ).length, + strongCategories: (data.overallStats?.strongCategories || []).filter((cat: string) => + [ + "language-communication", + "social-intelligence", + "problem-solving", + "creativity-innovation", + "learning-memory", + "perception-vision", + "physical-manipulation", + "metacognition", + "robotic-intelligence", + ].includes(cat), + ), + adequateCategories: (data.overallStats?.adequateCategories || []).filter((cat: string) => + [ + "language-communication", + "social-intelligence", + "problem-solving", + "creativity-innovation", + "learning-memory", + "perception-vision", + "physical-manipulation", + "metacognition", + 
"robotic-intelligence", + ].includes(cat), + ), + weakCategories: (data.overallStats?.weakCategories || []).filter((cat: string) => + [ + "language-communication", + "social-intelligence", + "problem-solving", + "creativity-innovation", + "learning-memory", + "perception-vision", + "physical-manipulation", + "metacognition", + "robotic-intelligence", + ].includes(cat), + ), + insufficientCategories: (data.overallStats?.insufficientCategories || []).filter((cat: string) => + [ + "language-communication", + "social-intelligence", + "problem-solving", + "creativity-innovation", + "learning-memory", + "perception-vision", + "physical-manipulation", + "metacognition", + "robotic-intelligence", + ].includes(cat), + ), + totalApplicable: data.overallStats?.capabilityApplicable || 0, + }, + riskEval: { + strong: (data.overallStats?.strongCategories || []).filter((cat: string) => + [ + "harmful-content", + "information-integrity", + "privacy-data", + "bias-fairness", + "security-robustness", + "dangerous-capabilities", + "human-ai-interaction", + "environmental-impact", + "economic-displacement", + "governance-accountability", + "value-chain", + ].includes(cat), + ).length, + adequate: (data.overallStats?.adequateCategories || []).filter((cat: string) => + [ + "harmful-content", + "information-integrity", + "privacy-data", + "bias-fairness", + "security-robustness", + "dangerous-capabilities", + "human-ai-interaction", + "environmental-impact", + "economic-displacement", + "governance-accountability", + "value-chain", + ].includes(cat), + ).length, + weak: (data.overallStats?.weakCategories || []).filter((cat: string) => + [ + "harmful-content", + "information-integrity", + "privacy-data", + "bias-fairness", + "security-robustness", + "dangerous-capabilities", + "human-ai-interaction", + "environmental-impact", + "economic-displacement", + "governance-accountability", + "value-chain", + ].includes(cat), + ).length, + insufficient: (data.overallStats?.insufficientCategories 
|| []).filter((cat: string) => + [ + "harmful-content", + "information-integrity", + "privacy-data", + "bias-fairness", + "security-robustness", + "dangerous-capabilities", + "human-ai-interaction", + "environmental-impact", + "economic-displacement", + "governance-accountability", + "value-chain", + ].includes(cat), + ).length, + strongCategories: (data.overallStats?.strongCategories || []).filter((cat: string) => + [ + "harmful-content", + "information-integrity", + "privacy-data", + "bias-fairness", + "security-robustness", + "dangerous-capabilities", + "human-ai-interaction", + "environmental-impact", + "economic-displacement", + "governance-accountability", + "value-chain", + ].includes(cat), + ), + adequateCategories: (data.overallStats?.adequateCategories || []).filter((cat: string) => + [ + "harmful-content", + "information-integrity", + "privacy-data", + "bias-fairness", + "security-robustness", + "dangerous-capabilities", + "human-ai-interaction", + "environmental-impact", + "economic-displacement", + "governance-accountability", + "value-chain", + ].includes(cat), + ), + weakCategories: (data.overallStats?.weakCategories || []).filter((cat: string) => + [ + "harmful-content", + "information-integrity", + "privacy-data", + "bias-fairness", + "security-robustness", + "dangerous-capabilities", + "human-ai-interaction", + "environmental-impact", + "economic-displacement", + "governance-accountability", + "value-chain", + ].includes(cat), + ), + insufficientCategories: (data.overallStats?.insufficientCategories || []).filter((cat: string) => + [ + "harmful-content", + "information-integrity", + "privacy-data", + "bias-fairness", + "security-robustness", + "dangerous-capabilities", + "human-ai-interaction", + "environmental-impact", + "economic-displacement", + "governance-accountability", + "value-chain", + ].includes(cat), + ), + totalApplicable: data.overallStats?.riskApplicable || 0, + }, + priorityAreas: data.overallStats?.priorityAreas || [], + 
priorityDetails: (() => { + // Build a richer structure: for each area, include yes questions and negative questions (no/na) with optional reason + const pd: Record< + string, + { + yes: string[] + negative: { text: string; status: "no" | "na"; reason?: string }[] + } + > = {} + const areas = data.overallStats?.priorityAreas || [] + for (const area of areas) { + const catEval = data.categoryEvaluations?.[area] + if (!catEval) continue + + const yesList: string[] = [] + const negList: { text: string; status: "no" | "na"; reason?: string }[] = [] + + // Helper to detect NA reason from category metadata + const naReasonFromMeta = (): string | undefined => { + if (typeof catEval.additionalAspects === "string" && /not applicable/i.test(catEval.additionalAspects)) { + return catEval.additionalAspects + } + // look into processSources scopes for any note + if (catEval.processSources) { + for (const entries of Object.values(catEval.processSources)) { + if (Array.isArray(entries)) { + for (const ent of entries as any[]) { + if (ent && typeof ent.scope === "string" && /not applicable/i.test(ent.scope)) { + return ent.scope + } + } + } + } + } + return undefined + } + + const naMeta = naReasonFromMeta() + + // check benchmarkAnswers (A1..A6) + if (catEval.benchmarkAnswers) { + for (const [qid, ans] of Object.entries(catEval.benchmarkAnswers)) { + const answer = ans + const isArray = Array.isArray(answer) + const negative = answer === "no" || (isArray && (answer as any[]).includes("no")) + const positive = answer === "yes" || (isArray && (answer as any[]).includes("yes")) + const qText = BENCHMARK_QUESTIONS.find((x) => x.id === qid)?.text || qid + if (positive) yesList.push(qText) + if (negative) { + const status = naMeta ? 
"na" : "no" + negList.push({ text: qText, status, reason: naMeta }) + } + } + } + + // check processAnswers (B1..B6) + if (catEval.processAnswers) { + for (const [qid, ans] of Object.entries(catEval.processAnswers)) { + const answer = ans + const isArray = Array.isArray(answer) + const negative = answer === "no" || (isArray && (answer as any[]).includes("no")) + const positive = answer === "yes" || (isArray && (answer as any[]).includes("yes")) + const qText = PROCESS_QUESTIONS.find((x) => x.id === qid)?.text || qid + if (positive) yesList.push(qText) + if (negative) { + const status = naMeta ? "na" : "no" + negList.push({ text: qText, status, reason: naMeta }) + } + } + } + + if (yesList.length || negList.length) pd[area] = { yes: yesList, negative: negList } + } + return pd + })(), + } + + evaluations.push(cardData) + } catch (error) { + continue + } + } + + return evaluations +} + +export default function HomePage() { + const { theme, setTheme } = useTheme() + const [showNewEvaluation, setShowNewEvaluation] = useState(false) + const [evaluationsData, setEvaluationsData] = useState([]) + const [loading, setLoading] = useState(true) + + useEffect(() => { + const loadData = async () => { + const data = await loadEvaluationData() + setEvaluationsData(data) + setLoading(false) + } + loadData() + }, []) + + const [sortBy, setSortBy] = useState<"date-newest" | "date-oldest">("date-newest") + const [filterByProvider, setFilterByProvider] = useState("all") + const [filterByModality, setFilterByModality] = useState("all") + + const uniqueProviders = useMemo(() => { + const providers = [...new Set(evaluationsData.map((item) => item.provider))].sort() + return providers + }, [evaluationsData]) + + const uniqueModalities = useMemo(() => { + const modalities = [...new Set(evaluationsData.map((item) => item.modality))].sort() + return modalities + }, [evaluationsData]) + + const filteredAndSortedEvaluations = useMemo(() => { + let filtered = evaluationsData + + if 
(filterByProvider !== "all") { + filtered = filtered.filter((item) => item.provider === filterByProvider) + } + + if (filterByModality !== "all") { + filtered = filtered.filter((item) => item.modality === filterByModality) + } + + filtered = [...filtered].sort((a, b) => { + const dateA = new Date(a.completedDate) + const dateB = new Date(b.completedDate) + + if (sortBy === "date-newest") { + return dateB.getTime() - dateA.getTime() + } else { + return dateA.getTime() - dateB.getTime() + } + }) + + return filtered + }, [evaluationsData, sortBy, filterByProvider, filterByModality]) + + const handleViewEvaluation = (id: string) => {} + + const handleDeleteEvaluation = (id: string) => { + setEvaluationsData((prev) => prev.filter((evaluation) => evaluation.id !== id)) + } + + const handleSaveEvaluation = (newEvaluation: EvaluationCardData) => { + setEvaluationsData((prev) => [newEvaluation, ...prev]) + } + + if (showNewEvaluation) { + return setShowNewEvaluation(false)} onSaveEvaluation={handleSaveEvaluation} /> + } + + if (loading) { + return ( +
+
+
+

Loading evaluations...

+
+
+ ) + } + + return ( +
+
+
+
+
+

AI Evaluation Dashboard

+

Manage and track your AI system evaluations

+
+
+ + +
+
+
+
+ +
+
+
+

Evaluation Cards

+

{filteredAndSortedEvaluations.length} eval cards

+
+ +
+
+ + Sort by: + +
+ +
+ + Provider: + +
+ +
+ + Modality: + +
+
+ + {filteredAndSortedEvaluations.length > 0 ? ( +
+ {filteredAndSortedEvaluations.map((evaluation) => ( + + ))} +
+ ) : ( +
+
+ +
+

No evaluations match your filters

+

Try adjusting your filter criteria to see more results

+ +
+ )} +
+
+
+ ) +} diff --git a/components.json b/components.json new file mode 100644 index 0000000000000000000000000000000000000000..335484f9424bf72b98e3b892275740bc8f014754 --- /dev/null +++ b/components.json @@ -0,0 +1,21 @@ +{ + "$schema": "https://ui.shadcn.com/schema.json", + "style": "new-york", + "rsc": true, + "tsx": true, + "tailwind": { + "config": "", + "css": "app/globals.css", + "baseColor": "neutral", + "cssVariables": true, + "prefix": "" + }, + "aliases": { + "components": "@/components", + "utils": "@/lib/utils", + "ui": "@/components/ui", + "lib": "@/lib", + "hooks": "@/hooks" + }, + "iconLibrary": "lucide" +} \ No newline at end of file diff --git a/components/ai-evaluation-dashboard.tsx b/components/ai-evaluation-dashboard.tsx new file mode 100644 index 0000000000000000000000000000000000000000..bf8932b14d50501b1f810c9d6037e9d785986086 --- /dev/null +++ b/components/ai-evaluation-dashboard.tsx @@ -0,0 +1,350 @@ +"use client" + +import { useState } from "react" +import { Progress } from "@/components/ui/progress" +import { Badge } from "@/components/ui/badge" +import { Button } from "@/components/ui/button" +import { ArrowLeft } from "lucide-react" +import { SystemInfoForm } from "./system-info-form" +import { CategorySelection } from "./category-selection" +import { CategoryEvaluation } from "./category-evaluation" +import { EvaluationForm } from "./evaluation-form" +import { ResultsDashboard } from "./results-dashboard" +import { CATEGORIES } from "@/lib/category-data" + +export type SystemInfo = { + name: string + url: string + provider: string + systemTypes: string[] + deploymentContexts: string[] + modality: string + modelTag?: string + knowledgeCutoff?: string + modelType?: "foundational" | "fine-tuned" | "na" + inputModalities?: string[] + outputModalities?: string[] +} + +export type CategoryScore = { + benchmarkScore: number + processScore: number + totalScore: number + status: "strong" | "adequate" | "weak" | "insufficient" | "not-evaluated" + // 
optional metadata + totalQuestions?: number + totalApplicable?: number + naCount?: number +} + +export type EvaluationData = { + systemInfo: SystemInfo | null + selectedCategories: string[] + excludedCategoryReasons?: Record + categoryScores: Record + currentCategory: string | null +} + +interface AIEvaluationDashboardProps { + onBack?: () => void + onSaveEvaluation?: (evaluation: any) => void +} + +export function AIEvaluationDashboard({ onBack, onSaveEvaluation }: AIEvaluationDashboardProps) { + const [currentStep, setCurrentStep] = useState<"system-info" | "categories" | "evaluation" | "results">("system-info") + const [currentCategoryIndex, setCurrentCategoryIndex] = useState(0) + const [evaluationData, setEvaluationData] = useState({ + systemInfo: null, + selectedCategories: [], + categoryScores: {}, + currentCategory: null, + }) + + const steps = [ + { id: "system-info", label: "System Info", number: 1 }, + { id: "categories", label: "Categories", number: 2 }, + { id: "evaluation", label: "Evaluation", number: 3 }, + { id: "results", label: "Results", number: 4 }, + ] + + const getOverallProgress = () => { + if (currentStep === "system-info") return 10 + if (currentStep === "categories") return 25 + if (currentStep === "evaluation") { + const completed = Object.keys(evaluationData.categoryScores).length + const total = evaluationData.selectedCategories.length + return total > 0 ? 
25 + (completed / total) * 65 : 25 + } + return 100 + } + + const handleSystemInfoComplete = (systemInfo: SystemInfo) => { + setEvaluationData((prev) => ({ ...prev, systemInfo })) + setCurrentStep("categories") + } + + const handleCategoriesSelected = (categories: string[]) => { + setEvaluationData((prev) => ({ ...prev, selectedCategories: categories })) + setCurrentCategoryIndex(0) + setCurrentStep("evaluation") + } + + const handleCategoriesSelectedWithReasons = (categories: string[], excludedReasons: Record) => { + setEvaluationData((prev) => ({ ...prev, selectedCategories: categories, excludedCategoryReasons: excludedReasons })) + setCurrentCategoryIndex(0) + setCurrentStep("evaluation") + } + + const handleCategoryComplete = (categoryId: string, score: CategoryScore) => { + console.log("[v0] handleCategoryComplete called with:", { categoryId, score }) + + setEvaluationData((prev) => { + const newCategoryScores = { ...prev.categoryScores, [categoryId]: score } + console.log("[v0] Updated categoryScores:", newCategoryScores) + return { + ...prev, + categoryScores: newCategoryScores, + } + }) + + const nextIndex = currentCategoryIndex + 1 + console.log( + "[v0] Current index:", + currentCategoryIndex, + "Next index:", + nextIndex, + "Total categories:", + evaluationData.selectedCategories.length, + ) + + if (nextIndex >= evaluationData.selectedCategories.length) { + console.log("[v0] All categories complete, moving to results") + setCurrentStep("results") + } else { + console.log("[v0] Moving to next category at index:", nextIndex) + setCurrentCategoryIndex(nextIndex) + } + } + + const handleSaveEvaluation = async () => { + console.log("[v0] handleSaveEvaluation called") + console.log("[v0] evaluationData:", evaluationData) + + if (!evaluationData.systemInfo || evaluationData.selectedCategories.length === 0) { + alert("Please complete system information and select categories before saving.") + return + } + + const timestamp = Date.now() + const evaluationId = 
`eval-${timestamp}` + console.log("[v0] Generated evaluationId:", evaluationId) + + console.log("[v0] Processing category scores:", evaluationData.categoryScores) + + const capabilityCategories = evaluationData.selectedCategories.filter((cat) => { + const category = CATEGORIES.find((c) => c.id === cat) + console.log("[v0] Category check:", cat, "type:", category?.type) + return category?.type === "capability" + }) + console.log("[v0] Capability categories:", capabilityCategories) + + const riskCategories = evaluationData.selectedCategories.filter((cat) => { + const category = CATEGORIES.find((c) => c.id === cat) + return category?.type === "risk" + }) + console.log("[v0] Risk categories:", riskCategories) + + const strongCategories = Object.entries(evaluationData.categoryScores) + .filter(([_, score]) => { + console.log("[v0] Checking score for strong:", score) + return score.status === "strong" + }) + .map(([catId]) => catId) + console.log("[v0] Strong categories:", strongCategories) + + const adequateCategories = Object.entries(evaluationData.categoryScores) + .filter(([_, score]) => score.status === "adequate") + .map(([catId]) => catId) + console.log("[v0] Adequate categories:", adequateCategories) + + const weakCategories = Object.entries(evaluationData.categoryScores) + .filter(([_, score]) => score.status === "weak") + .map(([catId]) => catId) + console.log("[v0] Weak categories:", weakCategories) + + const insufficientCategories = Object.entries(evaluationData.categoryScores) + .filter(([_, score]) => score.status === "insufficient") + .map(([catId]) => catId) + console.log("[v0] Insufficient categories:", insufficientCategories) + + const evaluationJson = { + id: evaluationId, + systemName: evaluationData.systemInfo.name, + provider: evaluationData.systemInfo.provider, + version: evaluationData.systemInfo.url || "1.0", + deploymentContext: evaluationData.systemInfo.deploymentContexts.join(", ") || "Production", + evaluator: "Current User", + modality: 
evaluationData.systemInfo.modality, + evaluationDate: new Date().toISOString().split("T")[0], + selectedCategories: evaluationData.selectedCategories, + excludedCategoryReasons: evaluationData.excludedCategoryReasons || {}, + categoryEvaluations: evaluationData.categoryScores, + overallStats: { + completenessScore: 85, // Safe default value + totalApplicable: evaluationData.selectedCategories.length, + capabilityApplicable: capabilityCategories.length, + riskApplicable: riskCategories.length, + strongCategories, + adequateCategories, + weakCategories, + insufficientCategories, + }, + } + + console.log("[v0] Final evaluationJson:", evaluationJson) + + try { + console.log("[v0] Creating blob and download") + const blob = new Blob([JSON.stringify(evaluationJson, null, 2)], { type: "application/json" }) + const url = URL.createObjectURL(blob) + const a = document.createElement("a") + a.href = url + a.download = `${evaluationId}.json` + document.body.appendChild(a) + a.click() + document.body.removeChild(a) + URL.revokeObjectURL(url) + + console.log("[v0] Download completed successfully") + alert( + `Evaluation saved as ${evaluationId}.json. Please upload this file to the public/evaluations/ directory to see it on the homepage.`, + ) + onBack?.() + } catch (error) { + console.error("[v0] Error saving evaluation:", error) + alert("Error saving evaluation. Please try again.") + } + } + + const renderCurrentStep = () => { + switch (currentStep) { + case "system-info": + return + case "categories": + return ( + + ) + case "evaluation": + return ( + handleCategoryComplete(categoryId, score)} + onComplete={() => setCurrentStep("results")} + /> + ) + case "results": + return ( + + ) + default: + return null + } + } + + return ( +
+ {/* Header */} +
+
+
+
+ {onBack && ( + + )} +
+

New Eval Card

+

Create comprehensive AI system evaluation card

+
+
+
+
+

Overall Progress

+
+ + {Math.round(getOverallProgress())}% +
+
+ {evaluationData.systemInfo && ( + + {evaluationData.systemInfo.name} + + )} + +
+
+
+
+ + {/* Step tabs navigation */} +
+
+
+ {steps.map((step) => { + const isActive = currentStep === step.id + const isCompleted = + (step.id === "system-info" && evaluationData.systemInfo) || + (step.id === "categories" && evaluationData.selectedCategories.length > 0) || + (step.id === "evaluation" && Object.keys(evaluationData.categoryScores).length > 0) || + (step.id === "results" && currentStep === "results") + + return ( +
+
+ {step.number} +
+ {step.label} +
+ ) + })} +
+
+
+ +
{renderCurrentStep()}
+
+ ) +} diff --git a/components/category-evaluation.tsx b/components/category-evaluation.tsx new file mode 100644 index 0000000000000000000000000000000000000000..c0d9ae4025fe3fccb05142495ec2d749e3943dbb --- /dev/null +++ b/components/category-evaluation.tsx @@ -0,0 +1,934 @@ +"use client" + +import { useState, useEffect, useMemo } from "react" +import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card" +import { Button } from "@/components/ui/button" +import { RadioGroup, RadioGroupItem } from "@/components/ui/radio-group" +import { Label } from "@/components/ui/label" +import { Textarea } from "@/components/ui/textarea" +import { Input } from "@/components/ui/input" +import { Badge } from "@/components/ui/badge" +import { Separator } from "@/components/ui/separator" +import type { CategoryScore } from "@/components/ai-evaluation-dashboard" +import { HelpCircle, CheckCircle, Plus, Trash2 } from "lucide-react" +import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from "@/components/ui/tooltip" +import { BENCHMARK_QUESTIONS, PROCESS_QUESTIONS, SOURCE_TYPES, ADDITIONAL_ASPECTS_SECTION, getFieldPlaceholder, getHint } from "@/lib/category-data" + +// The detailed per-category and per-question hints, plus recommended placeholders, +// are centralized in `lib/category-data.ts`. This component uses the exported +// helpers `getHint` and `getFieldPlaceholder` and the question lists. 
+ +const CustomFieldComponent = ({ + questionId, + fieldType, + value, + onChange, +}: { + questionId: string + fieldType: string + value: string + onChange: (value: string) => void +}) => { + const getFieldConfig = (questionId: string, fieldType: string) => { + const configs: Record> = { + A2: { + thresholds: { label: "Quantitative Thresholds", placeholder: "e.g., >85% accuracy, <0.1 error rate" }, + thresholdSource: { + label: "Threshold Source", + placeholder: "e.g., industry standard, research paper, policy requirement", + }, + passFail: { label: "Pass/Fail Determination", placeholder: "e.g., Pass - exceeded 85% threshold" }, + }, + A3: { + comparativeScores: { + label: "Comparative Scores", + placeholder: "e.g., Our model: 87.2%, GPT-4: 85.1%, Previous version: 82.3%", + }, + baselineType: { label: "Baseline Type", placeholder: "e.g., SOTA, previous version, industry standard" }, + significance: { label: "Statistical Significance", placeholder: "e.g., p<0.05, 95% CI: [1.2, 3.8]" }, + }, + A4: { + testTypes: { label: "Test Types", placeholder: "e.g., adversarial attacks, load testing, distribution shift" }, + failureRates: { label: "Failure/Degradation Rates", placeholder: "e.g., 15% failure under adversarial inputs" }, + robustnessMetrics: { + label: "Robustness Metrics", + placeholder: "e.g., attack success rate, performance drop %", + }, + }, + A5: { + liveMetrics: { label: "Live Metrics Tracked", placeholder: "e.g., error rates, latency, drift detection" }, + samplingCadence: { label: "Sampling Cadence", placeholder: "e.g., every 1000 requests, hourly, daily" }, + alertThresholds: { label: "Alert Thresholds", placeholder: "e.g., >5% error rate, >500ms latency" }, + }, + A6: { + procedure: { + label: "Contamination Check Procedure", + placeholder: "e.g., n-gram overlap analysis, URL deduplication", + }, + contaminationRate: { + label: "Contamination Rate", + placeholder: "e.g., <1% overlap detected, 0.3% exact matches", + }, + mitigations: { label: 
"Mitigations Taken", placeholder: "e.g., removed overlapping samples, used holdout set" }, + }, + A7: { + comparisonSystems: { label: "Comparison Systems", placeholder: "e.g., GPT-4, Claude-3, Gemini Pro" }, + evaluationConditions: { + label: "Evaluation Conditions", + placeholder: "e.g., same prompts, temperature=0, identical hardware", + }, + relativeMetrics: { + label: "Relative Performance Metrics", + placeholder: "e.g., 15% better accuracy, 2x faster inference", + }, + }, + B1: { + scope: { + label: "Evaluation Scope", + placeholder: "e.g., measures reasoning capability in mathematical contexts", + }, + successFailureDefinitions: { + label: "Success/Failure Definitions", + placeholder: "e.g., success = >80% on grade-level problems", + }, + hypotheses: { label: "Hypotheses Being Tested", placeholder: "e.g., model can solve multi-step word problems" }, + }, + B2: { + replicationPackage: { + label: "Replication Package", + placeholder: "e.g., GitHub repo with code, configs, prompts", + }, + accessLevel: { label: "Access Level", placeholder: "e.g., public, access-controlled, internal only" }, + proxies: { label: "Proxies (if not shareable)", placeholder: "e.g., synthetic examples, anonymized data" }, + }, + B5: { + reviewers: { label: "Reviewers", placeholder: "e.g., domain experts, affected user groups, ethics board" }, + feedbackChanges: { + label: "Changes from Feedback", + placeholder: "e.g., added bias metrics, revised interpretation", + }, + disagreements: { + label: "Unresolved Disagreements", + placeholder: "e.g., threshold levels, risk severity ratings", + }, + }, + B6: { + uncertaintyDisclosure: { + label: "Uncertainty Disclosure", + placeholder: "e.g., error bars, confidence intervals, variance across runs", + }, + axesConsistency: { label: "Axes Consistency", placeholder: "e.g., consistent 0-100 scale, no truncated axes" }, + sampleSizes: { label: "Sample Sizes", placeholder: "e.g., n=1000 test samples, 5 random seeds" }, + selectionCriteria: { label: 
"Selection Criteria", placeholder: "e.g., all results shown, no cherry-picking" }, + }, + B8: { + triggers: { + label: "Re-evaluation Triggers", + placeholder: "e.g., model updates, data drift >5%, security incidents", + }, + versionedSpecs: { label: "Versioned Eval Specs", placeholder: "e.g., eval spec v2.1, change log maintained" }, + auditTrail: { label: "Audit Trail", placeholder: "e.g., all changes logged with timestamps and rationale" }, + mitigationProtocols: { + label: "Mitigation Protocols", + placeholder: "e.g., automated rollback, manual review process", + }, + retestProcedures: { + label: "Retest Procedures", + placeholder: "e.g., full eval suite after fixes, regression testing", + }, + }, + } + + return configs[questionId]?.[fieldType] || { label: fieldType, placeholder: "" } + } + + const config = getFieldConfig(questionId, fieldType) + + return ( +
+ +