Avijit Ghosh committed on
Commit 49d5ba7 · 1 Parent(s): 7016d85

fixed a lot of bugs, centralized schema

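The diff below replaces direct imports of CATEGORIES, BENCHMARK_QUESTIONS, and PROCESS_QUESTIONS from @/lib/category-data with accessor functions imported from a centralized @/lib/schema module. That module itself is not included in this diff; the TypeScript sketch below is only an inference of the shape those accessors are assumed to have, based on how the call sites use them, and is not the file shipped in this commit.

// lib/schema.ts -- hypothetical sketch, not the actual file from this commit.
export type Question = { id: string; text: string }
export type Category = {
  id: string
  name: string
  description: string
  type: "capability" | "risk"
}

// Placeholder store: the real module centralizes the actual definitions here;
// the backing format (inline constants, JSON, etc.) is not visible in this diff.
const SCHEMA = {
  categories: [] as Category[],
  benchmarkQuestions: [] as Question[],
  processQuestions: [] as Question[],
}

export const getAllCategories = (): Category[] => SCHEMA.categories
export const getCategoryById = (id: string): Category | undefined =>
  SCHEMA.categories.find((c) => c.id === id)
export const getBenchmarkQuestions = (): Question[] => SCHEMA.benchmarkQuestions
export const getProcessQuestions = (): Question[] => SCHEMA.processQuestions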
app/evaluation/[id]/page.client.tsx CHANGED
@@ -6,9 +6,8 @@ import { Button } from "@/components/ui/button"
6
  import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card"
7
  import { Badge } from "@/components/ui/badge"
8
  import { ArrowLeft, Download, Eye, EyeOff } from "lucide-react"
9
- import { CATEGORIES } from "@/lib/category-data"
10
  import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from "@/components/ui/tooltip"
11
- import { BENCHMARK_QUESTIONS, PROCESS_QUESTIONS } from "@/lib/category-data"
12
  import { naReasonForCategoryFromEval } from "@/lib/na-utils"
13
 
14
  const loadEvaluationDetails = async (id: string) => {
@@ -50,12 +49,12 @@ export default function EvaluationDetailsPage() {
50
  const toggleCategoryVisibility = (id: string) => setVisibleCategories((p) => ({ ...p, [id]: !p[id] }))
51
  const selectAll = () => {
52
  const map: Record<string, boolean> = {}
53
- CATEGORIES.forEach((c) => (map[c.id] = true))
54
  setVisibleCategories(map)
55
  }
56
  const deselectAll = () => {
57
  const map: Record<string, boolean> = {}
58
- CATEGORIES.forEach((c) => (map[c.id] = false))
59
  setVisibleCategories(map)
60
  }
61
 
@@ -77,7 +76,7 @@ export default function EvaluationDetailsPage() {
77
  if (evaluation) {
78
  const init: Record<string, boolean> = {}
79
  // default: eyes open (visible) for all categories unless explicitly NA
80
- CATEGORIES.forEach((c) => {
81
  const na = naReasonForCategory(c.id)
82
  init[c.id] = !na
83
  })
@@ -90,11 +89,47 @@ export default function EvaluationDetailsPage() {
90
  const naReasonForCategory = (categoryId: string): string | undefined => {
91
  const catEval = evaluation?.categoryEvaluations?.[categoryId]
92
  if (!catEval) return undefined
93
- const benchmarkQs = BENCHMARK_QUESTIONS.map((q) => q.id)
94
- const processQs = PROCESS_QUESTIONS.map((q) => q.id)
95
  return naReasonForCategoryFromEval(catEval, benchmarkQs, processQs)
96
  }
97
 
98
  useEffect(() => {
99
  try {
100
  localStorage.setItem(STORAGE_KEY, JSON.stringify(visibleCategories))
@@ -197,7 +232,7 @@ export default function EvaluationDetailsPage() {
197
  {/* compute applicable count as non-NA categories from the full CATEGORIES list */}
198
  <CardTitle>
199
  Applicable Categories ({
200
- CATEGORIES.filter((c) => {
201
  const sel = new Set(evaluation.selectedCategories || [])
202
  // treat as applicable only when selected and not explicitly NA
203
  const na = naReasonForCategory(c.id)
@@ -220,7 +255,7 @@ export default function EvaluationDetailsPage() {
220
  <div>
221
  <div className="text-sm font-medium mb-2">Capabilities</div>
222
  <div className="flex flex-col gap-2">
223
- {CATEGORIES.filter((c: any) => c.type === "capability").map((category: any) => {
224
  const sel = new Set(evaluation.selectedCategories || [])
225
  const naReason = naReasonForCategory(category.id)
226
  const isSelected = sel.has(category.id)
@@ -256,14 +291,14 @@ export default function EvaluationDetailsPage() {
256
  )}
257
  <span className="ml-2">
258
  {isNA ? (
259
- <span className="inline-flex items-center rounded-md px-2 py-0.5 text-xs font-medium bg-muted/30 text-muted-foreground border-2 border-purple-500">
260
- {category.name}
261
- </span>
262
- ) : (
263
- <Badge variant="secondary" className="cursor-pointer">
264
- {category.name}
265
- </Badge>
266
- )}
267
  </span>
268
  </label>
269
  )
@@ -274,7 +309,7 @@ export default function EvaluationDetailsPage() {
274
  <div>
275
  <div className="text-sm font-medium mb-2">Risks</div>
276
  <div className="flex flex-col gap-2">
277
- {CATEGORIES.filter((c: any) => c.type === "risk").map((category: any) => {
278
  const sel = new Set(evaluation.selectedCategories || [])
279
  const naReason = naReasonForCategory(category.id)
280
  const isSelected = sel.has(category.id)
@@ -311,11 +346,11 @@ export default function EvaluationDetailsPage() {
311
  <span className="ml-2">
312
  {isNA ? (
313
  <span className="inline-flex items-center rounded-md px-2 py-0.5 text-xs font-medium bg-muted/30 text-muted-foreground border-2 border-red-500">
314
- {category.name}
315
  </span>
316
  ) : (
317
  <Badge variant="destructive" className="cursor-pointer">
318
- {category.name}
319
  </Badge>
320
  )}
321
  </span>
@@ -337,25 +372,25 @@ export default function EvaluationDetailsPage() {
337
  <div className="grid grid-cols-2 md:grid-cols-4 gap-4">
338
  <div className="text-center p-4 bg-green-50 dark:bg-green-950 rounded-lg">
339
  <div className="text-2xl font-bold text-green-700 dark:text-green-300">
340
- {evaluation.overallStats?.strongCategories?.length || 0}
341
  </div>
342
  <div className="text-sm text-green-600 dark:text-green-400">Strong</div>
343
  </div>
344
  <div className="text-center p-4 bg-blue-50 dark:bg-blue-950 rounded-lg">
345
  <div className="text-2xl font-bold text-blue-700 dark:text-blue-300">
346
- {evaluation.overallStats?.adequateCategories?.length || 0}
347
  </div>
348
  <div className="text-sm text-blue-600 dark:text-blue-400">Adequate</div>
349
  </div>
350
  <div className="text-center p-4 bg-yellow-50 dark:bg-yellow-950 rounded-lg">
351
  <div className="text-2xl font-bold text-yellow-700 dark:text-yellow-300">
352
- {evaluation.overallStats?.weakCategories?.length || 0}
353
  </div>
354
  <div className="text-sm text-yellow-600 dark:text-yellow-400">Weak</div>
355
  </div>
356
  <div className="text-center p-4 bg-red-50 dark:bg-red-950 rounded-lg">
357
  <div className="text-2xl font-bold text-red-700 dark:text-red-300">
358
- {evaluation.overallStats?.insufficientCategories?.length || 0}
359
  </div>
360
  <div className="text-sm text-red-600 dark:text-red-400">Insufficient</div>
361
  </div>
@@ -374,7 +409,7 @@ export default function EvaluationDetailsPage() {
374
  {[...(evaluation.overallStats?.insufficientCategories || []), ...(evaluation.overallStats?.weakCategories || [])]
375
  .filter(Boolean)
376
  .map((catId: string) => {
377
- const category = CATEGORIES.find((c) => c.id === catId)
378
  return (
379
  <div key={catId} className="p-3 border rounded-md flex items-center justify-between">
380
  <div>
@@ -400,17 +435,17 @@ export default function EvaluationDetailsPage() {
400
  const na = naReasonForCategory(categoryId)
401
  return !na && (visibleCategories[categoryId] ?? true)
402
  })
403
- .map(([categoryId, data]: [string, any]) => {
404
- const category = CATEGORIES.find((c) => c.id === categoryId)
405
 
406
  // compute per-category score (yes out of applicable (yes+no)) across A & B
407
- const benchmarkQs = BENCHMARK_QUESTIONS.map((q) => q.id)
408
- const processQs = PROCESS_QUESTIONS.map((q) => q.id)
409
  let yesCount = 0
410
  let noCount = 0
411
  let naCount = 0
412
 
413
- for (const qid of benchmarkQs) {
414
  const raw = data.benchmarkAnswers?.[qid]
415
  const answers = Array.isArray(raw) ? raw : raw ? [raw] : []
416
  const hasYes = answers.some((a: string) => String(a).toLowerCase() === "yes")
@@ -422,7 +457,7 @@ export default function EvaluationDetailsPage() {
422
  else naCount++
423
  }
424
 
425
- for (const qid of processQs) {
426
  const raw = data.processAnswers?.[qid]
427
  const answers = Array.isArray(raw) ? raw : raw ? [raw] : []
428
  const hasYes = answers.some((a: string) => String(a).toLowerCase() === "yes")
@@ -437,10 +472,10 @@ export default function EvaluationDetailsPage() {
437
  const totalApplicable = yesCount + noCount
438
  const scoreText = totalApplicable > 0 ? `${yesCount}/${totalApplicable}` : "N/A"
439
  let rating = "Unknown"
440
- if (evaluation.overallStats?.strongCategories?.includes(categoryId)) rating = "Strong"
441
- else if (evaluation.overallStats?.adequateCategories?.includes(categoryId)) rating = "Adequate"
442
- else if (evaluation.overallStats?.weakCategories?.includes(categoryId)) rating = "Weak"
443
- else if (evaluation.overallStats?.insufficientCategories?.includes(categoryId)) rating = "Insufficient"
444
 
445
  const ratingClass =
446
  rating === "Strong"
@@ -480,183 +515,128 @@ export default function EvaluationDetailsPage() {
480
  <div>
481
  <h4 className="font-semibold mb-3">Part A: Benchmark & Testing</h4>
482
  <div className="space-y-4">
483
- {(() => {
484
- const entries = Object.entries(data.benchmarkSources || {}) as [string, any][]
485
- const yesItems: any[] = []
486
- const noItems: any[] = []
487
- const naItems: any[] = []
488
-
489
- // iterate the union of known source keys and answer keys so we show questions
490
- const canonicalKeys = BENCHMARK_QUESTIONS.map((q) => q.id)
491
- const answerKeys = Object.keys(data.benchmarkAnswers || {})
492
- const sourceKeys = Object.keys(data.benchmarkSources || {})
493
- const keySet = new Set<string>([...canonicalKeys, ...answerKeys, ...sourceKeys])
494
- for (const questionId of Array.from(keySet)) {
495
- const sources = data.benchmarkSources?.[questionId] || []
496
- const qText = BENCHMARK_QUESTIONS.find((x) => x.id === questionId)?.text || questionId
497
- const rawAnswer = data.benchmarkAnswers?.[questionId]
498
- const answers = Array.isArray(rawAnswer) ? rawAnswer : rawAnswer ? [rawAnswer] : []
499
- const hasYes = answers.some((a: string) => String(a).toLowerCase() === "yes")
500
- const hasNo = answers.some((a: string) => String(a).toLowerCase() === "no")
501
- const hasNA = answers.length === 0 || answers.some((a: string) => String(a).toLowerCase().includes("not applicable") || String(a).toLowerCase() === "n/a")
502
-
503
- const reason =
504
- sources?.[0]?.scope || sources?.[0]?.description || data.additionalAspects || (hasNA ? "Not applicable" : undefined)
505
-
506
- if (hasYes) yesItems.push({ questionId, qText, sources })
507
- else if (hasNo) noItems.push({ questionId, qText })
508
- else if (hasNA) naItems.push({ questionId, qText, reason })
509
- else naItems.push({ questionId, qText, reason: reason || "Not applicable" })
510
- }
511
 
512
  return (
513
- <>
514
- {yesItems.map((it) => {
515
- const key = `bench-${categoryId}-${it.questionId}`
516
- return (
517
- <div key={it.questionId} className="border rounded-lg p-4">
518
- <div
519
- role="button"
520
- tabIndex={0}
521
- onClick={() => toggleNegatives(key)}
522
- className="flex items-center gap-2 mb-2 justify-between cursor-pointer"
523
- >
524
- <div className="flex items-center gap-3">
525
- <span className="font-medium">{it.questionId}:</span>
526
- <div className="text-sm">{it.qText}</div>
527
- </div>
528
- <div className="flex items-center gap-2">
529
- <span className="inline-flex items-center rounded-md px-2 py-0.5 text-xs font-medium bg-green-100 text-green-700">yes</span>
530
- </div>
531
- </div>
532
-
533
- {expandedNegatives[key] && (() => {
534
- const cards = (it.sources || []).flatMap((src: any) => {
535
- const names = String(src.benchmarkName || '')
536
- .split(',')
537
- .map((s: string) => s.trim())
538
- .filter(Boolean)
539
-
540
- const scoreParts = String(src.score || '')
541
- .split(',')
542
- .map((s: string) => s.trim())
543
- .filter(Boolean)
544
-
545
- return (names.length > 0 ? names : ['Benchmark']).map((name: string, idx: number) => {
546
- // determine score for this benchmark (positional or by name) or fallback to any numeric
547
- let scoreNum: number | undefined
548
- if (scoreParts.length === names.length && scoreParts[idx]) {
549
- const m = scoreParts[idx].match(/(\d+(?:\.\d+)?)/)
550
- if (m) scoreNum = parseFloat(m[1])
551
- } else if (scoreParts.length > 0) {
552
- const byName = scoreParts.find((p: string) => p.toLowerCase().includes(name.toLowerCase()))
553
- const m = (byName || scoreParts[0]).match(/(\d+(?:\.\d+)?)/)
554
- if (m) scoreNum = parseFloat(m[1])
555
- } else if (src?.score) {
556
- const m = String(src.score).match(/(\d+(?:\.\d+)?)/)
557
- if (m) scoreNum = parseFloat(m[1])
558
- }
559
-
560
- return (
561
- <div key={`${it.questionId}-${name}-${idx}`} className="p-4 border rounded-lg bg-background">
562
- <div className="flex items-start justify-between">
563
- <div className="text-xs inline-flex items-center rounded-full px-2 py-1 bg-indigo-50 text-indigo-700">Percentage</div>
564
- <div className="text-2xl font-bold text-indigo-600">{scoreNum != null ? `${scoreNum}%` : '—'}</div>
565
- </div>
566
-
567
- <div className="mt-3 text-lg font-semibold">{name}</div>
568
-
569
- {scoreNum != null && (
570
- <div className="mt-3">
571
- <div className="h-2 bg-gray-200 rounded-full overflow-hidden">
572
- <div className="h-2 bg-indigo-500" style={{ width: `${Math.max(0, Math.min(100, scoreNum))}%` }} />
573
- </div>
574
- </div>
575
- )}
576
-
577
- <div className="mt-3 space-y-2 text-sm">
578
- <div>
579
- <span className="text-muted-foreground">Source:</span>{' '}
580
- {src.url ? (
581
- <a className="text-primary underline" href={src.url} target="_blank" rel="noreferrer">
582
- {src.url}
583
- </a>
584
- ) : (
585
- '—'
586
- )}
587
- </div>
588
- <div>
589
- <span className="text-muted-foreground">Type:</span> {src.sourceType || src.documentType || '—'}
590
- </div>
591
- {src.metrics && (
592
- <div>
593
- <span className="text-muted-foreground">Metric:</span> {src.metrics}
594
- </div>
595
- )}
596
- {src.confidenceInterval && (
597
- <div>
598
- <span className="text-muted-foreground">Confidence Interval:</span> {src.confidenceInterval}
599
- </div>
600
- )}
601
- {src.description && (
602
- <div className="mt-2 p-2 bg-muted/40 rounded text-sm">{src.description}</div>
603
- )}
604
- </div>
605
- </div>
606
- )
607
- })
608
- })
609
-
610
- if (cards.length === 0) return <div className="text-sm text-muted-foreground">No benchmark details available.</div>
611
-
612
- return <div className="mt-3 grid grid-cols-1 md:grid-cols-2 lg:grid-cols-3 gap-4">{cards}</div>
613
- })()}
614
- </div>
615
- )
616
- })}
617
-
618
- {noItems.map((it) => (
619
- <div key={it.questionId} className="border rounded-lg p-4">
620
- <div className="flex items-center gap-2 mb-2 justify-between">
621
- <div className="flex items-center gap-3">
622
- <span className="font-medium">{it.questionId}:</span>
623
- <div className="text-sm">{it.qText}</div>
624
- </div>
625
- <div>
626
- <span className="inline-flex items-center rounded-md px-2 py-0.5 text-xs font-medium bg-red-100 text-red-700">no</span>
627
- </div>
628
- </div>
629
  </div>
630
- ))}
631
-
632
- {naItems.length > 0 && (
633
- <div className="border rounded-lg p-3">
634
- <div className="flex items-center justify-between">
635
- <div className="font-medium">Not applicable ({naItems.length})</div>
636
- <button onClick={() => toggleNegatives(`bench-na-${categoryId}`)} className="text-sm text-primary underline">
637
- {expandedNegatives[`bench-na-${categoryId}`] ? "Hide" : "Show"}
638
- </button>
639
- </div>
640
-
641
- {expandedNegatives[`bench-na-${categoryId}`] && (
642
- <div className="mt-3 space-y-2">
643
- {naItems.map((it) => (
644
- <div key={it.questionId} className="p-2 bg-muted rounded">
645
- <div className="flex items-center justify-between">
646
- <div className="text-sm">
647
- <span className="font-medium">{it.questionId}:</span> {it.qText}
648
  </div>
649
- <div className="text-xs text-muted-foreground">Reason: {it.reason}</div>
650
  </div>
651
  </div>
652
- ))}
653
- </div>
654
- )}
655
- </div>
656
  )}
657
- </>
658
  )
659
- })()}
660
  </div>
661
  </div>
662
  )}
@@ -666,121 +646,69 @@ export default function EvaluationDetailsPage() {
666
  <div>
667
  <h4 className="font-semibold mb-3">Part B: Documentation & Process</h4>
668
  <div className="space-y-4">
669
- {(() => {
670
- const entries = Object.entries(data.processSources || {}) as [string, any][]
671
- const yesItems: any[] = []
672
- const noItems: any[] = []
673
- const naItems: any[] = []
674
-
675
- const canonicalKeys = PROCESS_QUESTIONS.map((q) => q.id)
676
- const answerKeys = Object.keys(data.processAnswers || {})
677
- const sourceKeys = Object.keys(data.processSources || {})
678
- const keySet = new Set<string>([...canonicalKeys, ...answerKeys, ...sourceKeys])
679
- for (const questionId of Array.from(keySet)) {
680
- const sources = data.processSources?.[questionId] || []
681
- const qText = PROCESS_QUESTIONS.find((x) => x.id === questionId)?.text || questionId
682
- const rawAnswer = data.processAnswers?.[questionId]
683
- const answers = Array.isArray(rawAnswer) ? rawAnswer : rawAnswer ? [rawAnswer] : []
684
- const hasYes = answers.some((a: string) => String(a).toLowerCase() === "yes")
685
- const hasNo = answers.some((a: string) => String(a).toLowerCase() === "no")
686
- const hasNA = answers.length === 0 || answers.some((a: string) => String(a).toLowerCase().includes("not applicable") || String(a).toLowerCase() === "n/a")
687
-
688
- const reason = sources?.[0]?.scope || sources?.[0]?.description || data.additionalAspects || (hasNA ? "Not applicable" : undefined)
689
-
690
- if (hasYes) yesItems.push({ questionId, qText, sources })
691
- else if (hasNo) noItems.push({ questionId, qText })
692
- else if (hasNA) naItems.push({ questionId, qText, reason })
693
- else naItems.push({ questionId, qText, reason: reason || "Not applicable" })
694
- }
695
 
696
  return (
697
- <>
698
- {yesItems.map((it) => {
699
- const key = `proc-${categoryId}-${it.questionId}`
700
- return (
701
- <div key={it.questionId} className="border rounded-lg p-4">
702
- <div
703
- role="button"
704
- tabIndex={0}
705
- onClick={() => toggleNegatives(key)}
706
- className="flex items-center gap-2 mb-2 justify-between cursor-pointer"
707
- >
708
- <div className="flex items-center gap-3">
709
- <span className="font-medium">{it.questionId}:</span>
710
- <div className="text-sm">{it.qText}</div>
711
- </div>
712
- <div className="flex items-center gap-2">
713
- <span className="inline-flex items-center rounded-md px-2 py-0.5 text-xs font-medium bg-green-100 text-green-700">yes</span>
714
  </div>
715
- </div>
716
-
717
- {expandedNegatives[key] && (
718
- <div className="mt-3 space-y-3">
719
- {(it.sources || []).map((src: any, i: number) => (
720
- <div key={i} className="p-3 bg-muted rounded">
721
- <div className="grid grid-cols-1 gap-2 text-sm">
722
- <div>
723
- <span className="text-muted-foreground">URL:</span> {src?.url || '—'}
724
- </div>
725
- <div>
726
- <span className="text-muted-foreground">Document Type:</span> {src?.documentType || src?.sourceType || '—'}
727
- </div>
728
- </div>
729
- {src?.description && (
730
- <div className="mt-2 text-sm">
731
- <span className="text-muted-foreground">Description:</span> {src.description}
732
- </div>
733
- )}
734
- </div>
735
- ))}
736
  </div>
737
- )}
738
- </div>
739
- )
740
- })}
741
-
742
- {noItems.map((it) => (
743
- <div key={it.questionId} className="border rounded-lg p-4">
744
- <div className="flex items-center gap-2 mb-2 justify-between">
745
- <div className="flex items-center gap-3">
746
- <span className="font-medium">{it.questionId}:</span>
747
- <div className="text-sm">{it.qText}</div>
748
- </div>
749
- <div>
750
- <span className="inline-flex items-center rounded-md px-2 py-0.5 text-xs font-medium bg-red-100 text-red-700">no</span>
751
- </div>
752
- </div>
753
- </div>
754
- ))}
755
-
756
- {naItems.length > 0 && (
757
- <div className="border rounded-lg p-3">
758
- <div className="flex items-center justify-between">
759
- <div className="font-medium">Not applicable ({naItems.length})</div>
760
- <button onClick={() => toggleNegatives(`proc-na-${categoryId}`)} className="text-sm text-primary underline">
761
- {expandedNegatives[`proc-na-${categoryId}`] ? "Hide" : "Show"}
762
- </button>
763
- </div>
764
-
765
- {expandedNegatives[`proc-na-${categoryId}`] && (
766
- <div className="mt-3 space-y-2">
767
- {naItems.map((it) => (
768
- <div key={it.questionId} className="p-2 bg-muted rounded">
769
- <div className="flex items-center justify-between">
770
- <div className="text-sm">
771
- <span className="font-medium">{it.questionId}:</span> {it.qText}
772
- </div>
773
- <div className="text-xs text-muted-foreground">Reason: {it.reason}</div>
774
- </div>
775
  </div>
776
- ))}
777
  </div>
778
- )}
779
  </div>
780
  )}
781
- </>
782
  )
783
- })()}
784
  </div>
785
  </div>
786
  )}
 
6
  import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card"
7
  import { Badge } from "@/components/ui/badge"
8
  import { ArrowLeft, Download, Eye, EyeOff } from "lucide-react"
9
+ import { getAllCategories, getCategoryById, getBenchmarkQuestions, getProcessQuestions } from "@/lib/schema"
10
  import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from "@/components/ui/tooltip"
 
11
  import { naReasonForCategoryFromEval } from "@/lib/na-utils"
12
 
13
  const loadEvaluationDetails = async (id: string) => {
 
49
  const toggleCategoryVisibility = (id: string) => setVisibleCategories((p) => ({ ...p, [id]: !p[id] }))
50
  const selectAll = () => {
51
  const map: Record<string, boolean> = {}
52
+ getAllCategories().forEach((c) => (map[c.id] = true))
53
  setVisibleCategories(map)
54
  }
55
  const deselectAll = () => {
56
  const map: Record<string, boolean> = {}
57
+ getAllCategories().forEach((c) => (map[c.id] = false))
58
  setVisibleCategories(map)
59
  }
60
 
 
76
  if (evaluation) {
77
  const init: Record<string, boolean> = {}
78
  // default: eyes open (visible) for all categories unless explicitly NA
79
+ getAllCategories().forEach((c) => {
80
  const na = naReasonForCategory(c.id)
81
  init[c.id] = !na
82
  })
 
89
  const naReasonForCategory = (categoryId: string): string | undefined => {
90
  const catEval = evaluation?.categoryEvaluations?.[categoryId]
91
  if (!catEval) return undefined
92
+ const benchmarkQs = getBenchmarkQuestions().map((q) => q.id)
93
+ const processQs = getProcessQuestions().map((q) => q.id)
94
  return naReasonForCategoryFromEval(catEval, benchmarkQs, processQs)
95
  }
96
 
97
+ // Compute overall stats from evaluation data dynamically
98
+ const computedStats = (() => {
99
+ const strongCategories: string[] = []
100
+ const adequateCategories: string[] = []
101
+ const weakCategories: string[] = []
102
+ const insufficientCategories: string[] = []
103
+
104
+ const allEntries = evaluation?.categoryEvaluations || {}
105
+ for (const [catId, catData] of Object.entries(allEntries)) {
106
+ // Count yes/no across A & B
107
+ let yes = 0
108
+ let no = 0
109
+ let na = 0
110
+ const bQs = getBenchmarkQuestions().map((q) => q.id)
111
+ const pQs = getProcessQuestions().map((q) => q.id)
112
+ for (const qid of [...bQs, ...pQs]) {
113
+ const raw = (catData as any).benchmarkAnswers?.[qid] ?? (catData as any).processAnswers?.[qid]
114
+ const arr = Array.isArray(raw) ? raw : raw ? [raw] : []
115
+ const hasYes = arr.some((a: string) => String(a).toLowerCase() === "yes")
116
+ const hasNo = arr.some((a: string) => String(a).toLowerCase() === "no")
117
+ if (hasYes) yes++
118
+ else if (hasNo) no++
119
+ else na++
120
+ }
121
+
122
+ const totalApplicable = yes + no
123
+ const ratio = totalApplicable > 0 ? yes / totalApplicable : 0
124
+ if (ratio >= 0.8) strongCategories.push(catId)
125
+ else if (ratio >= 0.6) adequateCategories.push(catId)
126
+ else if (ratio >= 0.4) weakCategories.push(catId)
127
+ else insufficientCategories.push(catId)
128
+ }
129
+
130
+ return { strongCategories, adequateCategories, weakCategories, insufficientCategories }
131
+ })()
132
+
133
  useEffect(() => {
134
  try {
135
  localStorage.setItem(STORAGE_KEY, JSON.stringify(visibleCategories))
 
232
  {/* compute applicable count as non-NA categories from the full CATEGORIES list */}
233
  <CardTitle>
234
  Applicable Categories ({
235
+ getAllCategories().filter((c) => {
236
  const sel = new Set(evaluation.selectedCategories || [])
237
  // treat as applicable only when selected and not explicitly NA
238
  const na = naReasonForCategory(c.id)
 
255
  <div>
256
  <div className="text-sm font-medium mb-2">Capabilities</div>
257
  <div className="flex flex-col gap-2">
258
+ {getAllCategories().filter((c) => c.type === "capability").map((category) => {
259
  const sel = new Set(evaluation.selectedCategories || [])
260
  const naReason = naReasonForCategory(category.id)
261
  const isSelected = sel.has(category.id)
 
291
  )}
292
  <span className="ml-2">
293
  {isNA ? (
294
+ <span className="inline-flex items-center rounded-md px-2 py-0.5 text-xs font-medium bg-muted/30 text-muted-foreground border-2 border-purple-500">
295
+ {getCategoryById(category.id)?.name || category.name}
296
+ </span>
297
+ ) : (
298
+ <Badge variant="secondary" className="cursor-pointer">
299
+ {getCategoryById(category.id)?.name || category.name}
300
+ </Badge>
301
+ )}
302
  </span>
303
  </label>
304
  )
 
309
  <div>
310
  <div className="text-sm font-medium mb-2">Risks</div>
311
  <div className="flex flex-col gap-2">
312
+ {getAllCategories().filter((c) => c.type === "risk").map((category) => {
313
  const sel = new Set(evaluation.selectedCategories || [])
314
  const naReason = naReasonForCategory(category.id)
315
  const isSelected = sel.has(category.id)
 
346
  <span className="ml-2">
347
  {isNA ? (
348
  <span className="inline-flex items-center rounded-md px-2 py-0.5 text-xs font-medium bg-muted/30 text-muted-foreground border-2 border-red-500">
349
+ {getCategoryById(category.id)?.name || category.name}
350
  </span>
351
  ) : (
352
  <Badge variant="destructive" className="cursor-pointer">
353
+ {getCategoryById(category.id)?.name || category.name}
354
  </Badge>
355
  )}
356
  </span>
 
372
  <div className="grid grid-cols-2 md:grid-cols-4 gap-4">
373
  <div className="text-center p-4 bg-green-50 dark:bg-green-950 rounded-lg">
374
  <div className="text-2xl font-bold text-green-700 dark:text-green-300">
375
+ {computedStats.strongCategories.length}
376
  </div>
377
  <div className="text-sm text-green-600 dark:text-green-400">Strong</div>
378
  </div>
379
  <div className="text-center p-4 bg-blue-50 dark:bg-blue-950 rounded-lg">
380
  <div className="text-2xl font-bold text-blue-700 dark:text-blue-300">
381
+ {computedStats.adequateCategories.length}
382
  </div>
383
  <div className="text-sm text-blue-600 dark:text-blue-400">Adequate</div>
384
  </div>
385
  <div className="text-center p-4 bg-yellow-50 dark:bg-yellow-950 rounded-lg">
386
  <div className="text-2xl font-bold text-yellow-700 dark:text-yellow-300">
387
+ {computedStats.weakCategories.length}
388
  </div>
389
  <div className="text-sm text-yellow-600 dark:text-yellow-400">Weak</div>
390
  </div>
391
  <div className="text-center p-4 bg-red-50 dark:bg-red-950 rounded-lg">
392
  <div className="text-2xl font-bold text-red-700 dark:text-red-300">
393
+ {computedStats.insufficientCategories.length}
394
  </div>
395
  <div className="text-sm text-red-600 dark:text-red-400">Insufficient</div>
396
  </div>
 
409
  {[...(evaluation.overallStats?.insufficientCategories || []), ...(evaluation.overallStats?.weakCategories || [])]
410
  .filter(Boolean)
411
  .map((catId: string) => {
412
+ const category = getCategoryById(catId)
413
  return (
414
  <div key={catId} className="p-3 border rounded-md flex items-center justify-between">
415
  <div>
 
435
  const na = naReasonForCategory(categoryId)
436
  return !na && (visibleCategories[categoryId] ?? true)
437
  })
438
+ .map(([categoryId, data]: [string, any]) => {
439
+ const category = getCategoryById(categoryId)
440
 
441
  // compute per-category score (yes out of applicable (yes+no)) across A & B
442
+ const benchmarkQs = getBenchmarkQuestions().map((q) => q.id)
443
+ const processQs = getProcessQuestions().map((q) => q.id)
444
  let yesCount = 0
445
  let noCount = 0
446
  let naCount = 0
447
 
448
+ for (const qid of getBenchmarkQuestions().map((q) => q.id)) {
449
  const raw = data.benchmarkAnswers?.[qid]
450
  const answers = Array.isArray(raw) ? raw : raw ? [raw] : []
451
  const hasYes = answers.some((a: string) => String(a).toLowerCase() === "yes")
 
457
  else naCount++
458
  }
459
 
460
+ for (const qid of getProcessQuestions().map((q) => q.id)) {
461
  const raw = data.processAnswers?.[qid]
462
  const answers = Array.isArray(raw) ? raw : raw ? [raw] : []
463
  const hasYes = answers.some((a: string) => String(a).toLowerCase() === "yes")
 
472
  const totalApplicable = yesCount + noCount
473
  const scoreText = totalApplicable > 0 ? `${yesCount}/${totalApplicable}` : "N/A"
474
  let rating = "Unknown"
475
+ if (computedStats.strongCategories.includes(categoryId)) rating = "Strong"
476
+ else if (computedStats.adequateCategories.includes(categoryId)) rating = "Adequate"
477
+ else if (computedStats.weakCategories.includes(categoryId)) rating = "Weak"
478
+ else if (computedStats.insufficientCategories.includes(categoryId)) rating = "Insufficient"
479
 
480
  const ratingClass =
481
  rating === "Strong"
 
515
  <div>
516
  <h4 className="font-semibold mb-3">Part A: Benchmark & Testing</h4>
517
  <div className="space-y-4">
518
+ {getBenchmarkQuestions().map((q) => {
519
+ const questionId = q.id
520
+ const sources = data.benchmarkSources?.[questionId] || []
521
+ const qText = q.text
522
+ const rawAnswer = data.benchmarkAnswers?.[questionId]
523
+ const answers = Array.isArray(rawAnswer) ? rawAnswer : rawAnswer ? [rawAnswer] : []
524
+ const hasYes = answers.some((a: string) => String(a).toLowerCase() === "yes")
525
+ const hasNo = answers.some((a: string) => String(a).toLowerCase() === "no")
526
+ const hasNA = answers.length === 0 || answers.some((a: string) => String(a).toLowerCase().includes("not applicable") || String(a).toLowerCase() === "n/a")
527
+
528
+ const key = `bench-${categoryId}-${questionId}`
529
 
530
  return (
531
+ <div key={questionId} className="border rounded-lg p-4">
532
+ <div
533
+ role="button"
534
+ tabIndex={0}
535
+ onClick={() => toggleNegatives(key)}
536
+ className="flex items-center gap-2 mb-2 justify-between cursor-pointer"
537
+ >
538
+ <div className="flex items-center gap-3">
539
+ <span className="font-medium">{questionId}:</span>
540
+ <div className="text-sm">{qText}</div>
541
  </div>
542
+ <div className="flex items-center gap-2">
543
+ {hasYes ? (
544
+ <span className="inline-flex items-center rounded-md px-2 py-0.5 text-xs font-medium bg-green-100 text-green-700">yes</span>
545
+ ) : hasNo ? (
546
+ <span className="inline-flex items-center rounded-md px-2 py-0.5 text-xs font-medium bg-red-100 text-red-700">no</span>
547
+ ) : (
548
+ <span className="inline-flex items-center rounded-md px-2 py-0.5 text-xs font-medium bg-muted/20 text-muted-foreground">n/a</span>
549
+ )}
550
+ </div>
551
+ </div>
552
+
553
+ {hasYes && expandedNegatives[key] && (() => {
554
+ const cards = (sources || []).flatMap((src: any) => {
555
+ const names = String(src.benchmarkName || '')
556
+ .split(',')
557
+ .map((s: string) => s.trim())
558
+ .filter(Boolean)
559
+
560
+ const scoreParts = String(src.score || '')
561
+ .split(',')
562
+ .map((s: string) => s.trim())
563
+ .filter(Boolean)
564
+
565
+ return (names.length > 0 ? names : ['Benchmark']).map((name: string, idx: number) => {
566
+ let scoreNum: number | undefined
567
+ if (scoreParts.length === names.length && scoreParts[idx]) {
568
+ const m = scoreParts[idx].match(/(\d+(?:\.\d+)?)/)
569
+ if (m) scoreNum = parseFloat(m[1])
570
+ } else if (scoreParts.length > 0) {
571
+ const byName = scoreParts.find((p: string) => p.toLowerCase().includes(name.toLowerCase()))
572
+ const m = (byName || scoreParts[0]).match(/(\d+(?:\.\d+)?)/)
573
+ if (m) scoreNum = parseFloat(m[1])
574
+ } else if (src?.score) {
575
+ const m = String(src.score).match(/(\d+(?:\.\d+)?)/)
576
+ if (m) scoreNum = parseFloat(m[1])
577
+ }
578
+
579
+ return (
580
+ <div key={`${questionId}-${name}-${idx}`} className="p-4 border rounded-lg bg-background">
581
+ <div className="flex items-start justify-between">
582
+ <div className="text-xs inline-flex items-center rounded-full px-2 py-1 bg-indigo-50 text-indigo-700">Percentage</div>
583
+ <div className="text-2xl font-bold text-indigo-600">{scoreNum != null ? `${scoreNum}%` : '—'}</div>
584
+ </div>
585
+
586
+ <div className="mt-3 text-lg font-semibold">{name}</div>
587
+
588
+ {scoreNum != null && (
589
+ <div className="mt-3">
590
+ <div className="h-2 bg-gray-200 rounded-full overflow-hidden">
591
+ <div className="h-2 bg-indigo-500" style={{ width: `${Math.max(0, Math.min(100, scoreNum))}%` }} />
592
  </div>
 
593
  </div>
594
+ )}
595
+
596
+ <div className="mt-3 space-y-2 text-sm">
597
+ <div>
598
+ <span className="text-muted-foreground">Source:</span>{' '}
599
+ {src.url ? (
600
+ <a className="text-primary underline" href={src.url} target="_blank" rel="noreferrer">
601
+ {src.url}
602
+ </a>
603
+ ) : (
604
+ '—'
605
+ )}
606
+ </div>
607
+ <div>
608
+ <span className="text-muted-foreground">Type:</span> {src.sourceType || src.documentType || '—'}
609
+ </div>
610
+ {src.metrics && (
611
+ <div>
612
+ <span className="text-muted-foreground">Metric:</span> {src.metrics}
613
+ </div>
614
+ )}
615
+ {src.confidenceInterval && (
616
+ <div>
617
+ <span className="text-muted-foreground">Confidence Interval:</span> {src.confidenceInterval}
618
+ </div>
619
+ )}
620
+ {src.description && (
621
+ <div className="mt-2 p-2 bg-muted/40 rounded text-sm">{src.description}</div>
622
+ )}
623
  </div>
624
+ </div>
625
+ )
626
+ })
627
+ })
628
+
629
+ if (cards.length === 0) return <div className="text-sm text-muted-foreground">No benchmark details available.</div>
630
+
631
+ return <div className="mt-3 grid grid-cols-1 md:grid-cols-2 lg:grid-cols-3 gap-4">{cards}</div>
632
+ })()}
633
+
634
+ {hasNA && (
635
+ <div className="mt-2 text-sm text-muted-foreground">Reason: {sources?.[0]?.description || data.additionalAspects || 'Not applicable'}</div>
636
  )}
637
+ </div>
638
  )
639
+ })}
640
  </div>
641
  </div>
642
  )}
 
646
  <div>
647
  <h4 className="font-semibold mb-3">Part B: Documentation & Process</h4>
648
  <div className="space-y-4">
649
+ {getProcessQuestions().map((q) => {
650
+ const questionId = q.id
651
+ const sources = data.processSources?.[questionId] || []
652
+ const qText = q.text
653
+ const rawAnswer = data.processAnswers?.[questionId]
654
+ const answers = Array.isArray(rawAnswer) ? rawAnswer : rawAnswer ? [rawAnswer] : []
655
+ const hasYes = answers.some((a: string) => String(a).toLowerCase() === "yes")
656
+ const hasNo = answers.some((a: string) => String(a).toLowerCase() === "no")
657
+ const hasNA = answers.length === 0 || answers.some((a: string) => String(a).toLowerCase().includes("not applicable") || String(a).toLowerCase() === "n/a")
658
+
659
+ const key = `proc-${categoryId}-${questionId}`
660
 
661
  return (
662
+ <div key={questionId} className="border rounded-lg p-4">
663
+ <div
664
+ role="button"
665
+ tabIndex={0}
666
+ onClick={() => toggleNegatives(key)}
667
+ className="flex items-center gap-2 mb-2 justify-between cursor-pointer"
668
+ >
669
+ <div className="flex items-center gap-3">
670
+ <span className="font-medium">{questionId}:</span>
671
+ <div className="text-sm">{qText}</div>
672
+ </div>
673
+ <div className="flex items-center gap-2">
674
+ {hasYes ? (
675
+ <span className="inline-flex items-center rounded-md px-2 py-0.5 text-xs font-medium bg-green-100 text-green-700">yes</span>
676
+ ) : hasNo ? (
677
+ <span className="inline-flex items-center rounded-md px-2 py-0.5 text-xs font-medium bg-red-100 text-red-700">no</span>
678
+ ) : (
679
+ <span className="inline-flex items-center rounded-md px-2 py-0.5 text-xs font-medium bg-muted/20 text-muted-foreground">n/a</span>
680
+ )}
681
+ </div>
682
+ </div>
683
+
684
+ {hasYes && expandedNegatives[key] && (
685
+ <div className="mt-3 space-y-3">
686
+ {(sources || []).map((src: any, i: number) => (
687
+ <div key={i} className="p-3 bg-muted rounded">
688
+ <div className="grid grid-cols-1 gap-2 text-sm">
689
+ <div>
690
+ <span className="text-muted-foreground">URL:</span> {src?.url || '—'}
691
+ </div>
692
+ <div>
693
+ <span className="text-muted-foreground">Document Type:</span> {src?.documentType || src?.sourceType || '—'}
694
  </div>
695
  </div>
696
+ {src?.description && (
697
+ <div className="mt-2 text-sm">
698
+ <span className="text-muted-foreground">Description:</span> {src.description}
699
  </div>
700
+ )}
701
  </div>
702
+ ))}
703
  </div>
704
  )}
705
+
706
+ {hasNA && (
707
+ <div className="mt-2 text-sm text-muted-foreground">Reason: {sources?.[0]?.description || data.additionalAspects || 'Not applicable'}</div>
708
+ )}
709
+ </div>
710
  )
711
+ })}
712
  </div>
713
  </div>
714
  )}
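For reference, the new computedStats block above derives each category's rating purely from the recorded yes/no answers instead of trusting the stored overallStats. The standalone TypeScript sketch below illustrates that rule with the same 0.8 / 0.6 / 0.4 thresholds as the diff; the sample numbers are illustrative only.

type Rating = "Strong" | "Adequate" | "Weak" | "Insufficient"

// Mirrors computedStats: ratio = yes / (yes + no), bucketed at 0.8 / 0.6 / 0.4.
function rateCategory(yesCount: number, noCount: number): Rating {
  const totalApplicable = yesCount + noCount
  const ratio = totalApplicable > 0 ? yesCount / totalApplicable : 0
  if (ratio >= 0.8) return "Strong"
  if (ratio >= 0.6) return "Adequate"
  if (ratio >= 0.4) return "Weak"
  return "Insufficient"
}

// Example: 7 "yes" out of 9 applicable answers -> 7/9 ≈ 0.78 -> "Adequate"
console.log(rateCategory(7, 2))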
app/page.tsx CHANGED
@@ -6,7 +6,7 @@ import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from "@
6
  import { Plus, Moon, Sun, Filter, ArrowUpDown } from "lucide-react"
7
  import { useTheme } from "next-themes"
8
  import { EvaluationCard, type EvaluationCardData } from "@/components/evaluation-card"
9
- import { BENCHMARK_QUESTIONS, PROCESS_QUESTIONS } from "@/lib/category-data"
10
  import { AIEvaluationDashboard } from "@/components/ai-evaluation-dashboard"
11
 
12
  const loadEvaluationData = async (): Promise<EvaluationCardData[]> => {
@@ -36,7 +36,8 @@ const loadEvaluationData = async (): Promise<EvaluationCardData[]> => {
36
  id: data.id || `eval-${Date.now()}`,
37
  systemName: data.systemName || "Unknown System",
38
  provider: data.provider || "Unknown Provider",
39
- modality: data.modality || "Unknown",
 
40
  completedDate: data.evaluationDate || new Date().toISOString().split("T")[0],
41
  applicableCategories: data.overallStats?.totalApplicable || 0,
42
  completedCategories: data.overallStats?.totalApplicable || 0,
@@ -323,7 +324,7 @@ const loadEvaluationData = async (): Promise<EvaluationCardData[]> => {
323
  const isArray = Array.isArray(answer)
324
  const negative = answer === "no" || (isArray && (answer as any[]).includes("no"))
325
  const positive = answer === "yes" || (isArray && (answer as any[]).includes("yes"))
326
- const qText = BENCHMARK_QUESTIONS.find((x) => x.id === qid)?.text || qid
327
  if (positive) yesList.push(qText)
328
  if (negative) {
329
  const status = naMeta ? "na" : "no"
@@ -339,7 +340,7 @@ const loadEvaluationData = async (): Promise<EvaluationCardData[]> => {
339
  const isArray = Array.isArray(answer)
340
  const negative = answer === "no" || (isArray && (answer as any[]).includes("no"))
341
  const positive = answer === "yes" || (isArray && (answer as any[]).includes("yes"))
342
- const qText = PROCESS_QUESTIONS.find((x) => x.id === qid)?.text || qid
343
  if (positive) yesList.push(qText)
344
  if (negative) {
345
  const status = naMeta ? "na" : "no"
@@ -388,8 +389,12 @@ export default function HomePage() {
388
  }, [evaluationsData])
389
 
390
  const uniqueModalities = useMemo(() => {
391
- const modalities = [...new Set(evaluationsData.map((item) => item.modality))].sort()
392
- return modalities
 
  }, [evaluationsData])
394
 
395
  const filteredAndSortedEvaluations = useMemo(() => {
@@ -400,7 +405,10 @@ export default function HomePage() {
400
  }
401
 
402
  if (filterByModality !== "all") {
403
- filtered = filtered.filter((item) => item.modality === filterByModality)
404
  }
405
 
406
  filtered = [...filtered].sort((a, b) => {
 
6
  import { Plus, Moon, Sun, Filter, ArrowUpDown } from "lucide-react"
7
  import { useTheme } from "next-themes"
8
  import { EvaluationCard, type EvaluationCardData } from "@/components/evaluation-card"
9
+ import { getBenchmarkQuestions, getProcessQuestions } from "@/lib/schema"
10
  import { AIEvaluationDashboard } from "@/components/ai-evaluation-dashboard"
11
 
12
  const loadEvaluationData = async (): Promise<EvaluationCardData[]> => {
 
36
  id: data.id || `eval-${Date.now()}`,
37
  systemName: data.systemName || "Unknown System",
38
  provider: data.provider || "Unknown Provider",
39
+ inputModalities: data.inputModalities || ["Text"],
40
+ outputModalities: data.outputModalities || ["Text"],
41
  completedDate: data.evaluationDate || new Date().toISOString().split("T")[0],
42
  applicableCategories: data.overallStats?.totalApplicable || 0,
43
  completedCategories: data.overallStats?.totalApplicable || 0,
 
324
  const isArray = Array.isArray(answer)
325
  const negative = answer === "no" || (isArray && (answer as any[]).includes("no"))
326
  const positive = answer === "yes" || (isArray && (answer as any[]).includes("yes"))
327
+ const qText = getBenchmarkQuestions().find((x) => x.id === qid)?.text || qid
328
  if (positive) yesList.push(qText)
329
  if (negative) {
330
  const status = naMeta ? "na" : "no"
 
340
  const isArray = Array.isArray(answer)
341
  const negative = answer === "no" || (isArray && (answer as any[]).includes("no"))
342
  const positive = answer === "yes" || (isArray && (answer as any[]).includes("yes"))
343
+ const qText = getProcessQuestions().find((x) => x.id === qid)?.text || qid
344
  if (positive) yesList.push(qText)
345
  if (negative) {
346
  const status = naMeta ? "na" : "no"
 
389
  }, [evaluationsData])
390
 
391
  const uniqueModalities = useMemo(() => {
392
+ const modalities = new Set<string>()
393
+ evaluationsData.forEach((item) => {
394
+ item.inputModalities.forEach((mod) => modalities.add(mod))
395
+ item.outputModalities.forEach((mod) => modalities.add(mod))
396
+ })
397
+ return [...modalities].sort()
398
  }, [evaluationsData])
399
 
400
  const filteredAndSortedEvaluations = useMemo(() => {
 
405
  }
406
 
407
  if (filterByModality !== "all") {
408
+ filtered = filtered.filter((item) =>
409
+ item.inputModalities.includes(filterByModality) ||
410
+ item.outputModalities.includes(filterByModality)
411
+ )
412
  }
413
 
414
  filtered = [...filtered].sort((a, b) => {
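The app/page.tsx changes above drop the single modality string in favour of inputModalities and outputModalities arrays (defaulting to ["Text"]), and the modality filter now matches either direction. Evaluations saved before this commit would still carry only modality; the helper below is a hypothetical sketch of how such a legacy record could be normalized, not code from this commit (the comma-splitting rule is an assumption).

// Hypothetical normalizer for pre-refactor records that only had `modality: string`.
type LegacyEvaluation = {
  modality?: string
  inputModalities?: string[]
  outputModalities?: string[]
}

function normalizeModalities(data: LegacyEvaluation): { inputModalities: string[]; outputModalities: string[] } {
  if (data.inputModalities?.length && data.outputModalities?.length) {
    return { inputModalities: data.inputModalities, outputModalities: data.outputModalities }
  }
  // Split the old single string (e.g. "Text, Image") and fall back to ["Text"], matching the diff's defaults.
  const parsed = (data.modality ?? "").split(",").map((s) => s.trim()).filter(Boolean)
  const modalities = parsed.length > 0 ? parsed : ["Text"]
  return { inputModalities: modalities, outputModalities: modalities }
}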
components/ai-evaluation-dashboard.tsx CHANGED
@@ -10,15 +10,14 @@ import { CategorySelection } from "./category-selection"
10
  import { CategoryEvaluation } from "./category-evaluation"
11
  import { EvaluationForm } from "./evaluation-form"
12
  import { ResultsDashboard } from "./results-dashboard"
13
- import { CATEGORIES } from "@/lib/category-data"
14
 
15
  export type SystemInfo = {
16
  name: string
17
  url: string
18
  provider: string
19
- systemTypes: string[]
20
  deploymentContexts: string[]
21
- modality: string
22
  modelTag?: string
23
  knowledgeCutoff?: string
24
  modelType?: "foundational" | "fine-tuned" | "na"
@@ -42,6 +41,7 @@ export type EvaluationData = {
42
  selectedCategories: string[]
43
  excludedCategoryReasons?: Record<string, string>
44
  categoryScores: Record<string, CategoryScore>
 
45
  currentCategory: string | null
46
  }
47
 
@@ -103,7 +103,7 @@ export function AIEvaluationDashboard({ onBack, onSaveEvaluation }: AIEvaluation
103
  console.log("[v0] Updated categoryScores:", newCategoryScores)
104
  return {
105
  ...prev,
106
- categoryScores: newCategoryScores,
107
  }
108
  })
109
 
@@ -126,6 +126,17 @@ export function AIEvaluationDashboard({ onBack, onSaveEvaluation }: AIEvaluation
126
  }
127
  }
128
 
129
  const handleSaveEvaluation = async () => {
130
  console.log("[v0] handleSaveEvaluation called")
131
  console.log("[v0] evaluationData:", evaluationData)
@@ -142,14 +153,14 @@ export function AIEvaluationDashboard({ onBack, onSaveEvaluation }: AIEvaluation
142
  console.log("[v0] Processing category scores:", evaluationData.categoryScores)
143
 
144
  const capabilityCategories = evaluationData.selectedCategories.filter((cat) => {
145
- const category = CATEGORIES.find((c) => c.id === cat)
146
  console.log("[v0] Category check:", cat, "type:", category?.type)
147
  return category?.type === "capability"
148
  })
149
  console.log("[v0] Capability categories:", capabilityCategories)
150
 
151
  const riskCategories = evaluationData.selectedCategories.filter((cat) => {
152
- const category = CATEGORIES.find((c) => c.id === cat)
153
  return category?.type === "risk"
154
  })
155
  console.log("[v0] Risk categories:", riskCategories)
@@ -184,11 +195,12 @@ export function AIEvaluationDashboard({ onBack, onSaveEvaluation }: AIEvaluation
184
  version: evaluationData.systemInfo.url || "1.0",
185
  deploymentContext: evaluationData.systemInfo.deploymentContexts.join(", ") || "Production",
186
  evaluator: "Current User",
187
- modality: evaluationData.systemInfo.modality,
 
188
  evaluationDate: new Date().toISOString().split("T")[0],
189
  selectedCategories: evaluationData.selectedCategories,
190
  excludedCategoryReasons: evaluationData.excludedCategoryReasons || {},
191
- categoryEvaluations: evaluationData.categoryScores,
192
  overallStats: {
193
  completenessScore: 85, // Safe default value
194
  totalApplicable: evaluationData.selectedCategories.length,
@@ -233,7 +245,7 @@ export function AIEvaluationDashboard({ onBack, onSaveEvaluation }: AIEvaluation
233
  case "categories":
234
  return (
235
  <CategorySelection
236
- categories={CATEGORIES}
237
  selectedCategories={evaluationData.selectedCategories}
238
  onSelectionChange={handleCategoriesSelectedWithReasons}
239
  />
@@ -241,10 +253,11 @@ export function AIEvaluationDashboard({ onBack, onSaveEvaluation }: AIEvaluation
241
  case "evaluation":
242
  return (
243
  <EvaluationForm
244
- categories={CATEGORIES}
245
  selectedCategories={evaluationData.selectedCategories}
246
  categoryScores={evaluationData.categoryScores}
247
  onScoreUpdate={(categoryId, score) => handleCategoryComplete(categoryId, score)}
 
248
  onComplete={() => setCurrentStep("results")}
249
  />
250
  )
@@ -252,7 +265,7 @@ export function AIEvaluationDashboard({ onBack, onSaveEvaluation }: AIEvaluation
252
  return (
253
  <ResultsDashboard
254
  systemInfo={evaluationData.systemInfo}
255
- categories={CATEGORIES}
256
  selectedCategories={evaluationData.selectedCategories}
257
  categoryScores={evaluationData.categoryScores}
258
  excludedCategoryReasons={evaluationData.excludedCategoryReasons || {}}
 
10
  import { CategoryEvaluation } from "./category-evaluation"
11
  import { EvaluationForm } from "./evaluation-form"
12
  import { ResultsDashboard } from "./results-dashboard"
13
+ import { getAllCategories, getCategoryById } from "@/lib/schema"
14
 
15
  export type SystemInfo = {
16
  name: string
17
  url: string
18
  provider: string
19
+ version: string
20
  deploymentContexts: string[]
 
21
  modelTag?: string
22
  knowledgeCutoff?: string
23
  modelType?: "foundational" | "fine-tuned" | "na"
 
41
  selectedCategories: string[]
42
  excludedCategoryReasons?: Record<string, string>
43
  categoryScores: Record<string, CategoryScore>
44
+ categoryEvaluationsDetailed?: Record<string, any>
45
  currentCategory: string | null
46
  }
47
 
 
103
  console.log("[v0] Updated categoryScores:", newCategoryScores)
104
  return {
105
  ...prev,
106
+ categoryScores: newCategoryScores,
107
  }
108
  })
109
 
 
126
  }
127
  }
128
 
129
+ const handleSaveDetailed = (categoryId: string, data: any) => {
130
+ console.log('[v0] handleSaveDetailed called for', categoryId, data)
131
+ setEvaluationData((prev) => ({
132
+ ...prev,
133
+ categoryEvaluationsDetailed: {
134
+ ...(prev.categoryEvaluationsDetailed || {}),
135
+ [categoryId]: data,
136
+ },
137
+ }))
138
+ }
139
+
140
  const handleSaveEvaluation = async () => {
141
  console.log("[v0] handleSaveEvaluation called")
142
  console.log("[v0] evaluationData:", evaluationData)
 
153
  console.log("[v0] Processing category scores:", evaluationData.categoryScores)
154
 
155
  const capabilityCategories = evaluationData.selectedCategories.filter((cat) => {
156
+ const category = getCategoryById(cat)
157
  console.log("[v0] Category check:", cat, "type:", category?.type)
158
  return category?.type === "capability"
159
  })
160
  console.log("[v0] Capability categories:", capabilityCategories)
161
 
162
  const riskCategories = evaluationData.selectedCategories.filter((cat) => {
163
+ const category = getCategoryById(cat)
164
  return category?.type === "risk"
165
  })
166
  console.log("[v0] Risk categories:", riskCategories)
 
195
  version: evaluationData.systemInfo.url || "1.0",
196
  deploymentContext: evaluationData.systemInfo.deploymentContexts.join(", ") || "Production",
197
  evaluator: "Current User",
198
+ inputModalities: evaluationData.systemInfo.inputModalities || ["Text"],
199
+ outputModalities: evaluationData.systemInfo.outputModalities || ["Text"],
200
  evaluationDate: new Date().toISOString().split("T")[0],
201
  selectedCategories: evaluationData.selectedCategories,
202
  excludedCategoryReasons: evaluationData.excludedCategoryReasons || {},
203
+ categoryEvaluations: evaluationData.categoryEvaluationsDetailed || evaluationData.categoryScores,
204
  overallStats: {
205
  completenessScore: 85, // Safe default value
206
  totalApplicable: evaluationData.selectedCategories.length,
 
245
  case "categories":
246
  return (
247
  <CategorySelection
248
+ categories={getAllCategories()}
249
  selectedCategories={evaluationData.selectedCategories}
250
  onSelectionChange={handleCategoriesSelectedWithReasons}
251
  />
 
253
  case "evaluation":
254
  return (
255
  <EvaluationForm
256
+ categories={getAllCategories()}
257
  selectedCategories={evaluationData.selectedCategories}
258
  categoryScores={evaluationData.categoryScores}
259
  onScoreUpdate={(categoryId, score) => handleCategoryComplete(categoryId, score)}
260
+ onSaveDetailed={(catId, data) => handleSaveDetailed(catId, data)}
261
  onComplete={() => setCurrentStep("results")}
262
  />
263
  )
 
265
  return (
266
  <ResultsDashboard
267
  systemInfo={evaluationData.systemInfo}
268
+ categories={getAllCategories()}
269
  selectedCategories={evaluationData.selectedCategories}
270
  categoryScores={evaluationData.categoryScores}
271
  excludedCategoryReasons={evaluationData.excludedCategoryReasons || {}}
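In ai-evaluation-dashboard.tsx above, the new handleSaveDetailed callback accumulates the full per-category answers under categoryEvaluationsDetailed, and handleSaveEvaluation then writes categoryEvaluations from that map, falling back to the summary categoryScores. A minimal sketch of that precedence follows; the SummaryScore shape is a placeholder, not the commit's real type.

// Mirrors `categoryEvaluations: evaluationData.categoryEvaluationsDetailed || evaluationData.categoryScores`.
type SummaryScore = { yesCount: number; totalQuestions: number } // placeholder shape

function resolveCategoryEvaluations(
  detailed: Record<string, unknown> | undefined,
  summaries: Record<string, SummaryScore>,
): Record<string, unknown> {
  // Detailed per-question data is stored whenever it has been initialized;
  // otherwise the summary score map is persisted instead.
  return detailed || summaries
}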
components/category-evaluation.tsx CHANGED
@@ -12,151 +12,15 @@ import { Separator } from "@/components/ui/separator"
12
  import type { CategoryScore } from "@/components/ai-evaluation-dashboard"
13
  import { HelpCircle, CheckCircle, Plus, Trash2 } from "lucide-react"
14
  import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from "@/components/ui/tooltip"
15
- import { BENCHMARK_QUESTIONS, PROCESS_QUESTIONS, SOURCE_TYPES, ADDITIONAL_ASPECTS_SECTION, getFieldPlaceholder, getHint } from "@/lib/category-data"
16
 
17
  // The detailed per-category and per-question hints, plus recommended placeholders,
18
  // are centralized in `lib/category-data.ts`. This component uses the exported
19
  // helpers `getHint` and `getFieldPlaceholder` and the question lists.
20
 
21
- const CustomFieldComponent = ({
22
- questionId,
23
- fieldType,
24
- value,
25
- onChange,
26
- }: {
27
- questionId: string
28
- fieldType: string
29
- value: string
30
- onChange: (value: string) => void
31
- }) => {
32
- const getFieldConfig = (questionId: string, fieldType: string) => {
33
- const configs: Record<string, Record<string, { label: string; placeholder: string; type?: string }>> = {
34
- A2: {
35
- thresholds: { label: "Quantitative Thresholds", placeholder: "e.g., >85% accuracy, <0.1 error rate" },
36
- thresholdSource: {
37
- label: "Threshold Source",
38
- placeholder: "e.g., industry standard, research paper, policy requirement",
39
- },
40
- passFail: { label: "Pass/Fail Determination", placeholder: "e.g., Pass - exceeded 85% threshold" },
41
- },
42
- A3: {
43
- comparativeScores: {
44
- label: "Comparative Scores",
45
- placeholder: "e.g., Our model: 87.2%, GPT-4: 85.1%, Previous version: 82.3%",
46
- },
47
- baselineType: { label: "Baseline Type", placeholder: "e.g., SOTA, previous version, industry standard" },
48
- significance: { label: "Statistical Significance", placeholder: "e.g., p<0.05, 95% CI: [1.2, 3.8]" },
49
- },
50
- A4: {
51
- testTypes: { label: "Test Types", placeholder: "e.g., adversarial attacks, load testing, distribution shift" },
52
- failureRates: { label: "Failure/Degradation Rates", placeholder: "e.g., 15% failure under adversarial inputs" },
53
- robustnessMetrics: {
54
- label: "Robustness Metrics",
55
- placeholder: "e.g., attack success rate, performance drop %",
56
- },
57
- },
58
- A5: {
59
- liveMetrics: { label: "Live Metrics Tracked", placeholder: "e.g., error rates, latency, drift detection" },
60
- samplingCadence: { label: "Sampling Cadence", placeholder: "e.g., every 1000 requests, hourly, daily" },
61
- alertThresholds: { label: "Alert Thresholds", placeholder: "e.g., >5% error rate, >500ms latency" },
62
- },
63
- A6: {
64
- procedure: {
65
- label: "Contamination Check Procedure",
66
- placeholder: "e.g., n-gram overlap analysis, URL deduplication",
67
- },
68
- contaminationRate: {
69
- label: "Contamination Rate",
70
- placeholder: "e.g., <1% overlap detected, 0.3% exact matches",
71
- },
72
- mitigations: { label: "Mitigations Taken", placeholder: "e.g., removed overlapping samples, used holdout set" },
73
- },
74
- A7: {
75
- comparisonSystems: { label: "Comparison Systems", placeholder: "e.g., GPT-4, Claude-3, Gemini Pro" },
76
- evaluationConditions: {
77
- label: "Evaluation Conditions",
78
- placeholder: "e.g., same prompts, temperature=0, identical hardware",
79
- },
80
- relativeMetrics: {
81
- label: "Relative Performance Metrics",
82
- placeholder: "e.g., 15% better accuracy, 2x faster inference",
83
- },
84
- },
85
- B1: {
86
- scope: {
87
- label: "Evaluation Scope",
88
- placeholder: "e.g., measures reasoning capability in mathematical contexts",
89
- },
90
- successFailureDefinitions: {
91
- label: "Success/Failure Definitions",
92
- placeholder: "e.g., success = >80% on grade-level problems",
93
- },
94
- hypotheses: { label: "Hypotheses Being Tested", placeholder: "e.g., model can solve multi-step word problems" },
95
- },
96
- B2: {
97
- replicationPackage: {
98
- label: "Replication Package",
99
- placeholder: "e.g., GitHub repo with code, configs, prompts",
100
- },
101
- accessLevel: { label: "Access Level", placeholder: "e.g., public, access-controlled, internal only" },
102
- proxies: { label: "Proxies (if not shareable)", placeholder: "e.g., synthetic examples, anonymized data" },
103
- },
104
- B5: {
105
- reviewers: { label: "Reviewers", placeholder: "e.g., domain experts, affected user groups, ethics board" },
106
- feedbackChanges: {
107
- label: "Changes from Feedback",
108
- placeholder: "e.g., added bias metrics, revised interpretation",
109
- },
110
- disagreements: {
111
- label: "Unresolved Disagreements",
112
- placeholder: "e.g., threshold levels, risk severity ratings",
113
- },
114
- },
115
- B6: {
116
- uncertaintyDisclosure: {
117
- label: "Uncertainty Disclosure",
118
- placeholder: "e.g., error bars, confidence intervals, variance across runs",
119
- },
120
- axesConsistency: { label: "Axes Consistency", placeholder: "e.g., consistent 0-100 scale, no truncated axes" },
121
- sampleSizes: { label: "Sample Sizes", placeholder: "e.g., n=1000 test samples, 5 random seeds" },
122
- selectionCriteria: { label: "Selection Criteria", placeholder: "e.g., all results shown, no cherry-picking" },
123
- },
124
- B8: {
125
- triggers: {
126
- label: "Re-evaluation Triggers",
127
- placeholder: "e.g., model updates, data drift >5%, security incidents",
128
- },
129
- versionedSpecs: { label: "Versioned Eval Specs", placeholder: "e.g., eval spec v2.1, change log maintained" },
130
- auditTrail: { label: "Audit Trail", placeholder: "e.g., all changes logged with timestamps and rationale" },
131
- mitigationProtocols: {
132
- label: "Mitigation Protocols",
133
- placeholder: "e.g., automated rollback, manual review process",
134
- },
135
- retestProcedures: {
136
- label: "Retest Procedures",
137
- placeholder: "e.g., full eval suite after fixes, regression testing",
138
- },
139
- },
140
- }
141
-
142
- return configs[questionId]?.[fieldType] || { label: fieldType, placeholder: "" }
143
- }
144
-
145
- const config = getFieldConfig(questionId, fieldType)
146
-
147
- return (
148
- <div>
149
- <Label className="text-xs font-medium">{config.label}</Label>
150
- <Textarea
151
- placeholder={config.placeholder}
152
- value={value}
153
- onChange={(e) => onChange(e.target.value)}
154
- rows={2}
155
- className="mt-1"
156
- />
157
- </div>
158
- )
159
- }
160
 
161
  // Local types used by this component (kept minimal for readability)
162
  export type Source = {
@@ -190,9 +54,10 @@ export type CategoryEvaluationProps = {
190
  category: { id: string; name: string; description: string; type: string; detailedGuidance?: string }
191
  score?: CategoryScore | null
192
  onScoreUpdate: (score: CategoryScore) => void
 
193
  }
194
 
195
- export function CategoryEvaluation({ category, score, onScoreUpdate }: CategoryEvaluationProps) {
196
  const [benchmarkAnswers, setBenchmarkAnswers] = useState<Record<string, string>>({})
197
  const [processAnswers, setProcessAnswers] = useState<Record<string, string>>({})
198
  const [benchmarkSources, setBenchmarkSources] = useState<Record<string, Source[]>>({})
@@ -209,8 +74,11 @@ export function CategoryEvaluation({ category, score, onScoreUpdate }: CategoryE
209
 
210
  const addSource = (questionId: string, section: "benchmark" | "process") => {
211
  if (section === "benchmark") {
 
 
 
212
  const newSource: Source = {
213
- id: Date.now().toString(),
214
  url: "",
215
  description: "",
216
  sourceType: "internal",
@@ -227,8 +95,11 @@ export function CategoryEvaluation({ category, score, onScoreUpdate }: CategoryE
227
  [questionId]: [...(prev[questionId] || []), newSource],
228
  }))
229
  } else {
 
 
 
230
  const newDocSource: DocumentationSource = {
231
- id: Date.now().toString(),
232
  url: "",
233
  description: "",
234
  sourceType: "internal",
@@ -326,8 +197,8 @@ export function CategoryEvaluation({ category, score, onScoreUpdate }: CategoryE
326
 
327
  const currentScore = useMemo(() => {
328
  // Calculate counts
329
- const totalBenchmarkQuestions = BENCHMARK_QUESTIONS.length
330
- const totalProcessQuestions = PROCESS_QUESTIONS.length
331
  const totalQuestions = totalBenchmarkQuestions + totalProcessQuestions
332
 
333
  const benchmarkYesCount = Object.values(benchmarkAnswers).filter((answer) => answer === "yes").length
@@ -410,12 +281,22 @@ export function CategoryEvaluation({ category, score, onScoreUpdate }: CategoryE
410
  }
411
 
412
  console.log("[v0] Saving category evaluation")
 
 
413
  console.log("[v0] Calling onScoreUpdate with:", currentScore)
414
  onScoreUpdate(currentScore)
415
  }
416
 
417
  const isComplete =
418
- Object.keys(benchmarkAnswers).length + Object.keys(processAnswers).length === BENCHMARK_QUESTIONS.length + PROCESS_QUESTIONS.length
419
 
420
  return (
421
  <TooltipProvider>
@@ -485,7 +366,7 @@ export function CategoryEvaluation({ category, score, onScoreUpdate }: CategoryE
485
  </CardDescription>
486
  </CardHeader>
487
  <CardContent className="space-y-6">
488
- {BENCHMARK_QUESTIONS.map((question) => (
489
  <div key={question.id} className="space-y-3">
490
  <div className="flex items-start gap-2">
491
  <Label className="text-sm font-medium flex-1">
@@ -568,114 +449,45 @@ export function CategoryEvaluation({ category, score, onScoreUpdate }: CategoryE
568
  <div className="grid gap-3">
569
  {/* no structured hint here; description has contextual hints */}
570
 
571
- <div>
572
- <Label className="text-xs">Benchmark/Dataset Name</Label>
573
- <Input
574
- placeholder={getFieldPlaceholder(category.id, question.id, "benchmarkName")}
575
- value={source.benchmarkName || ""}
576
- onChange={(e) =>
577
- updateSource(question.id, source.id, "benchmarkName", e.target.value, "benchmark")
578
- }
579
- />
580
- </div>
581
-
582
- <div className="grid grid-cols-2 gap-3">
583
- <div>
584
- <Label className="text-xs">Version</Label>
585
- <Input
586
- placeholder="e.g., v1.2, 2024-01"
587
- value={source.version || ""}
588
- onChange={(e) =>
589
- updateSource(question.id, source.id, "version", e.target.value, "benchmark")
590
- }
591
- />
592
- </div>
593
- <div>
594
- <Label className="text-xs">Task Variants</Label>
595
- <Input
596
- placeholder="e.g., multiple choice, generation"
597
- value={source.taskVariants || ""}
598
- onChange={(e) =>
599
- updateSource(question.id, source.id, "taskVariants", e.target.value, "benchmark")
600
- }
601
- />
602
- </div>
603
- </div>
604
-
605
- <div>
606
- <Label className="text-xs">Metrics</Label>
607
- <Input
608
- placeholder={getFieldPlaceholder(category.id, question.id, "metrics")}
609
- value={source.metrics || ""}
610
- onChange={(e) =>
611
- updateSource(question.id, source.id, "metrics", e.target.value, "benchmark")
612
- }
613
- />
614
- </div>
615
-
616
- <div>
617
- <Label className="text-xs">URL</Label>
618
- <Input
619
- placeholder="https://..."
620
- value={source.url}
621
- onChange={(e) => updateSource(question.id, source.id, "url", e.target.value, "benchmark")}
622
- />
623
- </div>
624
-
625
- <div>
626
- <Label className="text-xs">Description</Label>
627
- <Textarea
628
- placeholder="Describe the benchmark, test, or evaluation method..."
629
- value={source.description}
630
- onChange={(e) =>
631
- updateSource(question.id, source.id, "description", e.target.value, "benchmark")
632
- }
633
- rows={2}
634
- />
635
- <p className="text-xs text-muted-foreground mt-1">
636
- {getHint(category.id, question.id, "benchmark")}
637
- </p>
638
- </div>
639
-
640
- <div className="grid grid-cols-2 gap-3">
641
- <div>
642
- <Label className="text-xs">Source Type</Label>
643
- <RadioGroup
644
- value={source.sourceType}
645
- onValueChange={(value) =>
646
- updateSource(question.id, source.id, "sourceType", value, "benchmark")
647
- }
648
- >
649
- {Object.entries(SOURCE_TYPES).map(([key, type]) => (
650
- <div key={key} className="flex items-center space-x-2">
651
- <RadioGroupItem value={key} id={`${source.id}-${key}`} />
652
- <Label htmlFor={`${source.id}-${key}`} className="text-xs">
653
- {type.label}
654
- </Label>
655
  </div>
656
- ))}
657
- </RadioGroup>
658
- </div>
659
-
660
- <div>
661
- <Label className="text-xs">Score (if applicable)</Label>
662
- <Input
663
- placeholder="e.g., 85%, 0.92, Pass"
664
- value={source.score || ""}
665
- onChange={(e) =>
666
- updateSource(question.id, source.id, "score", e.target.value, "benchmark")
667
- }
668
- />
669
- <Label className="text-xs mt-2">Confidence Interval (optional)</Label>
670
- <Input
671
- placeholder="e.g., 95% CI [90,94]"
672
- value={(source as any).confidenceInterval || ""}
673
- onChange={(e) =>
674
- updateSource(question.id, source.id, "confidenceInterval", e.target.value, "benchmark")
675
- }
676
- />
677
  </div>
678
- </div>
679
  </div>
680
  </div>
681
  ))}
@@ -702,7 +514,7 @@ export function CategoryEvaluation({ category, score, onScoreUpdate }: CategoryE
702
  </CardDescription>
703
  </CardHeader>
704
  <CardContent className="space-y-6">
705
- {PROCESS_QUESTIONS.map((question) => (
706
  <div key={question.id} className="space-y-3">
707
  <div className="flex items-start gap-2">
708
  <Label className="text-sm font-medium flex-1">
@@ -785,99 +597,29 @@ export function CategoryEvaluation({ category, score, onScoreUpdate }: CategoryE
785
  <div className="grid gap-3">
786
  {/* no structured hint here; description has contextual hints */}
787
 
788
- <div>
789
- <Label className="text-xs">URL</Label>
790
- <Input
791
- placeholder="https://..."
792
- value={source.url}
793
- onChange={(e) => updateSource(question.id, source.id, "url", e.target.value, "process")}
794
- />
795
- </div>
796
-
797
- <div>
798
- <Label className="text-xs">Description</Label>
799
- <Textarea
800
- placeholder="Describe the documentation, policy, or process..."
801
- value={source.description}
802
- onChange={(e) =>
803
- updateSource(question.id, source.id, "description", e.target.value, "process")
804
- }
805
- rows={2}
806
- />
807
- <p className="text-xs text-muted-foreground mt-1">
808
- {getHint(category.id, question.id, "process")}
809
- </p>
810
- </div>
811
-
812
- <div className="grid grid-cols-2 gap-3">
813
- <div>
814
- <Label className="text-xs">Source Type</Label>
815
- <RadioGroup
816
- value={source.sourceType}
817
- onValueChange={(value) =>
818
- updateSource(question.id, source.id, "sourceType", value, "process")
819
- }
820
- >
821
- {Object.entries(SOURCE_TYPES).map(([key, type]) => (
822
- <div key={key} className="flex items-center space-x-2">
823
- <RadioGroupItem value={key} id={`${source.id}-${key}`} />
824
- <Label htmlFor={`${source.id}-${key}`} className="text-xs">
825
- {type.label}
826
- </Label>
827
- </div>
828
- ))}
829
- </RadioGroup>
830
- </div>
831
-
832
- <div>
833
- <Label className="text-xs">Document Type</Label>
834
- <Input
835
- placeholder="e.g., Policy, Procedure, Report"
836
- value={source.documentType || ""}
837
- onChange={(e) =>
838
- updateSource(question.id, source.id, "documentType", e.target.value, "process")
839
- }
840
- />
841
- </div>
842
- </div>
843
-
844
- <div className="grid grid-cols-2 gap-3 mt-2">
845
- <div>
846
- <Label className="text-xs">Title</Label>
847
- <Input
848
- placeholder="Document title"
849
- value={(source as any).title || ""}
850
- onChange={(e) => updateSource(question.id, source.id, "title", e.target.value, "process")}
851
- />
852
- </div>
853
- <div>
854
- <Label className="text-xs">Author</Label>
855
- <Input
856
- placeholder="Author or owner"
857
- value={(source as any).author || ""}
858
- onChange={(e) => updateSource(question.id, source.id, "author", e.target.value, "process")}
859
- />
860
- </div>
861
- </div>
862
-
863
- <div className="grid grid-cols-2 gap-3 mt-2">
864
- <div>
865
- <Label className="text-xs">Organization</Label>
866
- <Input
867
- placeholder="Owning org"
868
- value={(source as any).organization || ""}
869
- onChange={(e) => updateSource(question.id, source.id, "organization", e.target.value, "process")}
870
- />
871
- </div>
872
- <div>
873
- <Label className="text-xs">Date</Label>
874
- <Input
875
- placeholder="YYYY-MM-DD"
876
- value={(source as any).date || ""}
877
- onChange={(e) => updateSource(question.id, source.id, "date", e.target.value, "process")}
878
- />
879
  </div>
880
- </div>
881
 
882
 
883
  </div>
 
12
  import type { CategoryScore } from "@/components/ai-evaluation-dashboard"
13
  import { HelpCircle, CheckCircle, Plus, Trash2 } from "lucide-react"
14
  import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from "@/components/ui/tooltip"
15
+ import { SOURCE_TYPES, ADDITIONAL_ASPECTS_SECTION, getFieldPlaceholder, getHint } from "@/lib/schema"
16
+ import { getBenchmarkQuestions, getProcessQuestions } from '@/lib/schema'
17
+ import formSchema from '@/schema/evaluation-schema.json'
18
 
19
  // The detailed per-category and per-question hints, plus recommended placeholders,
20
  // are centralized in `lib/schema` (backed by `schema/evaluation-schema.json`). This component uses the exported
21
  // helpers `getHint` and `getFieldPlaceholder` and the question lists.
22
 
23
+ // All benchmark questions share the same input fields; all process questions share the same input fields.
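For orientation, a minimal sketch of the shape this component assumes for `schema/evaluation-schema.json`, inferred only from how the render loops below read each field (`name`, `label`, `type`, `placeholder`, `rows`, `options`); the actual schema file may carry more properties:

// Hypothetical illustration only — property names inferred from usage in this component.
type SourceFieldSpec = {
  name: string                                   // key written onto the source object via updateSource
  label: string                                  // text for the small <Label>
  type?: "text" | "textarea" | "radio"           // anything else falls back to a plain <Input>
  placeholder?: string
  rows?: number                                  // textarea height, defaults to 2
  options?: { value: string; label: string }[]   // radio options, e.g. the source types
}
// The JSON is then expected to expose `benchmarkSourceFields` and `processSourceFields`,
// each an array of SourceFieldSpec-shaped entries.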
 
 
24
 
25
  // Local types used by this component (kept minimal for readability)
26
  export type Source = {
 
54
  category: { id: string; name: string; description: string; type: string; detailedGuidance?: string }
55
  score?: CategoryScore | null
56
  onScoreUpdate: (score: CategoryScore) => void
57
+ onSaveDetailed?: (categoryId: string, data: any) => void
58
  }
59
 
60
+ export function CategoryEvaluation({ category, score, onScoreUpdate, onSaveDetailed }: CategoryEvaluationProps) {
61
  const [benchmarkAnswers, setBenchmarkAnswers] = useState<Record<string, string>>({})
62
  const [processAnswers, setProcessAnswers] = useState<Record<string, string>>({})
63
  const [benchmarkSources, setBenchmarkSources] = useState<Record<string, Source[]>>({})
 
74
 
75
  const addSource = (questionId: string, section: "benchmark" | "process") => {
76
  if (section === "benchmark") {
77
+ const newId = (globalThis.crypto && (globalThis.crypto as any).randomUUID)
78
+ ? (globalThis.crypto as any).randomUUID()
79
+ : Date.now().toString()
80
  const newSource: Source = {
81
+ id: newId,
82
  url: "",
83
  description: "",
84
  sourceType: "internal",
 
95
  [questionId]: [...(prev[questionId] || []), newSource],
96
  }))
97
  } else {
98
+ const newId = (globalThis.crypto && (globalThis.crypto as any).randomUUID)
99
+ ? (globalThis.crypto as any).randomUUID()
100
+ : Date.now().toString()
101
  const newDocSource: DocumentationSource = {
102
+ id: newId,
103
  url: "",
104
  description: "",
105
  sourceType: "internal",
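A side sketch, not part of the commit: the randomUUID-or-timestamp expression above is repeated in both branches of addSource and could be hoisted into one small helper, assuming no call site needs different behavior:

// Hypothetical helper — both branches could call newSourceId() instead of repeating the ternary.
const newSourceId = (): string =>
  typeof globalThis.crypto?.randomUUID === "function"
    ? globalThis.crypto.randomUUID()
    : Date.now().toString()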
 
197
 
198
  const currentScore = useMemo(() => {
199
  // Calculate counts
200
+ const totalBenchmarkQuestions = getBenchmarkQuestions().length
201
+ const totalProcessQuestions = getProcessQuestions().length
202
  const totalQuestions = totalBenchmarkQuestions + totalProcessQuestions
203
 
204
  const benchmarkYesCount = Object.values(benchmarkAnswers).filter((answer) => answer === "yes").length
 
281
  }
282
 
283
  console.log("[v0] Saving category evaluation")
284
+ const detailed = {
285
+ benchmarkAnswers,
286
+ processAnswers,
287
+ benchmarkSources,
288
+ processSources,
289
+ additionalAspects,
290
+ score: currentScore,
291
+ }
292
+ console.log("[v0] Calling onSaveDetailed with:", detailed)
293
+ onSaveDetailed?.(category.id, detailed)
294
  console.log("[v0] Calling onScoreUpdate with:", currentScore)
295
  onScoreUpdate(currentScore)
296
  }
297
 
298
  const isComplete =
299
+ Object.keys(benchmarkAnswers).length + Object.keys(processAnswers).length === getBenchmarkQuestions().length + getProcessQuestions().length
300
 
301
  return (
302
  <TooltipProvider>
 
366
  </CardDescription>
367
  </CardHeader>
368
  <CardContent className="space-y-6">
369
+ {getBenchmarkQuestions().map((question) => (
370
  <div key={question.id} className="space-y-3">
371
  <div className="flex items-start gap-2">
372
  <Label className="text-sm font-medium flex-1">
 
449
  <div className="grid gap-3">
450
  {/* no structured hint here; description has contextual hints */}
451
 
452
+ {/* Render benchmark source fields from evaluation-schema.json to keep fields uniform */}
453
+ {formSchema.benchmarkSourceFields.map((field: any) => (
454
+ <div key={field.name}>
455
+ <Label className="text-xs">{field.label}</Label>
456
+ {field.type === "textarea" ? (
457
+ <Textarea
458
+ placeholder={field.placeholder || ""}
459
+ value={(source as any)[field.name] || ""}
460
+ onChange={(e) => updateSource(question.id, source.id, field.name, e.target.value, "benchmark")}
461
+ rows={field.rows || 2}
462
+ />
463
+ ) : field.type === "radio" ? (
464
+ <RadioGroup
465
+ value={(source as any)[field.name] || "internal"}
466
+ onValueChange={(value) => updateSource(question.id, source.id, field.name, value, "benchmark")}
467
+ >
468
+ <div className="flex flex-col gap-2">
469
+ {field.options.map((opt: any) => (
470
+ <div key={opt.value} className="flex items-center space-x-2">
471
+ <RadioGroupItem value={opt.value} id={`${source.id}-${field.name}-${opt.value}`} />
472
+ <Label htmlFor={`${source.id}-${field.name}-${opt.value}`} className="text-xs">
473
+ {opt.label}
474
+ </Label>
475
+ </div>
476
+ ))}
 
 
477
  </div>
478
+ </RadioGroup>
479
+ ) : (
480
+ <Input
481
+ placeholder={field.placeholder || ""}
482
+ value={(source as any)[field.name] || ""}
483
+ onChange={(e) => updateSource(question.id, source.id, field.name, e.target.value, "benchmark")}
484
+ />
485
+ )}
486
+ {field.name === "description" && (
487
+ <p className="text-xs text-muted-foreground mt-1">{getHint(category.id, question.id, "benchmark")}</p>
488
+ )}
 
 
489
  </div>
490
+ ))}
491
  </div>
492
  </div>
493
  ))}
 
514
  </CardDescription>
515
  </CardHeader>
516
  <CardContent className="space-y-6">
517
+ {getProcessQuestions().map((question) => (
518
  <div key={question.id} className="space-y-3">
519
  <div className="flex items-start gap-2">
520
  <Label className="text-sm font-medium flex-1">
 
597
  <div className="grid gap-3">
598
  {/* no structured hint here; description has contextual hints */}
599
 
600
+ {/* Render process source fields from evaluation-schema.json */}
601
+ {formSchema.processSourceFields.map((field: any) => (
602
+ <div key={field.name}>
603
+ <Label className="text-xs">{field.label}</Label>
604
+ {field.type === "textarea" ? (
605
+ <Textarea
606
+ placeholder={field.placeholder || ""}
607
+ value={(source as any)[field.name] || ""}
608
+ onChange={(e) => updateSource(question.id, source.id, field.name, e.target.value, "process")}
609
+ rows={field.rows || 2}
610
+ />
611
+ ) : (
612
+ <Input
613
+ placeholder={field.placeholder || ""}
614
+ value={(source as any)[field.name] || ""}
615
+ onChange={(e) => updateSource(question.id, source.id, field.name, e.target.value, "process")}
616
+ />
617
+ )}
618
+ {field.name === "description" && (
619
+ <p className="text-xs text-muted-foreground mt-1">{getHint(category.id, question.id, "process")}</p>
620
+ )}
 
 
621
  </div>
622
+ ))}
623
 
624
 
625
  </div>
components/evaluation-card.tsx CHANGED
@@ -13,7 +13,8 @@ export type EvaluationCardData = {
13
  id: string
14
  systemName: string
15
  provider: string
16
- modality: string // Added modality field
 
17
  completedDate: string
18
  applicableCategories: number
19
  completedCategories: number
@@ -68,13 +69,42 @@ export function EvaluationCard({ evaluation, onView, onDelete }: EvaluationCardP
68
  const toggleArea = (area: string) => setExpandedAreas((p) => ({ ...p, [area]: !p[area] }))
69
  const router = useRouter()
70
  const modalityMap: Record<string, { label: string; emoji?: string; variant?: string }> = {
71
- "text-to-text": { label: "Text → Text", emoji: "📝" },
72
- "text-to-image": { label: "Text → Image", emoji: "🖼️" },
73
- multimodal: { label: "Multimodal", emoji: "🤖" },
74
- "speech-to-text": { label: "Speech → Text", emoji: "🗣️" },
75
- "speech-to-speech": { label: "Speech → Speech", emoji: "🔊" },
76
- "image-to-text": { label: "Image → Text", emoji: "📷" },
77
- code: { label: "Code", emoji: "💻" },
 
 
78
  }
79
  const getUniqueCount = (lists: string[][]) => {
80
  const set = new Set<string>()
@@ -147,14 +177,24 @@ export function EvaluationCard({ evaluation, onView, onDelete }: EvaluationCardP
147
  <div className="space-y-1 flex-1 min-w-0">
148
  <CardTitle className="text-lg font-heading truncate">{evaluation.systemName}</CardTitle>
149
  <p className="text-sm text-muted-foreground truncate">{evaluation.provider}</p>
150
- {/* Pretty modality badge with emoji */}
151
  {(() => {
152
- const info = modalityMap[evaluation.modality] || { label: evaluation.modality }
153
  return (
154
- <Badge variant={info.variant as any || "secondary"} className="text-xs px-2 py-1 w-fit flex items-center gap-2">
155
- {info.emoji ? <span aria-hidden>{info.emoji}</span> : null}
156
- <span className="whitespace-nowrap">{info.label}</span>
157
- </Badge>
 
158
  )
159
  })()}
160
  </div>
 
13
  id: string
14
  systemName: string
15
  provider: string
16
+ inputModalities: string[]
17
+ outputModalities: string[]
18
  completedDate: string
19
  applicableCategories: number
20
  completedCategories: number
 
69
  const toggleArea = (area: string) => setExpandedAreas((p) => ({ ...p, [area]: !p[area] }))
70
  const router = useRouter()
71
  const modalityMap: Record<string, { label: string; emoji?: string; variant?: string }> = {
72
+ "Text": { label: "Text", emoji: "📝" },
73
+ "Image": { label: "Image", emoji: "🖼️" },
74
+ "Audio": { label: "Audio", emoji: "🔊" },
75
+ "Video": { label: "Video", emoji: "🎥" },
76
+ "Tabular": { label: "Tabular", emoji: "📊" },
77
+ "Robotics/Action": { label: "Robotics", emoji: "🤖" },
78
+ "Other": { label: "Other", emoji: "" },
79
+ }
80
+
81
+ const getModalityDisplay = (inputModalities: string[], outputModalities: string[]) => {
82
+ const inputStr = inputModalities.join(", ")
83
+ const outputStr = outputModalities.join(", ")
84
+
85
+ // Special cases for common patterns
86
+ if (inputModalities.length === 1 && outputModalities.length === 1) {
87
+ if (inputModalities[0] === "Text" && outputModalities[0] === "Text") {
88
+ return { label: "Text → Text", emoji: "�" }
89
+ }
90
+ if (inputModalities[0] === "Text" && outputModalities[0] === "Image") {
91
+ return { label: "Text → Image", emoji: "�️" }
92
+ }
93
+ if (inputModalities[0] === "Image" && outputModalities[0] === "Text") {
94
+ return { label: "Image → Text", emoji: "📷" }
95
+ }
96
+ if (inputModalities[0] === "Tabular" && outputModalities[0] === "Tabular") {
97
+ return { label: "Tabular", emoji: "📊" }
98
+ }
99
+ }
100
+
101
+ // Multimodal cases
102
+ if (inputModalities.length > 1 || outputModalities.length > 1) {
103
+ return { label: "Multimodal", emoji: "🤖" }
104
+ }
105
+
106
+ // Fallback
107
+ return { label: `${inputStr} → ${outputStr}`, emoji: "⚡" }
108
  }
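For illustration, the branches above resolve as follows (read off the code, not runtime output; note that getModalityDisplay does not consult the modalityMap defined just before it):

// getModalityDisplay(["Text"], ["Text"])           -> { label: "Text → Text", emoji: "📝" }
// getModalityDisplay(["Image"], ["Text"])          -> { label: "Image → Text", emoji: "📷" }
// getModalityDisplay(["Text", "Image"], ["Text"])  -> { label: "Multimodal", emoji: "🤖" }
// getModalityDisplay(["Audio"], ["Text"])          -> { label: "Audio → Text", emoji: "⚡" }  (fallback branch)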
109
  const getUniqueCount = (lists: string[][]) => {
110
  const set = new Set<string>()
 
177
  <div className="space-y-1 flex-1 min-w-0">
178
  <CardTitle className="text-lg font-heading truncate">{evaluation.systemName}</CardTitle>
179
  <p className="text-sm text-muted-foreground truncate">{evaluation.provider}</p>
180
+ {/* Enhanced modality badge with emoji and hover detail */}
181
  {(() => {
182
+ const info = getModalityDisplay(evaluation.inputModalities, evaluation.outputModalities)
183
  return (
184
+ <Tooltip>
185
+ <TooltipTrigger asChild>
186
+ <Badge variant="secondary" className="text-xs px-2 py-1 w-fit flex items-center gap-2 cursor-help">
187
+ {info.emoji ? <span aria-hidden className="text-sm">{info.emoji}</span> : null}
188
+ <span className="whitespace-nowrap">{info.label}</span>
189
+ </Badge>
190
+ </TooltipTrigger>
191
+ <TooltipContent side="bottom" className="max-w-xs">
192
+ <div className="text-sm">
193
+ <div><strong>Input:</strong> {evaluation.inputModalities.join(", ")}</div>
194
+ <div><strong>Output:</strong> {evaluation.outputModalities.join(", ")}</div>
195
+ </div>
196
+ </TooltipContent>
197
+ </Tooltip>
198
  )
199
  })()}
200
  </div>
components/evaluation-form.tsx CHANGED
@@ -15,7 +15,7 @@ interface Category {
15
  name: string
16
  type: "capability" | "risk"
17
  description: string
18
- detailedGuidance: string
19
  }
20
 
21
  interface EvaluationFormProps {
@@ -23,6 +23,7 @@ interface EvaluationFormProps {
23
  selectedCategories: string[]
24
  categoryScores: Record<string, CategoryScore>
25
  onScoreUpdate: (categoryId: string, score: CategoryScore) => void
 
26
  onComplete: () => void
27
  }
28
 
@@ -31,6 +32,7 @@ export function EvaluationForm({
31
  selectedCategories,
32
  categoryScores,
33
  onScoreUpdate,
 
34
  onComplete,
35
  }: EvaluationFormProps) {
36
  const [currentCategoryIndex, setCurrentCategoryIndex] = useState(0)
@@ -150,6 +152,7 @@ export function EvaluationForm({
150
  category={currentCategory}
151
  score={categoryScores[currentCategory.id]}
152
  onScoreUpdate={(score) => onScoreUpdate(currentCategory.id, score)}
 
153
  />
154
  )}
155
 
 
15
  name: string
16
  type: "capability" | "risk"
17
  description: string
18
+ detailedGuidance?: string
19
  }
20
 
21
  interface EvaluationFormProps {
 
23
  selectedCategories: string[]
24
  categoryScores: Record<string, CategoryScore>
25
  onScoreUpdate: (categoryId: string, score: CategoryScore) => void
26
+ onSaveDetailed?: (categoryId: string, data: any) => void
27
  onComplete: () => void
28
  }
29
 
 
32
  selectedCategories,
33
  categoryScores,
34
  onScoreUpdate,
35
+ onSaveDetailed,
36
  onComplete,
37
  }: EvaluationFormProps) {
38
  const [currentCategoryIndex, setCurrentCategoryIndex] = useState(0)
 
152
  category={currentCategory}
153
  score={categoryScores[currentCategory.id]}
154
  onScoreUpdate={(score) => onScoreUpdate(currentCategory.id, score)}
155
+ onSaveDetailed={(catId, data) => onSaveDetailed?.(catId, data)}
156
  />
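A minor simplification, assuming the prop types line up as declared: the wrapper lambda adds nothing over passing the callback through directly.

// equivalent shorthand sketch
onSaveDetailed={onSaveDetailed}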
157
  )}
158
 
components/system-info-form.tsx CHANGED
@@ -10,45 +10,27 @@ import { Label } from "@/components/ui/label"
10
  import { Checkbox } from "@/components/ui/checkbox"
11
  import { RadioGroup, RadioGroupItem } from "@/components/ui/radio-group"
12
  import type { SystemInfo } from "@/components/ai-evaluation-dashboard"
 
13
 
14
  interface SystemInfoFormProps {
15
  onSubmit: (data: SystemInfo) => void
16
  initialData: SystemInfo | null
17
  }
18
 
19
- const SYSTEM_TYPES = [
20
- "Text-to-Text (e.g., chatbots, language models)",
21
- "Text-to-Image (e.g., image generation)",
22
- "Image-to-Text (e.g., image captioning, OCR)",
23
- "Image-to-Image (e.g., image editing, style transfer)",
24
- "Audio/Speech (e.g., speech recognition, text-to-speech)",
25
- "Video (e.g., video generation, analysis)",
26
- "Multimodal",
27
- "Robotic/Embodied AI",
28
- "Other",
29
- ]
30
-
31
- const DEPLOYMENT_CONTEXTS = [
32
- "Research/Academic",
33
- "Internal/Enterprise Use",
34
- "Public/Consumer-Facing",
35
- "High-Risk Applications",
36
- "Other",
37
- ]
38
 
39
  export function SystemInfoForm({ onSubmit, initialData }: SystemInfoFormProps) {
40
  const [formData, setFormData] = useState<SystemInfo>({
41
  name: initialData?.name || "",
42
  url: initialData?.url || "",
43
  provider: initialData?.provider || "",
44
- systemTypes: initialData?.systemTypes || [],
45
- deploymentContexts: initialData?.deploymentContexts || [],
46
- modality: initialData?.modality || "text",
47
- modelTag: (initialData as any)?.modelTag || "",
48
- knowledgeCutoff: (initialData as any)?.knowledgeCutoff || "",
49
- modelType: (initialData as any)?.modelType || "na",
50
- inputModalities: (initialData as any)?.inputModalities || [],
51
- outputModalities: (initialData as any)?.outputModalities || [],
52
  })
53
 
54
  const handleSubmit = (e: React.FormEvent) => {
@@ -56,13 +38,6 @@ export function SystemInfoForm({ onSubmit, initialData }: SystemInfoFormProps) {
56
  onSubmit(formData)
57
  }
58
 
59
- const handleSystemTypeChange = (type: string, checked: boolean) => {
60
- setFormData((prev: SystemInfo) => ({
61
- ...prev,
62
- systemTypes: checked ? [...prev.systemTypes, type] : prev.systemTypes.filter((t) => t !== type),
63
- }))
64
- }
65
-
66
  const handleDeploymentContextChange = (context: string, checked: boolean) => {
67
  setFormData((prev: SystemInfo) => ({
68
  ...prev,
@@ -113,6 +88,18 @@ export function SystemInfoForm({ onSubmit, initialData }: SystemInfoFormProps) {
113
  placeholder="e.g., gpt-4-1-2025-04-14"
114
  />
115
  </div>
 
 
116
  <div className="space-y-2">
117
  <Label htmlFor="knowledgeCutoff">Knowledge Cutoff Date</Label>
118
  <Input
@@ -122,17 +109,16 @@ export function SystemInfoForm({ onSubmit, initialData }: SystemInfoFormProps) {
122
  placeholder="YYYY-MM-DD"
123
  />
124
  </div>
125
- </div>
126
-
127
- <div className="space-y-2">
128
- <Label htmlFor="provider">Provider/Organization *</Label>
129
- <Input
130
- id="provider"
131
- value={formData.provider}
132
- onChange={(e) => setFormData((prev) => ({ ...prev, provider: e.target.value }))}
133
- placeholder="e.g., OpenAI, Anthropic, Internal Team"
134
- required
135
- />
136
  </div>
137
 
138
  <div className="space-y-4">
@@ -141,18 +127,12 @@ export function SystemInfoForm({ onSubmit, initialData }: SystemInfoFormProps) {
141
  <div className="mt-3">
142
  <RadioGroup value={formData.modelType} onValueChange={(val) => setFormData((prev) => ({ ...prev, modelType: val as any }))}>
143
  <div className="flex items-center gap-4">
144
- <div className="flex items-center gap-2">
145
- <RadioGroupItem value="foundational" id="mt-foundational" />
146
- <Label htmlFor="mt-foundational" className="text-sm">Foundational Model</Label>
147
- </div>
148
- <div className="flex items-center gap-2">
149
- <RadioGroupItem value="fine-tuned" id="mt-finetuned" />
150
- <Label htmlFor="mt-finetuned" className="text-sm">Fine-tuned Model</Label>
151
- </div>
152
- <div className="flex items-center gap-2">
153
- <RadioGroupItem value="na" id="mt-na" />
154
- <Label htmlFor="mt-na" className="text-sm">Doesn't apply</Label>
155
- </div>
156
  </div>
157
  </RadioGroup>
158
  </div>
@@ -161,7 +141,7 @@ export function SystemInfoForm({ onSubmit, initialData }: SystemInfoFormProps) {
161
  <div>
162
  <Label className="text-base font-medium">Input modalities (select all that apply) *</Label>
163
  <div className="grid grid-cols-1 md:grid-cols-2 gap-3 mt-3">
164
- {['Text','Image','Audio','Video','Tabular','Robotics/Action','Other'].map((m) => (
165
  <div key={m} className="flex items-center gap-2">
166
  <Checkbox
167
  id={`in-${m}`}
@@ -186,7 +166,7 @@ export function SystemInfoForm({ onSubmit, initialData }: SystemInfoFormProps) {
186
  <div>
187
  <Label className="text-base font-medium">Output modalities (select all that apply) *</Label>
188
  <div className="grid grid-cols-1 md:grid-cols-2 gap-3 mt-3">
189
- {['Text','Image','Audio','Video','Tabular','Robotics/Action','Other'].map((m) => (
190
  <div key={m} className="flex items-center gap-2">
191
  <Checkbox
192
  id={`out-${m}`}
@@ -211,7 +191,7 @@ export function SystemInfoForm({ onSubmit, initialData }: SystemInfoFormProps) {
211
  <div>
212
  <Label className="text-base font-medium">Deployment Context (select all that apply) *</Label>
213
  <div className="grid grid-cols-1 md:grid-cols-2 gap-3 mt-3">
214
- {DEPLOYMENT_CONTEXTS.map((context) => (
215
  <div key={context} className="flex items-center space-x-2">
216
  <Checkbox
217
  id={`context-${context}`}
 
10
  import { Checkbox } from "@/components/ui/checkbox"
11
  import { RadioGroup, RadioGroupItem } from "@/components/ui/radio-group"
12
  import type { SystemInfo } from "@/components/ai-evaluation-dashboard"
13
+ import { getSystemInfoFormOptions } from "@/lib/schema"
14
 
15
  interface SystemInfoFormProps {
16
  onSubmit: (data: SystemInfo) => void
17
  initialData: SystemInfo | null
18
  }
19
 
20
+ const formOptions = getSystemInfoFormOptions()
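For orientation, a sketch of the return shape this form assumes from getSystemInfoFormOptions(), inferred from how formOptions is read below; the actual helper in lib/schema may differ:

// Hypothetical illustration only.
// formOptions.modelTypes         -> { value: string; label: string }[]  e.g. [{ value: "foundational", label: "Foundational Model" }, ...]
// formOptions.modalities         -> string[]                            e.g. ["Text", "Image", "Audio", "Video", "Tabular", "Robotics/Action", "Other"]
// formOptions.deploymentContexts -> string[]                            e.g. ["Research/Academic", "Internal/Enterprise Use", "Public/Consumer-Facing", ...]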
 
 
21
 
22
  export function SystemInfoForm({ onSubmit, initialData }: SystemInfoFormProps) {
23
  const [formData, setFormData] = useState<SystemInfo>({
24
  name: initialData?.name || "",
25
  url: initialData?.url || "",
26
  provider: initialData?.provider || "",
27
+ version: initialData?.version || "",
28
+ deploymentContexts: initialData?.deploymentContexts || [],
29
+ modelTag: initialData?.modelTag || "",
30
+ knowledgeCutoff: initialData?.knowledgeCutoff || "",
31
+ modelType: initialData?.modelType || "na",
32
+ inputModalities: initialData?.inputModalities || [],
33
+ outputModalities: initialData?.outputModalities || [],
 
34
  })
35
 
36
  const handleSubmit = (e: React.FormEvent) => {
 
38
  onSubmit(formData)
39
  }
40
 
 
 
 
 
 
 
 
41
  const handleDeploymentContextChange = (context: string, checked: boolean) => {
42
  setFormData((prev: SystemInfo) => ({
43
  ...prev,
 
88
  placeholder="e.g., gpt-4-1-2025-04-14"
89
  />
90
  </div>
91
+ <div className="space-y-2">
92
+ <Label htmlFor="version">System Version</Label>
93
+ <Input
94
+ id="version"
95
+ value={formData.version}
96
+ onChange={(e) => setFormData((prev) => ({ ...prev, version: e.target.value }))}
97
+ placeholder="e.g., v3.2.1"
98
+ />
99
+ </div>
100
+ </div>
101
+
102
+ <div className="grid grid-cols-1 md:grid-cols-2 gap-4">
103
  <div className="space-y-2">
104
  <Label htmlFor="knowledgeCutoff">Knowledge Cutoff Date</Label>
105
  <Input
 
109
  placeholder="YYYY-MM-DD"
110
  />
111
  </div>
112
+ <div className="space-y-2">
113
+ <Label htmlFor="provider">Provider/Organization *</Label>
114
+ <Input
115
+ id="provider"
116
+ value={formData.provider}
117
+ onChange={(e) => setFormData((prev) => ({ ...prev, provider: e.target.value }))}
118
+ placeholder="e.g., OpenAI, Anthropic, Internal Team"
119
+ required
120
+ />
121
+ </div>
 
122
  </div>
123
 
124
  <div className="space-y-4">
 
127
  <div className="mt-3">
128
  <RadioGroup value={formData.modelType} onValueChange={(val) => setFormData((prev) => ({ ...prev, modelType: val as any }))}>
129
  <div className="flex items-center gap-4">
130
+ {formOptions.modelTypes.map((type) => (
131
+ <div key={type.value} className="flex items-center gap-2">
132
+ <RadioGroupItem value={type.value} id={`mt-${type.value}`} />
133
+ <Label htmlFor={`mt-${type.value}`} className="text-sm">{type.label}</Label>
134
+ </div>
135
+ ))}
 
 
 
 
 
 
136
  </div>
137
  </RadioGroup>
138
  </div>
 
141
  <div>
142
  <Label className="text-base font-medium">Input modalities (select all that apply) *</Label>
143
  <div className="grid grid-cols-1 md:grid-cols-2 gap-3 mt-3">
144
+ {formOptions.modalities.map((m) => (
145
  <div key={m} className="flex items-center gap-2">
146
  <Checkbox
147
  id={`in-${m}`}
 
166
  <div>
167
  <Label className="text-base font-medium">Output modalities (select all that apply) *</Label>
168
  <div className="grid grid-cols-1 md:grid-cols-2 gap-3 mt-3">
169
+ {formOptions.modalities.map((m) => (
170
  <div key={m} className="flex items-center gap-2">
171
  <Checkbox
172
  id={`out-${m}`}
 
191
  <div>
192
  <Label className="text-base font-medium">Deployment Context (select all that apply) *</Label>
193
  <div className="grid grid-cols-1 md:grid-cols-2 gap-3 mt-3">
194
+ {formOptions.deploymentContexts.map((context) => (
195
  <div key={context} className="flex items-center space-x-2">
196
  <Checkbox
197
  id={`context-${context}`}
data/evaluations/claude-3-sonnet.json DELETED
@@ -1,58 +0,0 @@
1
- {
2
- "id": "2",
3
- "systemName": "Claude 3.5 Sonnet",
4
- "provider": "Anthropic",
5
- "modality": "Text-to-Text",
6
- "version": "claude-3-5-sonnet-20241022",
7
- "deploymentContext": "API Service",
8
- "evaluationDate": "2024-12-10",
9
- "evaluator": "Anthropic Safety Team",
10
- "selectedCategories": [
11
- "language-communication",
12
- "social-intelligence",
13
- "problem-solving",
14
- "creativity-innovation",
15
- "learning-memory",
16
- "perception-vision",
17
- "metacognition",
18
- "harmful-content",
19
- "information-integrity",
20
- "privacy-data",
21
- "bias-fairness",
22
- "security-robustness",
23
- "dangerous-capabilities",
24
- "human-ai-interaction",
25
- "environmental-impact",
26
- "economic-displacement",
27
- "governance-accountability",
28
- "value-chain"
29
- ],
30
- "overallStats": {
31
- "totalApplicable": 18,
32
- "capabilityApplicable": 7,
33
- "riskApplicable": 11,
34
- "strongCategories": [
35
- "language-communication",
36
- "social-intelligence",
37
- "harmful-content",
38
- "information-integrity",
39
- "privacy-data",
40
- "bias-fairness",
41
- "security-robustness",
42
- "human-ai-interaction"
43
- ],
44
- "adequateCategories": [
45
- "problem-solving",
46
- "creativity-innovation",
47
- "learning-memory",
48
- "perception-vision",
49
- "metacognition",
50
- "dangerous-capabilities",
51
- "governance-accountability",
52
- "value-chain"
53
- ],
54
- "weakCategories": ["environmental-impact"],
55
- "insufficientCategories": ["economic-displacement"],
56
- "completenessScore": 92
57
- }
58
- }
 
 
data/evaluations/fraud-detector.json DELETED
@@ -1,31 +0,0 @@
1
- {
2
- "id": "4",
3
- "systemName": "FraudDetector AI",
4
- "provider": "FinTech Solutions",
5
- "modality": "Other",
6
- "version": "v2.3.1",
7
- "deploymentContext": "Production Banking System",
8
- "evaluationDate": "2024-12-05",
9
- "evaluator": "Internal Risk Team",
10
- "selectedCategories": [
11
- "problem-solving",
12
- "learning-memory",
13
- "harmful-content",
14
- "information-integrity",
15
- "privacy-data",
16
- "bias-fairness",
17
- "security-robustness",
18
- "governance-accountability",
19
- "value-chain"
20
- ],
21
- "overallStats": {
22
- "totalApplicable": 9,
23
- "capabilityApplicable": 2,
24
- "riskApplicable": 7,
25
- "strongCategories": ["problem-solving", "privacy-data", "security-robustness", "governance-accountability"],
26
- "adequateCategories": ["learning-memory", "information-integrity", "value-chain"],
27
- "weakCategories": ["harmful-content", "bias-fairness"],
28
- "insufficientCategories": [],
29
- "completenessScore": 78
30
- }
31
- }
 
 
data/evaluations/gemini-pro.json DELETED
@@ -1,58 +0,0 @@
1
- {
2
- "id": "3",
3
- "systemName": "Gemini Pro 1.5",
4
- "provider": "Google",
5
- "modality": "Multimodal",
6
- "version": "gemini-1.5-pro-002",
7
- "deploymentContext": "API Service",
8
- "evaluationDate": "2024-12-08",
9
- "evaluator": "Google DeepMind",
10
- "selectedCategories": [
11
- "language-communication",
12
- "social-intelligence",
13
- "problem-solving",
14
- "creativity-innovation",
15
- "learning-memory",
16
- "perception-vision",
17
- "metacognition",
18
- "harmful-content",
19
- "information-integrity",
20
- "privacy-data",
21
- "bias-fairness",
22
- "security-robustness",
23
- "dangerous-capabilities",
24
- "human-ai-interaction",
25
- "environmental-impact",
26
- "economic-displacement",
27
- "governance-accountability",
28
- "value-chain"
29
- ],
30
- "overallStats": {
31
- "totalApplicable": 18,
32
- "capabilityApplicable": 7,
33
- "riskApplicable": 11,
34
- "strongCategories": [
35
- "language-communication",
36
- "problem-solving",
37
- "perception-vision",
38
- "learning-memory",
39
- "metacognition",
40
- "information-integrity",
41
- "governance-accountability"
42
- ],
43
- "adequateCategories": [
44
- "social-intelligence",
45
- "creativity-innovation",
46
- "harmful-content",
47
- "privacy-data",
48
- "security-robustness",
49
- "dangerous-capabilities",
50
- "human-ai-interaction",
51
- "environmental-impact",
52
- "value-chain"
53
- ],
54
- "weakCategories": ["bias-fairness"],
55
- "insufficientCategories": ["economic-displacement"],
56
- "completenessScore": 87
57
- }
58
- }
 
 
data/evaluations/gpt-4-turbo.json DELETED
@@ -1,268 +0,0 @@
1
- {
2
- "id": "1",
3
- "systemName": "GPT-4 Turbo",
4
- "provider": "OpenAI",
5
- "modality": "Text-to-Text",
6
- "version": "gpt-4-turbo-2024-04-09",
7
- "deploymentContext": "API Service",
8
- "evaluationDate": "2024-12-15",
9
- "evaluator": "OpenAI Safety Team",
10
- "selectedCategories": [
11
- "language-communication",
12
- "social-intelligence",
13
- "problem-solving",
14
- "creativity-innovation",
15
- "learning-memory",
16
- "perception-vision",
17
- "metacognition",
18
- "harmful-content",
19
- "information-integrity",
20
- "privacy-data",
21
- "bias-fairness",
22
- "security-robustness",
23
- "dangerous-capabilities",
24
- "human-ai-interaction",
25
- "environmental-impact",
26
- "economic-displacement",
27
- "governance-accountability",
28
- "value-chain"
29
- ],
30
- "categoryEvaluations": {
31
- "language-communication": {
32
- "benchmarkAnswers": {
33
- "A1": "yes",
34
- "A2": "yes",
35
- "A3": "yes",
36
- "A4": "yes",
37
- "A5": "yes",
38
- "A6": "yes"
39
- },
40
- "processAnswers": {
41
- "B1": "yes",
42
- "B2": "yes",
43
- "B5": [
44
- "yes"
45
- ],
46
- "B6": "yes"
47
- },
48
- "benchmarkSources": {
49
- "A1": [
50
- {
51
- "id": "1",
52
- "benchmarkName": "MMLU",
53
- "version": "2023-05",
54
- "taskVariants": "multiple choice",
55
- "metrics": "accuracy",
56
- "url": "https://github.com/hendrycks/test",
57
- "description": "Massive Multitask Language Understanding benchmark covering 57 academic subjects",
58
- "sourceType": "external",
59
- "score": "86.4%",
60
- "customFields": {}
61
- }
62
- ],
63
- "A2": [
64
- {
65
- "id": "2",
66
- "benchmarkName": "TruthfulQA",
67
- "version": "v1.0",
68
- "taskVariants": "generation",
69
- "metrics": "truthfulness rate",
70
- "url": "https://github.com/sylinrl/TruthfulQA",
71
- "description": "Measures truthfulness in language models",
72
- "sourceType": "external",
73
- "score": "Pass - 78% truthful",
74
- "customFields": {
75
- "thresholds": ">70% truthfulness rate",
76
- "regulatorySource": "AI safety guidelines",
77
- "complianceStatus": "Compliant"
78
- }
79
- }
80
- ],
81
- "A3": [
82
- {
83
- "id": "3",
84
- "benchmarkName": "HellaSwag",
85
- "version": "v1.1",
86
- "taskVariants": "completion",
87
- "metrics": "accuracy",
88
- "url": "https://rowanzellers.com/hellaswag/",
89
- "description": "Commonsense reasoning benchmark",
90
- "sourceType": "external",
91
- "score": "95.3%",
92
- "customFields": {
93
- "comparativeScores": "GPT-4 Turbo: 95.3%, GPT-3.5: 85.5%, Claude-3: 88.0%",
94
- "comparisonTargets": "GPT-3.5, Claude-3, Gemini Pro",
95
- "significance": "p<0.001, 95% CI: [94.8, 95.8]"
96
- }
97
- }
98
- ],
99
- "A4": [
100
- {
101
- "id": "4",
102
- "benchmarkName": "AdvGLUE",
103
- "version": "v1.0",
104
- "taskVariants": "adversarial",
105
- "metrics": "robustness score",
106
- "url": "https://adversarialglue.github.io/",
107
- "description": "Adversarial evaluation of language understanding",
108
- "sourceType": "external",
109
- "score": "72.1%",
110
- "customFields": {
111
- "testTypes": "adversarial attacks, paraphrase attacks, typo attacks",
112
- "failureRates": "28% degradation under adversarial inputs",
113
- "robustnessMetrics": "attack success rate: 15%, performance drop: 12%"
114
- }
115
- }
116
- ],
117
- "A5": [
118
- {
119
- "id": "5",
120
- "benchmarkName": "Production Monitoring",
121
- "version": "v2.1",
122
- "taskVariants": "live traffic",
123
- "metrics": "error rate, latency",
124
- "url": "https://openai.com/safety",
125
- "description": "Real-time monitoring of production API",
126
- "sourceType": "internal",
127
- "score": "99.9% uptime",
128
- "customFields": {
129
- "liveMetrics": "error rates, response latency, content policy violations",
130
- "samplingCadence": "every 1000 requests",
131
- "alertThresholds": ">1% error rate, >2s latency"
132
- }
133
- }
134
- ],
135
- "A6": [
136
- {
137
- "id": "6",
138
- "benchmarkName": "Contamination Analysis",
139
- "version": "v1.0",
140
- "taskVariants": "overlap detection",
141
- "metrics": "contamination rate",
142
- "url": "https://openai.com/research/contamination",
143
- "description": "Analysis of training-test data overlap",
144
- "sourceType": "internal",
145
- "score": "<0.5% overlap",
146
- "customFields": {
147
- "procedure": "13-gram overlap analysis, fuzzy matching, URL deduplication",
148
- "contaminationRate": "0.3% exact matches, 0.8% fuzzy matches",
149
- "mitigations": "removed overlapping samples, used holdout validation set"
150
- }
151
- }
152
- ]
153
- },
154
- "processSources": {
155
- "B1": [
156
- {
157
- "id": "7",
158
- "url": "https://openai.com/research/language-models",
159
- "description": "Language model evaluation methodology and scope definition",
160
- "sourceType": "internal",
161
- "documentType": "Research Paper",
162
- "customFields": {
163
- "scope": "Evaluates natural language understanding and generation across diverse domains",
164
- "successFailureDefinitions": "Success: >80% on standardized benchmarks, coherent generation",
165
- "hypotheses": "Model can understand context and generate human-like responses"
166
- }
167
- }
168
- ],
169
- "B2": [
170
- {
171
- "id": "8",
172
- "url": "https://github.com/openai/evals",
173
- "description": "OpenAI Evals framework for reproducible evaluations",
174
- "sourceType": "external",
175
- "documentType": "Code Repository",
176
- "customFields": {
177
- "replicationPackage": "GitHub repository with evaluation code, prompts, and configurations",
178
- "accessLevel": "Public repository with documented APIs",
179
- "proxies": "Synthetic examples provided for proprietary benchmarks"
180
- }
181
- }
182
- ],
183
- "B5": [
184
- {
185
- "id": "9",
186
- "url": "https://openai.com/safety/practices",
187
- "description": "External review process for model evaluations",
188
- "sourceType": "cooperative",
189
- "documentType": "Process Documentation",
190
- "customFields": {
191
- "reviewers": "Academic researchers, AI safety experts, domain specialists",
192
- "feedbackChanges": "Added bias metrics, revised safety thresholds",
193
- "disagreements": "Threshold levels for harmful content detection"
194
- }
195
- },
196
- {
197
- "id": "11",
198
- "url": "https://openai.com/safety/monitoring",
199
- "description": "Continuous monitoring and re-evaluation procedures",
200
- "sourceType": "internal",
201
- "documentType": "Standard Operating Procedure",
202
- "customFields": {
203
- "triggers": "Model updates, performance drift >5%, safety incidents",
204
- "versionedSpecs": "Evaluation specifications v3.2 with change tracking",
205
- "auditTrail": "All changes logged with timestamps and rationale",
206
- "mitigationProtocols": "Automated rollback procedures, manual safety review",
207
- "retestProcedures": "Full evaluation suite after fixes, regression testing"
208
- }
209
- }
210
- ],
211
- "B6": [
212
- {
213
- "id": "10",
214
- "url": "https://openai.com/research/model-cards",
215
- "description": "Model card with transparent reporting of capabilities and limitations",
216
- "sourceType": "internal",
217
- "documentType": "Model Card",
218
- "customFields": {
219
- "uncertaintyDisclosure": "95% confidence intervals, error bars on all metrics",
220
- "axesConsistency": "Consistent 0-100 scale, no truncated axes",
221
- "sampleSizes": "n=10,000 test samples, 5 random seeds",
222
- "selectionCriteria": "All results reported, no cherry-picking"
223
- }
224
- }
225
- ]
226
- },
227
- "additionalAspects": "Additional evaluation includes multilingual performance testing across 25 languages, domain-specific evaluations for medical and legal text, and long-context evaluation up to 128k tokens.",
228
- "score": {
229
- "benchmarkScore": 6,
230
- "processScore": 5,
231
- "totalScore": 11,
232
- "status": "strong"
233
- }
234
- }
235
- },
236
- "overallStats": {
237
- "totalApplicable": 18,
238
- "capabilityApplicable": 7,
239
- "riskApplicable": 11,
240
- "strongCategories": [
241
- "language-communication",
242
- "problem-solving",
243
- "creativity-innovation",
244
- "learning-memory",
245
- "metacognition",
246
- "governance-accountability"
247
- ],
248
- "adequateCategories": [
249
- "social-intelligence",
250
- "perception-vision",
251
- "information-integrity",
252
- "privacy-data",
253
- "security-robustness",
254
- "human-ai-interaction",
255
- "environmental-impact",
256
- "value-chain"
257
- ],
258
- "weakCategories": [
259
- "harmful-content",
260
- "bias-fairness",
261
- "dangerous-capabilities"
262
- ],
263
- "insufficientCategories": [
264
- "economic-displacement"
265
- ],
266
- "completenessScore": 89
267
- }
268
- }
 
 
lib/category-data.ts DELETED
@@ -1,828 +0,0 @@
1
- export const CATEGORY_DESCRIPTIONS = {
2
- "language-communication": {
3
- name: "Language & Communication",
4
- description:
5
- "Evaluates the system's ability to understand, generate, and engage in natural language communication across various contexts, languages, and communication styles.",
6
- type: "capability" as const,
7
- },
8
- "social-intelligence": {
9
- name: "Social Intelligence & Interaction",
10
- description:
11
- "Assesses the system's capacity to understand social contexts, interpret human emotions and intentions, and engage appropriately in social interactions.",
12
- type: "capability" as const,
13
- },
14
- "problem-solving": {
15
- name: "Problem Solving",
16
- description:
17
- "Measures the system's ability to analyze complex problems, develop solutions, and apply reasoning across various domains and contexts.",
18
- type: "capability" as const,
19
- },
20
- "creativity-innovation": {
21
- name: "Creativity & Innovation",
22
- description:
23
- "Evaluates the system's capacity for creative thinking, generating novel ideas, and producing original content across different creative domains.",
24
- type: "capability" as const,
25
- },
26
- "learning-memory": {
27
- name: "Learning & Memory",
28
- description:
29
- "Assesses the system's ability to acquire new knowledge, retain information, and adapt behavior based on experience and feedback.",
30
- type: "capability" as const,
31
- },
32
- "perception-vision": {
33
- name: "Perception & Vision",
34
- description:
35
- "Measures the system's capability to process, interpret, and understand visual information, images, and spatial relationships.",
36
- type: "capability" as const,
37
- },
38
- "physical-manipulation": {
39
- name: "Physical Manipulation & Motor Skills",
40
- description:
41
- "Evaluates the system's ability to control physical actuators, manipulate objects, and perform motor tasks in physical environments.",
42
- type: "capability" as const,
43
- },
44
- metacognition: {
45
- name: "Metacognition & Self-Awareness",
46
- description:
47
- "Assesses the system's ability to understand its own capabilities, limitations, and reasoning processes, including self-reflection and meta-learning.",
48
- type: "capability" as const,
49
- },
50
- "robotic-intelligence": {
51
- name: "Robotic Intelligence & Autonomy",
52
- description:
53
- "Measures autonomous decision-making, navigation, and task execution in physical environments with minimal human intervention.",
54
- type: "capability" as const,
55
- },
56
- "harmful-content": {
57
- name: "Harmful Content Generation",
58
- description:
59
- "Evaluates risks related to generating harmful, offensive, illegal, or inappropriate content that could cause psychological, social, or physical harm.",
60
- type: "risk" as const,
61
- },
62
- "information-integrity": {
63
- name: "Information Integrity & Misinformation",
64
- description:
65
- "Assesses risks of generating false, misleading, or manipulated information that could undermine trust in information systems and decision-making.",
66
- type: "risk" as const,
67
- },
68
- "privacy-data": {
69
- name: "Privacy & Data Protection",
70
- description:
71
- "Evaluates risks to personal privacy, data security, and unauthorized access to or misuse of sensitive personal information.",
72
- type: "risk" as const,
73
- },
74
- "bias-fairness": {
75
- name: "Bias & Fairness",
76
- description:
77
- "Assesses risks of discriminatory outcomes, unfair treatment of different groups, and perpetuation of societal biases and inequalities.",
78
- type: "risk" as const,
79
- },
80
- "security-robustness": {
81
- name: "Security & Robustness",
82
- description:
83
- "Evaluates vulnerabilities to adversarial attacks, system manipulation, and failure modes that could compromise system integrity and reliability.",
84
- type: "risk" as const,
85
- },
86
- "dangerous-capabilities": {
87
- name: "Dangerous Capabilities & Misuse",
88
- description:
89
- "Assesses risks from capabilities that could be misused for harmful purposes, including dual-use applications and potential for weaponization.",
90
- type: "risk" as const,
91
- },
92
- "human-ai-interaction": {
93
- name: "Human-AI Interaction Risks",
94
- description:
95
- "Evaluates risks arising from human-AI interaction patterns, including over-reliance, manipulation, and degradation of human skills and autonomy.",
96
- type: "risk" as const,
97
- },
98
- "environmental-impact": {
99
- name: "Environmental & Resource Impact",
100
- description:
101
- "Evaluates environmental costs of AI development and deployment, including energy consumption, carbon footprint, and resource utilization.",
102
- type: "risk" as const,
103
- },
104
- "economic-displacement": {
105
- name: "Economic & Labor Displacement",
106
- description:
107
- "Evaluates potential economic disruption, job displacement, and impacts on labor markets and economic inequality from AI deployment.",
108
- type: "risk" as const,
109
- },
110
- "governance-accountability": {
111
- name: "Governance & Accountability",
112
- description:
113
- "Assesses risks related to lack of oversight, unclear responsibility structures, and insufficient governance mechanisms for AI systems.",
114
- type: "risk" as const,
115
- },
116
- "value-chain": {
117
- name: "Value Chain & Supply Chain Risks",
118
- description:
119
- "Evaluates risks throughout the AI development and deployment pipeline, including data sourcing, model training, and third-party dependencies.",
120
- type: "risk" as const,
121
- },
122
- }
123
-
124
- export const SOURCE_TYPES = {
125
- internal: {
126
- label: "Internal",
127
- description:
128
- "Evaluations conducted by the organization developing or deploying the AI system using internal resources, teams, and methodologies.",
129
- },
130
- external: {
131
- label: "External",
132
- description:
133
- "Independent evaluations conducted by third-party organizations, academic institutions, or external auditors without direct involvement from the developing organization.",
134
- },
135
- cooperative: {
136
- label: "Cooperative",
137
- description:
138
- "Collaborative evaluations involving multiple stakeholders, including the developing organization, external experts, affected communities, and regulatory bodies working together.",
139
- },
140
- } as const
141
-
142
- export const BENCHMARK_QUESTIONS = [
143
- {
144
- id: "A1",
145
- text: "Has the system been run on recognized, category-specific benchmarks?",
146
- tooltip:
147
- "Expect: Benchmark/dataset names & versions, task variants, metric definitions, who ran them (internal/external).",
148
- customFields: [],
149
- },
150
- {
151
- id: "A2",
152
- text: "Does the system meet pre-set quantitative thresholds for acceptable performance under applicable regulations?",
153
- tooltip:
154
- "Expect: Numeric scores vs. regulatory/compliance thresholds (e.g., hiring fairness, medical accuracy), source of regulatory requirements, compliance determination.",
155
- customFields: ["thresholds", "regulatorySource", "complianceStatus"],
156
- },
157
- {
158
- id: "A3",
159
- text: "How does performance compare to baselines, SOTA, previous versions, and other comparable systems?",
160
- tooltip:
161
- "Expect: Side-by-side comparisons with SOTA models, previous versions, and similar systems under matched conditions, significance tests or confidence intervals for deltas.",
162
- customFields: ["comparativeScores", "comparisonTargets", "significance"],
163
- },
164
- {
165
- id: "A4",
166
- text: "How does the system perform under adversarial inputs, extreme loads, distribution shift?",
167
- tooltip: "Expect: Test types (attack/shift/load), rates of failure/degradation, robustness metrics.",
168
- customFields: ["testTypes", "failureRates", "robustnessMetrics"],
169
- },
170
- {
171
- id: "A5",
172
- text: "Is performance measured in the wild with automated monitors?",
173
- tooltip: "Expect: Live metrics tracked (e.g., error rates, drift, latency), sampling cadence, alert thresholds.",
174
- customFields: ["liveMetrics", "samplingCadence", "alertThresholds"],
175
- },
176
- {
177
- id: "A6",
178
- text: "Have you quantified train–test overlap or leakage risks that could inflate results?",
179
- tooltip:
180
- "Expect: Procedure (e.g., n-gram/fuzzy overlap, URL hashing), contamination rate estimates, mitigations taken.",
181
- customFields: ["procedure", "contaminationRate", "mitigations"],
182
- },
183
- ]
184
-
185
- export const PROCESS_QUESTIONS = [
186
- {
187
- id: "B1",
188
- text: "What capability/risk claims is this category evaluating and why it's applicable?",
189
- tooltip: "Expect: Clear scope, success/failure definitions, hypotheses the evaluation is testing.",
190
- customFields: ["scope", "successFailureDefinitions", "hypotheses"],
191
- },
192
- {
193
- id: "B2",
194
- text: "Can others reproduce the results?",
195
- tooltip:
196
- "Expect: Public or access-controlled release of code/configs, prompts, seeds, decoding settings, dataset IDs/versions, hardware notes; if not shareable, documented proxies.",
197
- customFields: ["replicationPackage", "accessLevel", "proxies"],
198
- },
199
- {
200
- id: "B3",
201
- text: "Have domain experts/affected users reviewed interpretations of results?",
202
- tooltip: "Expect: Who reviewed, what feedback changed, unresolved disagreements and rationale.",
203
- customFields: ["reviewers", "feedbackChanges", "disagreements"],
204
- },
205
- {
206
- id: "B4",
207
- text: "Do figures communicate results without distortion and with uncertainty/context?",
208
- tooltip:
209
- "Expect: Uncertainty shown (CI/SE, multi-seed variance), full/consistent axes, sample sizes, like-for-like comparisons, raw tables available, disclosure of selection criteria.",
210
- customFields: ["uncertaintyDisclosure", "axesConsistency", "sampleSizes", "selectionCriteria"],
211
- },
212
- {
213
- id: "B5",
214
- text: "Standards & Compliance Alignment - Are evaluation practices aligned with relevant organizational, industry, or regulatory standards?",
215
- tooltip:
216
- "Expect: References to applicable standards/regulations, mapping of evaluation practices to those standards, any gaps or exemptions noted, and plan to address misalignment.",
217
- customFields: ["standardsReferenced", "alignmentSummary"],
218
- },
219
- {
220
- id: "B6",
221
- text: "Is there a process to re-run/adapt evals as models, data, or risks change, including mitigation and retest procedures?",
222
- tooltip:
223
- "Expect: Triggers (model updates, drift, incidents), versioned eval specs, scheduled re-assessment cadence, audit trail of changes, mitigation protocols when issues are found, and systematic retest procedures after fixes.",
224
- customFields: ["triggers", "versionedSpecs", "auditTrail", "mitigationProtocols", "retestProcedures"],
225
- },
226
- ]
227
-
228
- export const ADDITIONAL_ASPECTS_SECTION = {
229
- id: "C",
230
- title: "Additional Evaluation Aspects",
231
- description:
232
- "Document any other evaluation aspects for this category that may not have been captured by the structured questions above. This section will not be scored but will be visible in the final documentation.",
233
- }
234
-
235
- export const CATEGORY_DETAILED_GUIDANCE = {
236
- "language-communication": `Key Benchmarks to Look For:
237
- General: MMLU, HellaSwag, ARC, WinoGrande
238
- Reading Comprehension: SQuAD, QuAC, CoQA
239
- Language Generation: BLEU, ROUGE, BERTScore
240
- Multilingual: XTREME, XGLUE, mBERT evaluation
241
- Reasoning: GSM8K, BBH (BIG-Bench Hard)
242
- Instruction Following: Alpaca Eval, MT-Bench
243
-
244
- Evaluation Focus:
245
- • Semantic understanding across languages
246
- • Text generation quality and coherence
247
- • Reasoning and logical inference
248
- • Context retention in long conversations
249
- • Factual accuracy and knowledge recall
250
-
251
- Common Risk Areas:
252
- • Hallucination and misinformation generation
253
- • Bias in language generation
254
- • Inconsistent performance across languages`,
255
-
256
- "social-intelligence": `Key Benchmarks to Look For:
257
- Theory of Mind: ToMi, FaINoM, SOTOPIA
258
- Emotional Intelligence: EmoBench, EQBench
259
- Social Reasoning: Social IQa, CommonsenseQA
260
- Dialogue: PersonaChat, BlendedSkillTalk
261
- Psychology: Psychometrics Benchmark for LLMs
262
-
263
- Evaluation Focus:
264
- • Understanding social cues and context
265
- • Appropriate emotional responses
266
- • Maintaining consistent personality
267
- • Theory of mind reasoning
268
- • Cultural sensitivity and awareness
269
-
270
- Common Risk Areas:
271
- • Inappropriate anthropomorphization
272
- • Cultural bias and insensitivity
273
- • Lack of emotional regulation
274
- • Manipulation potential`,
275
-
276
- "problem-solving": `Key Benchmarks to Look For:
277
- Mathematical: GSM8K, MATH, FrontierMath, AIME
278
- Logical Reasoning: LogiQA, ReClor, FOLIO
279
- Programming: HumanEval, MBPP, SWE-bench
280
- Scientific: SciQ, ScienceQA
281
- Multi-step: StrategyQA, DROP, QuALITY
282
-
283
- Evaluation Focus:
284
- • Multi-step reasoning capability
285
- • Mathematical and logical problem solving
286
- • Code generation and debugging
287
- • Scientific and analytical thinking
288
- • Planning and strategy development
289
-
290
- Common Risk Areas:
291
- • Reasoning errors in complex problems
292
- • Inconsistent problem-solving approaches
293
- • Inability to show work or explain reasoning`,
294
-
295
- "creativity-innovation": `Key Benchmarks to Look For:
296
- Creative Writing: CREAM, Creative Story Generation
297
- Visual Creativity: FIQ (Figural Interpretation Quest)
298
- Alternative Uses: AUT (Alternative Uses Task)
299
- Artistic Generation: Aesthetic and originality scoring
300
- Innovation: Novel solution generation tasks
301
-
302
- Evaluation Focus:
303
- • Originality and novelty of outputs
304
- • Artistic and creative quality
305
- • Ability to combine concepts innovatively
306
- • Divergent thinking capabilities
307
- • Value and usefulness of creative outputs
308
-
309
- Common Risk Areas:
310
- • Copyright and IP infringement
311
- • Lack of genuine creativity vs. recombination
312
- • Inappropriate or harmful creative content`,
313
-
314
- "learning-memory": `Key Benchmarks to Look For:
315
- Few-shot Learning: Omniglot, miniImageNet, Meta-Dataset
316
- Transfer Learning: VTAB, BigTransfer
317
- In-context Learning: ICL benchmarks across domains
318
- Knowledge Retention: Long-term memory tests
319
- Continual Learning: CORe50, Split-CIFAR
320
-
321
- Evaluation Focus:
322
- • Few-shot and zero-shot learning ability
323
- • Knowledge transfer across domains
324
- • Memory retention and recall
325
- • Adaptation to new tasks
326
- • Learning efficiency and speed
327
-
328
- Common Risk Areas:
329
- • Catastrophic forgetting
330
- • Overfitting to limited examples
331
- • Inability to generalize learned concepts`,
332
-
333
- "perception-vision": `Key Benchmarks to Look For:
334
- Object Recognition: ImageNet, COCO, Open Images
335
- Scene Understanding: ADE20K, Cityscapes
336
- Robustness: ImageNet-C, ImageNet-A
337
- Multimodal: VQA, CLIP benchmarks
338
- 3D Understanding: NYU Depth, KITTI
339
-
340
- Evaluation Focus:
341
- • Object detection and classification
342
- • Scene understanding and segmentation
343
- • Robustness to visual variations
344
- • Integration with language understanding
345
- • Real-world deployment performance
346
-
347
- Common Risk Areas:
348
- • Adversarial vulnerability
349
- • Bias in image recognition
350
- • Poor performance on edge cases`,
351
-
352
- "physical-manipulation": `Key Benchmarks to Look For:
353
- Grasping: YCB Object Set, Functional Grasping
354
- Manipulation: RoboCAS, FMB (Functional Manipulation)
355
- Assembly: NIST Assembly Task Boards
356
- Navigation: Habitat, AI2-THOR challenges
357
- Dexterity: Dexterous manipulation benchmarks
358
-
359
- Evaluation Focus:
360
- • Grasping and manipulation accuracy
361
- • Adaptability to object variations
362
- • Force control and delicate handling
363
- • Spatial reasoning and planning
364
- • Real-world deployment robustness
365
-
366
- Common Risk Areas:
367
- • Safety in human environments
368
- • Damage to objects or environment
369
- • Inconsistent performance across conditions`,
370
-
371
- metacognition: `Key Benchmarks to Look For:
372
- Confidence Calibration: Calibration metrics, ECE
373
- Uncertainty Quantification: UQ benchmarks
374
- Self-Assessment: Metacognitive accuracy tests
375
- Know-Unknown: Known Unknowns benchmarks
376
- Error Detection: Self-correction capabilities
377
-
378
- Evaluation Focus:
379
- • Confidence calibration accuracy
380
- • Uncertainty expression and quantification
381
- • Self-monitoring and error detection
382
- • Knowledge boundary awareness
383
- • Adaptive reasoning based on confidence
384
-
385
- Common Risk Areas:
386
- • Overconfidence in incorrect responses
387
- • Poor uncertainty quantification
388
- • Inability to recognize knowledge limits`,
389
-
390
- "robotic-intelligence": `Key Benchmarks to Look For:
391
- Integrated Tasks: RoboCup, DARPA challenges
392
- Navigation: Habitat challenges, real-world navigation
393
- Manipulation: Integrated pick-and-place scenarios
394
- Human-Robot Interaction: HRI benchmarks
395
- Autonomy: Long-horizon task completion
396
-
397
- Evaluation Focus:
398
- • Integrated sensorimotor capabilities
399
- • Autonomous decision-making
400
- • Adaptability to dynamic environments
401
- • Human-robot collaboration
402
- • Long-term task execution
403
-
404
- Common Risk Areas:
405
- • Safety in unstructured environments
406
- • Unpredictable autonomous behavior
407
- • Failure to handle edge cases`,
408
-
409
- "harmful-content": `Key Evaluations to Look For:
410
- Safety Benchmarks: AIR-Bench, MLCommons AI Safety
411
- Red Teaming: Anthropic Constitutional AI, HarmBench
412
- Content Filtering: Jigsaw Toxic Comments, HASOC
413
- Adversarial: Jailbreaking attempts, prompt injection
414
- Regulatory: NIST AI RMF compliance
415
-
416
- Evaluation Focus:
417
- • Refusal to generate harmful content
418
- • Robustness against adversarial prompts
419
- • Content filtering effectiveness
420
- • Detection of subtle harmful content
421
- • Consistency across different prompt styles
422
-
423
- Critical Risk Areas:
424
- • Violence and self-harm content
425
- • Hate speech and discrimination
426
- • Illegal activity instructions
427
- • NSFW and inappropriate content`,
428
-
429
- "information-integrity": `Key Evaluations to Look For:
430
- Factuality: TruthfulQA, FEVER, HaluEval
431
- Hallucination Detection: SelfCheckGPT, FActScore
432
- Misinformation: LIAR dataset, fake news detection
433
- Citation Accuracy: Citation verification benchmarks
434
- Source Attribution: Provenance tracking tests
435
-
436
- Evaluation Focus:
437
- • Factual accuracy of generated content
438
- • Hallucination rate and detection
439
- • Proper source attribution
440
- • Misinformation resistance
441
- • Consistency across related queries
442
-
443
- Critical Risk Areas:
444
- • Medical misinformation
445
- • Political disinformation
446
- • False historical claims
447
- • Fabricated citations`,
448
-
449
- "privacy-data": `Key Evaluations to Look For:
450
- Membership Inference: MIA benchmarks, CopyMark
451
- Data Extraction: Training data extraction tests
452
- PII Detection: Personal information leakage tests
453
- Anonymization: De-identification benchmarks
454
- GDPR Compliance: Right to be forgotten tests
455
-
456
- Evaluation Focus:
457
- • Training data memorization
458
- • PII leakage prevention
459
- • Membership inference resistance
460
- • Data anonymization effectiveness
461
- • Compliance with privacy regulations
462
-
463
- Critical Risk Areas:
464
- • Personal information exposure
465
- • Training data memorization
466
- • Inference of sensitive attributes
467
- • Non-consensual data use`,
468
-
469
- "bias-fairness": `Key Evaluations to Look For:
470
- Bias Benchmarks: Winogender, CrowS-Pairs, BOLD
471
- Fairness Metrics: AI Fairness 360, Fairlearn
472
- Demographic Bias: Representation across groups
473
- Intersectional: Multi-dimensional bias analysis
474
- Allocative Fairness: Resource distribution equity
475
-
476
- Evaluation Focus:
477
- • Demographic representation fairness
478
- • Performance equity across groups
479
- • Intersectional bias analysis
480
- • Harmful stereotype perpetuation
481
- • Allocative fairness in decisions
482
-
483
- Critical Risk Areas:
484
- • Employment discrimination
485
- • Healthcare disparities
486
- • Educational bias
487
- • Criminal justice bias`,
488
-
489
- "security-robustness": `Key Evaluations to Look For:
490
- Adversarial Robustness: AdvBench, RobustBench
491
- Prompt Injection: AgentDojo, prompt injection tests
492
- Model Extraction: Model theft resistance
493
- Backdoor Detection: Trojaned model detection
494
- OWASP LLM Top 10: Security vulnerability assessment
495
-
496
- Evaluation Focus:
497
- • Adversarial attack resistance
498
- • Prompt injection robustness
499
- • Model extraction protection
500
- • Backdoor and trojan detection
501
- • Input validation effectiveness
502
-
503
- Critical Risk Areas:
504
- • Prompt injection attacks
505
- • Model theft and extraction
506
- • Adversarial examples
507
- • Supply chain attacks`,
508
-
509
- "dangerous-capabilities": `Key Evaluations to Look For:
510
- CBRN Assessment: WMD information evaluation
511
- Dual-Use: Misuse potential analysis
512
- Cyber Capabilities: Offensive cyber evaluation
513
- Weapons Information: Dangerous instruction filtering
514
- Government Protocols: AISI, NIST evaluation standards
515
-
516
- Evaluation Focus:
517
- • CBRN information filtering
518
- • Dual-use technology assessment
519
- • Offensive capability evaluation
520
- • Dangerous instruction refusal
521
- • Misuse potential quantification
522
-
523
- Critical Risk Areas:
524
- • Chemical/biological weapons info
525
- • Cyber attack capabilities
526
- • Physical harm instructions
527
- • Illegal activity facilitation`,
528
-
529
- "human-ai-interaction": `Key Evaluations to Look For:
530
- Trust Calibration: Trust-LLM, reliance calibration metrics
531
- Manipulation Detection: Emotional manipulation detection benchmarks
532
- Anthropomorphism: Human-likeness perception studies
533
- Safety in Dialogue: HAX, RealToxicityPrompts
534
- User Guidance: Task adherence and guidance clarity tests
535
-
536
- Evaluation Focus:
537
- • Preventing over-reliance on AI
538
- • Avoiding deceptive or manipulative responses
539
- • Maintaining transparency about capabilities and limitations
540
- • Providing safe, non-coercive interactions
541
- • Ensuring user agency and decision-making control
542
-
543
- Critical Risk Areas:
544
- • Emotional manipulation
545
- • Excessive trust leading to poor decisions
546
- • Misrepresentation of capabilities
547
- • Encouraging harmful behaviors`,
548
-
549
- "environmental-impact": `Key Evaluations to Look For:
550
- Energy Usage: Carbon footprint estimation tools
551
- Sustainability Metrics: Green AI benchmarks
552
- Model Efficiency: Inference cost evaluations
553
- Hardware Utilization: Resource optimization tests
554
- Lifecycle Assessment: Full training-to-deployment impact analysis
555
-
556
- Evaluation Focus:
557
- • Measuring carbon footprint and energy use
558
- • Optimizing for efficiency without performance loss
559
- • Assessing environmental trade-offs
560
- • Promoting sustainable deployment strategies
561
-
562
- Critical Risk Areas:
563
- • High carbon emissions from training
564
- • Excessive energy use in inference
565
- • Lack of transparency in environmental reporting`,
566
-
567
- "economic-displacement": `Key Evaluations to Look For:
568
- Job Impact Studies: Task automation potential assessments
569
- Market Disruption: Industry-specific displacement projections
570
- Economic Modeling: Macro and microeconomic simulations
571
- Skill Shift Analysis: Required workforce retraining benchmarks
572
- Societal Impact: Equitable distribution of economic benefits
573
-
574
- Evaluation Focus:
575
- • Predicting job displacement risks
576
- • Identifying emerging job opportunities
577
- • Understanding shifts in skill demand
578
- • Balancing automation benefits with societal costs
579
-
580
- Critical Risk Areas:
581
- • Large-scale unemployment
582
- • Wage suppression
583
- • Economic inequality`,
584
-
585
- "governance-accountability": `Key Evaluations to Look For:
586
- Transparency: Model card completeness, datasheet reporting
587
- Auditability: Traceability of decisions
588
- Oversight Mechanisms: Compliance with governance frameworks
589
- Responsibility Assignment: Clear chain of accountability
590
- Standards Compliance: ISO, IEEE AI standards adherence
591
-
592
- Evaluation Focus:
593
- • Establishing clear accountability
594
- • Ensuring decision traceability
595
- • Meeting compliance and ethical guidelines
596
- • Maintaining transparency across lifecycle
597
-
598
- Critical Risk Areas:
599
- • Lack of oversight
600
- • Unclear responsibility in failures
601
- • Insufficient transparency`,
602
-
603
- "value-chain": `Key Evaluations to Look For:
604
- Provenance Tracking: Dataset and component origin verification
605
- Third-Party Risk Assessment: Vendor dependency evaluations
606
- Supply Chain Security: Software and hardware integrity checks
607
- Integration Testing: Risk assessment in system integration
608
- Traceability: End-to-end component documentation
609
-
610
- Evaluation Focus:
611
- • Managing third-party dependencies
612
- • Verifying component provenance
613
- • Securing the supply chain
614
- • Mitigating integration risks
615
-
616
- Critical Risk Areas:
617
- • Compromised third-party components
618
- • Data provenance issues
619
- • Vendor lock-in and dependency risks`,
620
- }
621
-
622
- export const CATEGORIES = Object.entries(CATEGORY_DESCRIPTIONS).map(([id, data]) => ({
623
- id,
624
- ...data,
625
- detailedGuidance: CATEGORY_DETAILED_GUIDANCE[id as keyof typeof CATEGORY_DETAILED_GUIDANCE] || "",
626
- }))
627
-
628
- // Centralized hint mappings and recommended placeholders used by the UI.
629
- export const CATEGORY_HINTS: Record<string, { benchmark: string; process: string }> = {
630
- "language-communication": {
631
- benchmark:
632
- "Hint: mention benchmarks for language understanding/generation, prompt settings, multilingual splits, and whether factuality checks were performed.",
633
- process:
634
- "Hint: note consulted linguists or annotators, dataset provenance concerns, and any applicable content/regulatory considerations.",
635
- },
636
- "social-intelligence": {
637
- benchmark: "Hint: mention emotion/social reasoning benchmarks used, annotator protocols, and demographic coverage.",
638
- process: "Hint: list consulted domain experts (psychologists, sociologists), user study details, and consent/ethics notes.",
639
- },
640
- "problem-solving": {
641
- benchmark: "Hint: list math/programming/reasoning benchmarks, scoring rules, and seed/temperature settings.",
642
- process: "Hint: note expert reviewers, validation of solutions, and how ambiguous answers were adjudicated.",
643
- },
644
- "creativity-innovation": {
645
- benchmark: "Hint: mention creative evaluation setups, human rating protocols, and diversity of prompts/tasks.",
646
- process: "Hint: note creative experts or juries consulted, copyright/IP checks, and content filtering policies.",
647
- },
648
- "learning-memory": {
649
- benchmark: "Hint: indicate few-shot/transfer benchmarks, replay/continual learning setups, and sample sizes.",
650
- process: "Hint: describe retention tests, dataset refresh cadence, and any contamination checks performed.",
651
- },
652
- "perception-vision": {
653
- benchmark: "Hint: list vision datasets, augmentation/robustness tests, and evaluation resolutions/settings.",
654
- process: "Hint: note labelling protocols, demographic coverage of imagery, and reviewer/ethical considerations.",
655
- },
656
- "physical-manipulation": {
657
- benchmark: "Hint: mention robotics tasks, real/sim evaluation conditions, and safety/collision metrics.",
658
- process: "Hint: include safety review notes, field test observers, and incident mitigation procedures.",
659
- },
660
- "metacognition": {
661
- benchmark: "Hint: report calibration metrics, uncertainty quantification methods, and multi-seed variance.",
662
- process: "Hint: list reviewers who evaluated uncertainty reporting and any user-facing confidence disclosures.",
663
- },
664
- "robotic-intelligence": {
665
- benchmark: "Hint: note integrated task suites, sim-to-real gaps, and hardware/configuration details.",
666
- process: "Hint: document safety reviews, human-in-the-loop safeguards, and autonomy limits.",
667
- },
668
- "harmful-content": {
669
- benchmark: "Hint: describe toxicity/harm benchmarks, prompt hardening, and red-team scenarios used.",
670
- process: "Hint: list safety reviewers, incident response plans, and content moderation policies referenced.",
671
- },
672
- "information-integrity": {
673
- benchmark: "Hint: mention fact-checking datasets, prompt calibrations, and hallucination detection metrics.",
674
- process: "Hint: note expert fact-checkers consulted, provenance practices, and external audit reports.",
675
- },
676
- "privacy-data": {
677
- benchmark: "Hint: include privacy tests, membership inference/MI defenses, and redaction results.",
678
- process: "Hint: list privacy officers consulted, data handling policies, and any regulatory mappings (e.g., GDPR).",
679
- },
680
- "bias-fairness": {
681
- benchmark: "Hint: indicate fairness metrics, subgroup breakdowns, and statistical significance of gaps.",
682
- process: "Hint: document which stakeholder groups and domain experts were engaged and mitigation steps taken.",
683
- },
684
- "security-robustness": {
685
- benchmark: "Hint: report adversarial tests, perturbation strengths, and failure rates under attack.",
686
- process: "Hint: include red-team summaries, security reviewers, and incident response procedures.",
687
- },
688
- "dangerous-capabilities": {
689
- benchmark: "Hint: describe tests for dual-use behaviors and misuse scenarios evaluated.",
690
- process: "Hint: note external safety reviews, legal counsel input, and controls/mitigations in place.",
691
- },
692
- "human-ai-interaction": {
693
- benchmark: "Hint: list usability/UX tasks, user study protocols, and measures of over-reliance or deception.",
694
- process: "Hint: capture which user groups were involved, consent procedures, and human factors reviewers.",
695
- },
696
- "environmental-impact": {
697
- benchmark: "Hint: report energy/perf tradeoff tests, FLOPs/throughput, and measured carbon estimates.",
698
- process: "Hint: include sustainability reviewers, lifecycle assessment notes, and mitigation plans.",
699
- },
700
- "economic-displacement": {
701
- benchmark: "Hint: mention labor-impact scenarios evaluated and economic modeling assumptions used.",
702
- process: "Hint: document stakeholder consultations, affected worker groups engaged, and mitigation strategies.",
703
- },
704
- "governance-accountability": {
705
- benchmark: "Hint: N/A for benchmarking; focus on process evidence instead.",
706
- process: "Hint: cite governance frameworks used, responsible owners, and escalation/audit trails.",
707
- },
708
- "value-chain": {
709
- benchmark: "Hint: include supply-chain dependency tests, third-party component assessments if applicable.",
710
- process: "Hint: note vendor audits, data sourcing reviews, and contractual safeguards.",
711
- },
712
- }
713
-
714
- export const CATEGORY_QUESTION_HINTS: Record<
715
- string,
716
- Record<string, { benchmark?: string; process?: string }>
717
- > = {
718
- "language-communication": {
719
- A1: { benchmark: "List exact language benchmarks, dataset versions, prompt templates, split (train/val/test), and evaluation conditions." },
720
- A2: { benchmark: "State numeric thresholds and which regulatory or domain thresholds apply (e.g., accuracy, FPR/FNR targets)." },
721
- A3: { benchmark: "Provide side-by-side comparisons vs. baselines/SOTA, significance tests, and matched prompt/hyperparams." },
722
- A4: { benchmark: "Describe adversarial or distribution-shift tests (prompt perturbations, paraphrase attacks) and failure rates." },
723
- A5: { benchmark: "Explain live monitoring metrics (latency, error rate, hallucination rate), sampling cadence, and alert rules." },
724
- A6: { benchmark: "Document overlap checks (n‑gram, URL hashing), contamination rates, and mitigation steps taken." },
725
- B1: { process: "Define scope, claims being evaluated, success criteria (e.g., BLEU/F1 cutoffs), and evaluation hypotheses." },
726
- B2: { process: "List reproducibility artifacts (code, prompts, seeds), availability level, and proxies if materials are restricted." },
727
- B3: { process: "Name reviewers (linguists, annotators), review protocol, and how feedback was incorporated or adjudicated." },
728
- B4: { process: "Show how figures present uncertainty (CI, SE), axes choices, sample sizes, and raw tables for transparency." },
729
- B5: { process: "Reference any applicable standards (e.g., ISO, domain regs), mapping to practices, and noted gaps." },
730
- B6: { process: "Describe re-eval triggers (model updates, drift), versioned specs, audit trails, and retest procedures." },
731
- },
732
-
733
- "social-intelligence": {
734
- A1: { benchmark: "Cite emotion/social reasoning datasets, demographic breakdowns, and versioned splits used." },
735
- A2: { benchmark: "Specify thresholds for social safety metrics or fairness targets and how they were derived." },
736
- A3: { benchmark: "Compare against human baselines and prior models; include inter-rater agreement for subjective tasks." },
737
- A4: { benchmark: "Report robustness to adversarial roleplays, toxic prompts, or context manipulation and failure patterns." },
738
- A5: { benchmark: "Document in-field monitoring of social interactions, escalations, and rates of inappropriate responses." },
739
- A6: { benchmark: "Show contamination checks for dialogue datasets and steps to remove sensitive or toxic examples." },
740
- B1: { process: "Explain the claim scope (e.g., empathy, intent detection) and how applicability was determined." },
741
- B2: { process: "Provide reproduction artifacts or explain why dialogue data/prompts cannot be shared and offer proxies." },
742
- B3: { process: "List domain experts (psychologists, sociologists), study protocols, consent, and key feedback items." },
743
- B4: { process: "Ensure visualizations include uncertainty in subjective ratings and avoid misleading aggregations." },
744
- B5: { process: "Map evaluation to relevant ethical or safety standards and note any compliance gaps." },
745
- B6: { process: "Describe monitoring cadence for social harms, incident playbooks, and retest triggers after fixes." },
746
- },
747
-
748
- "problem-solving": {
749
- A1: { benchmark: "List math/reasoning/code benchmarks used (GSM8K, MATH, HumanEval) and configuration details." },
750
- A2: { benchmark: "State numeric performance thresholds (e.g., pass rates) and how they map to acceptance criteria." },
751
- A3: { benchmark: "Provide baselines, previous-version comparisons, and statistical tests for score deltas." },
752
- A4: { benchmark: "Describe stress tests (noisy inputs, truncated context) and observed degradation rates." },
753
- A5: { benchmark: "Note any online monitoring of problem-solving failures and metrics for automated quality checks." },
754
- A6: { benchmark: "Document train/test overlap with known solution sources and contamination mitigation steps." },
755
- B1: { process: "Clarify which problem-solving claims are evaluated and define success/failure concretely." },
756
- B2: { process: "Provide replication packages (notebooks, seeds) or explain access restrictions and proxies." },
757
- B3: { process: "Record expert reviewers (domain experts, graders), rubric instructions, and adjudication rules." },
758
- B4: { process: "Include uncertainty (multi-seed variance), example failures, and full result tables for transparency." },
759
- B5: { process: "Note alignment with domain standards (education, clinical) and list any exemptions or gaps." },
760
- B6: { process: "Describe scheduled re-evals after model updates and procedures for retesting failing cases." },
761
- },
762
-
763
- // (other category question hints omitted for brevity - full set can be expanded later)
764
- }
765
-
766
- export const RECOMMENDED_BENCHMARKS: Record<string, string> = {
767
- "language-communication": "e.g., MMLU, BBH, SuperGLUE",
768
- "social-intelligence": "e.g., SocialIQA, EmoBench, PersonaChat (human-eval)",
769
- "problem-solving": "e.g., GSM8K, MATH, HumanEval",
770
- "creativity-innovation": "e.g., human preference studies, CREAM (human-eval)",
771
- "learning-memory": "e.g., few-shot transfer suites, continual-learning benchmarks",
772
- "perception-vision": "e.g., ImageNet, COCO, VQA",
773
- "physical-manipulation": "e.g., RoboSuite, YCB benchmarks, real/sim task suites",
774
- "metacognition": "e.g., calibration datasets (ECE), uncertainty benchmarks",
775
- "robotic-intelligence": "e.g., Habitat, AI2-THOR, DARPA challenge tasks",
776
- "harmful-content": "e.g., toxicity/harm benchmarks like ToxicBERT evals, red-team suites",
777
- "information-integrity": "e.g., FEVER, fact-checking datasets, hallucination benchmarks",
778
- "privacy-data": "e.g., membership-inference tests, MI challenge datasets",
779
- "bias-fairness": "e.g., fairness benchmark suites (subgroup metrics), demographic breakdown tests",
780
- "security-robustness": "e.g., adversarial robustness suites, attack-replay benchmarks",
781
- "dangerous-capabilities": "e.g., dual-use/red-team evaluation suites (internal or published)",
782
- "human-ai-interaction": "e.g., user-study protocols, SUS, human preference tests",
783
- "environmental-impact": "e.g., FLOPs/energy measurement reports, carbon accounting tests",
784
- "economic-displacement": "e.g., scenario/projection models, labor-impact analyses",
785
- "governance-accountability": "e.g., audit logs, governance checklists (process evidence)",
786
- "value-chain": "e.g., third-party audit reports, supply-chain assessments",
787
- }
788
-
789
- export const RECOMMENDED_METRICS: Record<string, string> = {
790
- "language-communication": "e.g., accuracy, F1, BLEU, ROUGE, BERTScore",
791
- "social-intelligence": "e.g., human rating scores, agreement rates, F1 for intent detection",
792
- "problem-solving": "e.g., exact-match, pass@k, accuracy, solution correctness percentage",
793
- "creativity-innovation": "e.g., human preference %, novelty/diversity scores",
794
- "learning-memory": "e.g., few-shot accuracy, retention rate, forgetting metric",
795
- "perception-vision": "e.g., mAP, IoU, top-1/top-5 accuracy",
796
- "physical-manipulation": "e.g., success rate, collision rate, completion time",
797
- "metacognition": "e.g., ECE, calibration error, confidence-accuracy correlation",
798
- "robotic-intelligence": "e.g., task success rate, path efficiency, failure modes count",
799
- "harmful-content": "e.g., toxicity rate, harmful-response rate, false negative rate for filters",
800
- "information-integrity": "e.g., precision/recall of fact-checking, citation accuracy",
801
- "privacy-data": "e.g., membership inference advantage, reconstruction error rates",
802
- "bias-fairness": "e.g., subgroup parity gaps, disparate impact ratios, statistical significance",
803
- "security-robustness": "e.g., attack success rate, robustness delta under perturbation",
804
- "dangerous-capabilities": "e.g., misuse rate under red-team prompts, severity counts",
805
- "human-ai-interaction": "e.g., SUS, task completion rate, user satisfaction scores",
806
- "environmental-impact": "e.g., energy per inference, carbon per training run",
807
- "economic-displacement": "e.g., projected job impact metrics, economic sensitivity metrics",
808
- "governance-accountability": "e.g., audit coverage %, policy alignment scoring",
809
- "value-chain": "e.g., vendor risk scores, dependency vulnerability counts",
810
- }
811
-
812
- export const defaultHints = {
813
- benchmark: "Hint: include relevant benchmark settings, scoring rules, and notable limitations.",
814
- process: "Hint: mention reviewers consulted, applicable standards/regulations, and scope limitations.",
815
- }
816
-
817
- export function getFieldPlaceholder(categoryId: string, questionId: string, field: "benchmarkName" | "metrics") {
818
- if (field === "benchmarkName") return RECOMMENDED_BENCHMARKS[categoryId] || "e.g., MMLU, HellaSwag, GSM8K"
819
- return RECOMMENDED_METRICS[categoryId] || "e.g., accuracy, F1, BLEU, perplexity"
820
- }
821
-
822
- export function getHint(categoryId: string, questionId: string, section: "benchmark" | "process") {
823
- const catQ = CATEGORY_QUESTION_HINTS[categoryId]
824
- const qHints = catQ ? catQ[questionId] : undefined
825
- if (qHints && qHints[section]) return qHints[section]
826
- if (CATEGORY_HINTS[categoryId] && CATEGORY_HINTS[categoryId][section]) return CATEGORY_HINTS[categoryId][section]
827
- return defaultHints[section]
828
- }
lib/schema.ts ADDED
@@ -0,0 +1,83 @@
1
+ import schema from '@/schema/evaluation-schema.json'
2
+ import categoryDetails from '@/schema/category-details.json'
3
+ import formHints from '@/schema/form-hints.json'
4
+ import systemInfoSchema from '@/schema/system-info-schema.json'
5
+
6
+ export type Category = {
7
+ id: string
8
+ name: string
9
+ description: string
10
+ type: 'capability' | 'risk'
11
+ detailedGuidance?: string
12
+ }
13
+
14
+ export type Question = {
15
+ id: string
16
+ text: string
17
+ tooltip?: string
18
+ hint?: string
19
+ customFields?: string[]
20
+ }
21
+
22
+ const raw = schema as {
23
+ version: string
24
+ categories: Array<{ id: string; name: string; type: 'capability' | 'risk' }>
25
+ benchmarkQuestions: Question[]
26
+ processQuestions: Question[]
27
+ }
28
+
29
+ // Merge in descriptions from category details
30
+ const data = {
31
+ version: raw.version,
32
+ categories: raw.categories.map((c) => ({
33
+ ...c,
34
+ description: (categoryDetails.categories as any)[c.id]?.description || '',
35
+ detailedGuidance: (categoryDetails.categories as any)[c.id]?.detailedGuidance || ''
36
+ })) as Category[],
37
+ benchmarkQuestions: raw.benchmarkQuestions,
38
+ processQuestions: raw.processQuestions,
39
+ }
40
+
41
+ export function getAllCategories() {
42
+ return data.categories
43
+ }
44
+
45
+ export function getCategoryById(id: string) {
46
+ return data.categories.find((c) => c.id === id)
47
+ }
48
+
49
+ export function getBenchmarkQuestions() {
50
+ return data.benchmarkQuestions
51
+ }
52
+
53
+ export function getProcessQuestions() {
54
+ return data.processQuestions
55
+ }
56
+
57
+ // Export form utilities from schema
58
+ export const SOURCE_TYPES = formHints.sourceTypes
59
+ export const ADDITIONAL_ASPECTS_SECTION = formHints.additionalAspectsSection
60
+
61
+ // Form hint utilities from schema
62
+ export function getFieldPlaceholder(categoryId: string, field: "benchmarkName" | "metrics") {
63
+ if (field === "benchmarkName") return (formHints.recommendedBenchmarks as any)[categoryId] || "e.g., MMLU, HellaSwag, GSM8K"
64
+ return (formHints.recommendedMetrics as any)[categoryId] || "e.g., accuracy, F1, BLEU, perplexity"
65
+ }
66
+
67
+ export function getHint(categoryId: string, questionId: string, section: "benchmark" | "process") {
68
+ const catQ = (formHints.categoryQuestionHints as any)[categoryId]
69
+ const qHints = catQ ? catQ[questionId] : undefined
70
+ if (qHints && qHints[section]) return qHints[section]
71
+ if ((formHints.categoryHints as any)[categoryId] && (formHints.categoryHints as any)[categoryId][section]) return (formHints.categoryHints as any)[categoryId][section]
72
+ return (formHints.defaultHints as any)[section]
73
+ }
74
+
75
+ export function getSystemInfoFormOptions() {
76
+ return systemInfoSchema.formOptions
77
+ }
78
+
79
+ export function getSystemInfoSchema() {
80
+ return systemInfoSchema.systemInfo
81
+ }
82
+
83
+ export default data
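Not part of this commit, but for orientation: a minimal sketch of how a client component might consume the new centralized module. The component name and markup below are hypothetical; only the imported helpers (getAllCategories, getBenchmarkQuestions, getHint, getFieldPlaceholder) come from lib/schema.ts above. getHint resolves in order: the question-specific hint for the category, then the category-level hint, then defaultHints — the same fallback previously embedded in lib/category-data.ts.

// Hypothetical consumer of lib/schema.ts (sketch only, not in this commit)
import { getAllCategories, getBenchmarkQuestions, getHint, getFieldPlaceholder } from "@/lib/schema"

export function CategoryHintPreview({ categoryId }: { categoryId: string }) {
  const category = getAllCategories().find((c) => c.id === categoryId)
  if (!category) return null
  return (
    <ul>
      {getBenchmarkQuestions().map((q) => (
        <li key={q.id}>
          {q.text}
          {/* question-specific hint → category-level hint → default hint */}
          <p>{getHint(categoryId, q.id, "benchmark")}</p>
          <input placeholder={getFieldPlaceholder(categoryId, "benchmarkName")} />
        </li>
      ))}
    </ul>
  )
}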
package.json CHANGED
@@ -53,6 +53,7 @@
53
  "react-day-picker": "9.8.0",
54
  "react-dom": "^19",
55
  "react-hook-form": "^7.60.0",
 
56
  "react-resizable-panels": "^2.1.7",
57
  "recharts": "latest",
58
  "sonner": "^1.7.4",
@@ -68,8 +69,8 @@
68
  "@types/react-dom": "^19",
69
  "postcss": "^8.5",
70
  "tailwindcss": "^4.1.9",
71
- "tw-animate-css": "1.3.3",
72
- "typescript": "^5",
73
- "vitest": "^1.1.5"
74
  }
75
  }
 
53
  "react-day-picker": "9.8.0",
54
  "react-dom": "^19",
55
  "react-hook-form": "^7.60.0",
56
+ "react-is": "^19.1.1",
57
  "react-resizable-panels": "^2.1.7",
58
  "recharts": "latest",
59
  "sonner": "^1.7.4",
 
69
  "@types/react-dom": "^19",
70
  "postcss": "^8.5",
71
  "tailwindcss": "^4.1.9",
72
+ "tw-animate-css": "1.3.3",
73
+ "typescript": "^5",
74
+ "vitest": "^1.1.5"
75
  }
76
  }
pnpm-lock.yaml CHANGED
@@ -137,6 +137,9 @@ importers:
137
  react-hook-form:
138
  specifier: ^7.60.0
139
  version: 7.62.0([email protected])
 
 
 
140
  react-resizable-panels:
141
  specifier: ^2.1.7
142
 
137
  react-hook-form:
138
  specifier: ^7.60.0
139
  version: 7.62.0([email protected])
140
+ react-is:
141
+ specifier: ^19.1.1
142
+ version: 19.1.1
143
  react-resizable-panels:
144
  specifier: ^2.1.7
145
public/evaluations/claude-3-sonnet.json CHANGED
@@ -1,12 +1,17 @@
1
  {
2
  "id": "claude-3-sonnet-2024",
3
  "systemName": "Claude 3.5 Sonnet",
 
4
  "provider": "Anthropic",
5
  "version": "claude-3-5-sonnet-20241022",
6
- "modality": "text-to-text",
7
- "evaluationDate": "2024-01-10",
8
- "deploymentContext": "Production API",
9
- "evaluator": "Anthropic Safety Team",
 
 
 
 
10
  "selectedCategories": [
11
  "language-communication",
12
  "problem-solving",
@@ -33,38 +38,38 @@
33
  "totalApplicable": 20,
34
  "capabilityApplicable": 9,
35
  "riskApplicable": 11,
36
- "completenessScore": 88,
37
  "strongCategories": [
38
  "language-communication",
39
- "social-intelligence",
40
- "problem-solving",
41
  "creativity-innovation",
42
  "harmful-content",
43
- "information-integrity",
44
  "bias-fairness",
45
- "human-ai-interaction"
 
 
46
  ],
47
  "adequateCategories": [
 
48
  "learning-memory",
49
  "perception-vision",
50
  "metacognition",
51
  "privacy-data",
52
- "security-robustness",
53
- "governance-accountability"
54
  ],
55
  "weakCategories": [
56
- "physical-manipulation",
57
- "robotic-intelligence",
58
  "dangerous-capabilities",
59
- "environmental-impact"
 
60
  ],
61
  "insufficientCategories": [
62
- "economic-displacement",
 
63
  "value-chain"
64
  ],
65
  "priorityAreas": [
 
66
  "environmental-impact",
67
- "economic-displacement",
68
  "value-chain"
69
  ]
70
  },
@@ -89,29 +94,29 @@
89
  "benchmarkSources": {
90
  "A1": [
91
  {
92
- "id": "bench-meem78cs-gs27k1",
93
  "url": "https://www.anthropic.com/news/claude-3-family",
94
- "description": "Claude 3.5 Sonnet performance on language understanding benchmarks",
95
  "sourceType": "external",
96
- "benchmarkName": "MMLU, HellaSwag, ARC, WinoGrande",
97
- "metrics": "Accuracy, coherence score",
98
- "score": "88.7% on MMLU, 95.4% on HellaSwag",
99
- "version": "",
100
- "taskVariants": "",
101
  "customFields": {}
102
  }
103
  ],
104
  "A2": [
105
  {
106
- "id": "bench-meem78cs-ntiiq7",
107
  "url": "https://www.anthropic.com/safety",
108
- "description": "Constitutional AI safety evaluation results",
109
  "sourceType": "internal",
110
- "benchmarkName": "",
111
- "metrics": "",
112
- "score": "",
113
- "version": "",
114
- "taskVariants": "",
115
  "customFields": {}
116
  }
117
  ],
 
1
  {
2
  "id": "claude-3-sonnet-2024",
3
  "systemName": "Claude 3.5 Sonnet",
4
+ "url": "https://claude.ai",
5
  "provider": "Anthropic",
6
  "version": "claude-3-5-sonnet-20241022",
7
+ "modelTag": "claude-3-5-sonnet-20241022",
8
+ "knowledgeCutoff": "2024-04-01",
9
+ "modelType": "foundational",
10
+ "inputModalities": ["Text", "Image"],
11
+ "outputModalities": ["Text"],
12
+ "deploymentContexts": ["Public/Consumer-Facing", "Internal/Enterprise Use"],
13
+ "evaluationDate": "2024-11-15",
14
+ "evaluator": "Anthropic Constitutional AI Team",
15
  "selectedCategories": [
16
  "language-communication",
17
  "problem-solving",
 
38
  "totalApplicable": 20,
39
  "capabilityApplicable": 9,
40
  "riskApplicable": 11,
41
+ "completenessScore": 96,
42
  "strongCategories": [
43
  "language-communication",
44
+ "social-intelligence",
 
45
  "creativity-innovation",
46
  "harmful-content",
 
47
  "bias-fairness",
48
+ "information-integrity",
49
+ "human-ai-interaction",
50
+ "governance-accountability"
51
  ],
52
  "adequateCategories": [
53
+ "problem-solving",
54
  "learning-memory",
55
  "perception-vision",
56
  "metacognition",
57
  "privacy-data",
58
+ "security-robustness"
 
59
  ],
60
  "weakCategories": [
 
 
61
  "dangerous-capabilities",
62
+ "environmental-impact",
63
+ "economic-displacement"
64
  ],
65
  "insufficientCategories": [
66
+ "physical-manipulation",
67
+ "robotic-intelligence",
68
  "value-chain"
69
  ],
70
  "priorityAreas": [
71
+ "dangerous-capabilities",
72
  "environmental-impact",
 
73
  "value-chain"
74
  ]
75
  },
 
94
  "benchmarkSources": {
95
  "A1": [
96
  {
97
+ "id": "bench-claude35s-lc-001",
98
  "url": "https://www.anthropic.com/news/claude-3-family",
99
+ "description": "Claude 3.5 Sonnet performance across comprehensive language understanding and generation tasks",
100
  "sourceType": "external",
101
+ "benchmarkName": "MMLU, HellaSwag, ARC-Challenge, WinoGrande, GSM8K",
102
+ "metrics": "Accuracy on reasoning and knowledge tasks",
103
+ "score": "88.7% MMLU, 95.4% HellaSwag, 96.4% ARC-C, 89.0% WinoGrande, 96.4% GSM8K",
104
+ "version": "claude-3-5-sonnet-20241022",
105
+ "taskVariants": "academic-knowledge, commonsense-reasoning, math-reasoning, reading-comprehension",
106
  "customFields": {}
107
  }
108
  ],
109
  "A2": [
110
  {
111
+ "id": "bench-claude35s-lc-002",
112
  "url": "https://www.anthropic.com/safety",
113
+ "description": "Constitutional AI evaluation showing strong safety alignment and helpful refusal patterns",
114
  "sourceType": "internal",
115
+ "benchmarkName": "Anthropic HHH Eval, Constitutional AI Safety Suite",
116
+ "metrics": "Helpfulness, harmlessness, honesty scores",
117
+ "score": "92% helpfulness, 97% harmlessness, 89% honesty",
118
+ "version": "v3.5",
119
+ "taskVariants": "safety-alignment, constitutional-ai, harmlessness",
120
  "customFields": {}
121
  }
122
  ],
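For reference, a rough TypeScript shape of these evaluation fixtures, inferred only from the fields visible in the JSON above; this is a sketch, not an authoritative schema, and the categoryEvaluations sub-structure is abbreviated.

// Sketch inferred from public/evaluations/*.json as shown in this diff (field names as seen above)
type EvaluationFixture = {
  id: string
  systemName: string
  url?: string
  provider: string
  version?: string
  modelTag?: string
  knowledgeCutoff?: string
  modelType?: "foundational" | "fine-tuned"
  inputModalities?: string[]
  outputModalities?: string[]
  deploymentContexts?: string[]
  evaluationDate: string
  evaluator: string
  selectedCategories: string[]
  overallStats?: {
    totalApplicable: number
    capabilityApplicable: number
    riskApplicable: number
    completenessScore?: number
    strongCategories: string[]
    adequateCategories: string[]
    weakCategories: string[]
    insufficientCategories: string[]
    priorityAreas: string[]
  }
  categoryEvaluations: Record<string, {
    benchmarkAnswers: Record<string, string> // e.g., { "A1": "yes", "A2": "N/A" }
    benchmarkSources: Record<string, Array<Record<string, unknown>>>
    processSources?: Record<string, unknown>
    additionalAspects?: string
  }>
}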
public/evaluations/fraud-detector.json CHANGED
@@ -1,10 +1,17 @@
1
  {
2
  "id": "fraud-detector-2024",
3
- "systemName": "FraudDetector AI",
4
- "provider": "FinTech Solutions",
5
- "modality": "text-to-text",
6
- "evaluationDate": "2024-01-05",
7
- "evaluator": "FinTech Risk Team",
 
 
 
 
 
 
 
8
  "selectedCategories": [
9
  "problem-solving",
10
  "bias-fairness",
@@ -19,6 +26,38 @@
19
  "dangerous-capabilities",
20
  "economic-displacement"
21
  ],
 
22
  "categoryEvaluations": {
23
  "problem-solving": {
24
  "benchmarkAnswers": { "A1": "yes", "A2": "N/A", "A3": "yes", "A4": "N/A", "A5": "N/A", "A6": "yes" },
@@ -26,15 +65,29 @@
26
  "benchmarkSources": {
27
  "A1": [
28
  {
29
- "id": "fd-bench-ps-1",
30
- "url": "https://fintech.example.com/reports/frauddetector/problem-solving",
31
- "description": "Internal evaluation on transaction anomaly detection benchmarks",
32
  "sourceType": "internal",
33
- "benchmarkName": "FraudBench-TS, Transaction-QA",
34
- "metrics": "Precision, recall, AUC",
35
- "score": "92% precision",
36
- "version": "1.0",
37
- "taskVariants": "financial-transactions",
38
  "customFields": {}
39
  }
40
  ],
@@ -83,15 +136,15 @@
83
  "benchmarkSources": {
84
  "A1": [
85
  {
86
- "id": "fd-bench-bf-1",
87
- "url": "https://fintech.example.com/reports/fairness",
88
- "description": "Internal fairness audits across demographic and merchant segments",
89
  "sourceType": "internal",
90
- "benchmarkName": "FairAudit v1",
91
- "metrics": "Disparate impact, false positive rate parity",
92
- "score": "Acceptable parity across segments",
93
- "version": "1.0",
94
- "taskVariants": "transactional",
95
  "customFields": {}
96
  }
97
  ]
@@ -361,34 +414,5 @@
361
  "processSources": {},
362
  "additionalAspects": "Not applicable — detailed value-chain and supply-chain risk analysis is beyond the scope of this demo fixture."
363
  }
364
- },
365
- "overallStats": {
366
- "totalApplicable": 12,
367
- "capabilityApplicable": 4,
368
- "riskApplicable": 8,
369
- "strongCategories": [
370
- "problem-solving",
371
- "bias-fairness",
372
- "security-robustness",
373
- "governance-accountability"
374
- ],
375
- "adequateCategories": [
376
- "learning-memory",
377
- "privacy-data",
378
- "human-ai-interaction"
379
- ],
380
- "weakCategories": [
381
- "language-communication",
382
- "harmful-content",
383
- "information-integrity"
384
- ],
385
- "insufficientCategories": [
386
- "dangerous-capabilities",
387
- "economic-displacement"
388
- ],
389
- "priorityAreas": [
390
- "information-integrity",
391
- "harmful-content"
392
- ]
393
  }
394
  }
 
1
  {
2
  "id": "fraud-detector-2024",
3
+ "systemName": "FraudShield AI Pro",
4
+ "url": "https://securebank.com/fraudshield",
5
+ "provider": "SecureBank Technologies",
6
+ "version": "v3.2.1",
7
+ "modelTag": "fraudshield-v3.2.1",
8
+ "knowledgeCutoff": "2024-06-01",
9
+ "modelType": "fine-tuned",
10
+ "inputModalities": ["Tabular"],
11
+ "outputModalities": ["Tabular"],
12
+ "deploymentContexts": ["Internal/Enterprise Use", "High-Risk Applications"],
13
+ "evaluationDate": "2024-07-10",
14
+ "evaluator": "SecureBank AI Risk & Compliance Team",
15
  "selectedCategories": [
16
  "problem-solving",
17
  "bias-fairness",
 
26
  "dangerous-capabilities",
27
  "economic-displacement"
28
  ],
29
+ "overallStats": {
30
+ "totalApplicable": 12,
31
+ "capabilityApplicable": 4,
32
+ "riskApplicable": 8,
33
+ "completenessScore": 78,
34
+ "strongCategories": [
35
+ "problem-solving",
36
+ "security-robustness",
37
+ "privacy-data",
38
+ "governance-accountability"
39
+ ],
40
+ "adequateCategories": [
41
+ "learning-memory",
42
+ "language-communication",
43
+ "information-integrity"
44
+ ],
45
+ "weakCategories": [
46
+ "bias-fairness",
47
+ "human-ai-interaction",
48
+ "harmful-content"
49
+ ],
50
+ "insufficientCategories": [
51
+ "dangerous-capabilities",
52
+ "economic-displacement"
53
+ ],
54
+ "priorityAreas": [
55
+ "bias-fairness",
56
+ "human-ai-interaction",
57
+ "dangerous-capabilities",
58
+ "economic-displacement"
59
+ ]
60
+ },
61
  "categoryEvaluations": {
62
  "problem-solving": {
63
  "benchmarkAnswers": { "A1": "yes", "A2": "N/A", "A3": "yes", "A4": "N/A", "A5": "N/A", "A6": "yes" },
 
65
  "benchmarkSources": {
66
  "A1": [
67
  {
68
+ "id": "fds-bench-ps-001",
69
+ "url": "https://securebank.com/ai-research/fraudshield-evaluation",
70
+ "description": "Comprehensive evaluation on real-world transaction fraud detection and anomaly identification",
71
  "sourceType": "internal",
72
+ "benchmarkName": "SecureBank FraudBench 2024, FinCrime Detection Suite, AML Pattern Recognition",
73
+ "metrics": "Precision, Recall, F1-Score, AUC-ROC, False Positive Rate",
74
+ "score": "94.2% precision, 89.7% recall, 91.9% F1, 0.976 AUC-ROC, 0.08% FPR",
75
+ "version": "v3.2.1",
76
+ "taskVariants": "credit-card-fraud, wire-transfer-anomalies, suspicious-pattern-detection, velocity-checks",
77
+ "customFields": {}
78
+ }
79
+ ],
80
+ "A6": [
81
+ {
82
+ "id": "fds-bench-ps-002",
83
+ "url": "https://securebank.com/compliance/model-validation",
84
+ "description": "Independent model validation by third-party auditors for regulatory compliance",
85
+ "sourceType": "external",
86
+ "benchmarkName": "RegTech Model Validation Suite, OCC Model Risk Guidelines",
87
+ "metrics": "Model performance stability, governance compliance score",
88
+ "score": "Grade A model validation, 98.5% compliance score",
89
+ "version": "Q2-2024",
90
+ "taskVariants": "regulatory-compliance, model-governance, risk-assessment",
91
  "customFields": {}
92
  }
93
  ],
 
136
  "benchmarkSources": {
137
  "A1": [
138
  {
139
+ "id": "fds-bench-bf-001",
140
+ "url": "https://securebank.com/responsible-ai/fairness-assessment",
141
+ "description": "Comprehensive fairness evaluation across protected demographic groups and geographic regions",
142
  "sourceType": "internal",
143
+ "benchmarkName": "Financial Fairness Benchmark 2024, Equal Credit Opportunity Compliance Suite",
144
+ "metrics": "Demographic parity, equalized odds, false positive rate parity across groups",
145
+ "score": "0.91 demographic parity, 0.88 equalized odds, <0.05 FPR difference across groups",
146
+ "version": "v2.1",
147
+ "taskVariants": "demographic-fairness, geographic-fairness, credit-decisioning, transaction-monitoring",
148
  "customFields": {}
149
  }
150
  ]
 
414
  "processSources": {},
415
  "additionalAspects": "Not applicable — detailed value-chain and supply-chain risk analysis is beyond the scope of this demo fixture."
416
  }
417
  }
418
  }
public/evaluations/gemini-pro.json CHANGED
@@ -1,12 +1,17 @@
1
  {
2
  "id": "gemini-pro-2024",
3
  "systemName": "Gemini Pro 1.5",
 
4
  "provider": "Google",
5
- "version": "gemini-1.5-pro-001",
6
- "modality": "multimodal",
7
- "evaluationDate": "2024-01-08",
8
- "deploymentContext": "Production API",
9
- "evaluator": "Google DeepMind Safety Team",
 
 
 
 
10
  "selectedCategories": [
11
  "language-communication",
12
  "problem-solving",
@@ -33,15 +38,14 @@
33
  "totalApplicable": 20,
34
  "capabilityApplicable": 9,
35
  "riskApplicable": 11,
36
- "completenessScore": 85,
37
  "strongCategories": [
38
  "language-communication",
39
  "problem-solving",
40
  "perception-vision",
41
  "learning-memory",
42
  "information-integrity",
43
- "privacy-data",
44
- "security-robustness"
45
  ],
46
  "adequateCategories": [
47
  "social-intelligence",
@@ -49,24 +53,25 @@
49
  "metacognition",
50
  "harmful-content",
51
  "bias-fairness",
52
- "human-ai-interaction"
 
 
53
  ],
54
  "weakCategories": [
55
- "physical-manipulation",
56
- "robotic-intelligence",
57
  "dangerous-capabilities",
58
- "governance-accountability"
59
  ],
60
  "insufficientCategories": [
61
- "environmental-impact",
 
62
  "economic-displacement",
63
  "value-chain"
64
  ],
65
  "priorityAreas": [
66
- "physical-manipulation",
67
- "robotic-intelligence",
68
- "governance-accountability",
69
- "environmental-impact"
70
  ]
71
  },
72
  "categoryEvaluations": {
@@ -90,29 +95,29 @@
90
  "benchmarkSources": {
91
  "A1": [
92
  {
93
- "id": "bench-meem78cz-yf0a7e",
94
  "url": "https://deepmind.google/technologies/gemini/",
95
- "description": "Gemini Pro 1.5 language understanding benchmark results",
96
  "sourceType": "external",
97
- "benchmarkName": "MMLU, HellaSwag, ARC, GSM8K, HumanEval",
98
- "metrics": "Accuracy, reasoning score, multilingual performance",
99
- "score": "83.7% on MMLU, 87.8% on GSM8K, 71.9% on HumanEval",
100
- "version": "",
101
- "taskVariants": "",
102
  "customFields": {}
103
  }
104
  ],
105
  "A2": [
106
  {
107
- "id": "bench-meem78cz-pyfyty",
108
  "url": "https://ai.google/responsibility/",
109
- "description": "AI safety evaluation meeting Google's responsible AI principles",
110
  "sourceType": "internal",
111
- "benchmarkName": "",
112
- "metrics": "",
113
- "score": "",
114
- "version": "",
115
- "taskVariants": "",
116
  "customFields": {}
117
  }
118
  ],
@@ -346,15 +351,15 @@
346
  "benchmarkSources": {
347
  "A1": [
348
  {
349
- "id": "bench-meem78cz-pz8alo",
350
  "url": "https://deepmind.google/technologies/gemini/",
351
- "description": "Multimodal vision capabilities evaluation",
352
  "sourceType": "external",
353
- "benchmarkName": "VQA, COCO Captions, TextVQA, ChartQA, DocVQA",
354
- "metrics": "Visual understanding accuracy, multimodal reasoning score",
355
- "score": "82.3% on VQA, 88.1% on COCO Captions, 74.6% on TextVQA",
356
- "version": "",
357
- "taskVariants": "",
358
  "customFields": {}
359
  }
360
  ]
 
1
  {
2
  "id": "gemini-pro-2024",
3
  "systemName": "Gemini Pro 1.5",
4
+ "url": "https://gemini.google.com",
5
  "provider": "Google",
6
+ "version": "gemini-1.5-pro-002",
7
+ "modelTag": "gemini-1.5-pro-002",
8
+ "knowledgeCutoff": "2024-02-01",
9
+ "modelType": "foundational",
10
+ "inputModalities": ["Text", "Image", "Video", "Audio"],
11
+ "outputModalities": ["Text", "Image"],
12
+ "deploymentContexts": ["Public/Consumer-Facing", "Internal/Enterprise Use"],
13
+ "evaluationDate": "2024-08-20",
14
+ "evaluator": "Google DeepMind Safety & Responsibility Team",
15
  "selectedCategories": [
16
  "language-communication",
17
  "problem-solving",
 
38
  "totalApplicable": 20,
39
  "capabilityApplicable": 9,
40
  "riskApplicable": 11,
41
+ "completenessScore": 89,
42
  "strongCategories": [
43
  "language-communication",
44
  "problem-solving",
45
  "perception-vision",
46
  "learning-memory",
47
  "information-integrity",
48
+ "privacy-data"
 
49
  ],
50
  "adequateCategories": [
51
  "social-intelligence",
 
53
  "metacognition",
54
  "harmful-content",
55
  "bias-fairness",
56
+ "security-robustness",
57
+ "human-ai-interaction",
58
+ "governance-accountability"
59
  ],
60
  "weakCategories": [
 
 
61
  "dangerous-capabilities",
62
+ "environmental-impact"
63
  ],
64
  "insufficientCategories": [
65
+ "physical-manipulation",
66
+ "robotic-intelligence",
67
  "economic-displacement",
68
  "value-chain"
69
  ],
70
  "priorityAreas": [
71
+ "bias-fairness",
72
+ "dangerous-capabilities",
73
+ "environmental-impact",
74
+ "economic-displacement"
75
  ]
76
  },
77
  "categoryEvaluations": {
 
95
  "benchmarkSources": {
96
  "A1": [
97
  {
98
+ "id": "bench-gemini15p-lc-001",
99
  "url": "https://deepmind.google/technologies/gemini/",
100
+ "description": "Gemini Pro 1.5 comprehensive language understanding and generation evaluation",
101
  "sourceType": "external",
102
+ "benchmarkName": "MMLU, HellaSwag, ARC-Challenge, GSM8K, HumanEval, MGSM",
103
+ "metrics": "Accuracy across reasoning, knowledge, and coding tasks",
104
+ "score": "83.7% MMLU, 94.4% HellaSwag, 95.6% ARC-C, 91.7% GSM8K, 71.9% HumanEval, 88.9% MGSM",
105
+ "version": "gemini-1.5-pro-002",
106
+ "taskVariants": "academic-knowledge, math-reasoning, code-generation, multilingual-reasoning",
107
  "customFields": {}
108
  }
109
  ],
110
  "A2": [
111
  {
112
+ "id": "bench-gemini15p-lc-002",
113
  "url": "https://ai.google/responsibility/",
114
+ "description": "Safety evaluation aligned with Google's responsible AI principles and industry standards",
115
  "sourceType": "internal",
116
+ "benchmarkName": "Google Safety Eval Suite, RealToxicityPrompts",
117
+ "metrics": "Safety compliance rate, toxicity generation rate",
118
+ "score": "98.5% safety compliance, 0.2% toxicity rate",
119
+ "version": "v1.5.2",
120
+ "taskVariants": "safety-compliance, toxicity-detection, responsible-ai",
121
  "customFields": {}
122
  }
123
  ],
 
351
  "benchmarkSources": {
352
  "A1": [
353
  {
354
+ "id": "bench-gemini15p-pv-001",
355
  "url": "https://deepmind.google/technologies/gemini/",
356
+ "description": "Comprehensive multimodal vision capabilities evaluation across diverse visual tasks",
357
  "sourceType": "external",
358
+ "benchmarkName": "VQAv2, COCO Captions, TextVQA, ChartQA, DocVQA, AI2D, ScienceQA",
359
+ "metrics": "Visual question answering accuracy, image captioning quality, multimodal reasoning",
360
+ "score": "77.8% VQAv2, 88.1% COCO Captions, 74.6% TextVQA, 80.8% ChartQA, 88.1% DocVQA, 79.5% AI2D",
361
+ "version": "gemini-1.5-pro-vision-002",
362
+ "taskVariants": "visual-qa, image-captioning, chart-analysis, document-understanding, scientific-diagrams",
363
  "customFields": {}
364
  }
365
  ]
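The benchmark source entries in these evaluation files all follow the same record shape. A minimal TypeScript sketch of that shape, inferred from the fields visible in this diff (not an exported type in the repo), is:

```ts
// Sketch only: field names inferred from the benchmarkSources entries above.
interface BenchmarkSource {
  id: string                                          // e.g. "bench-gemini15p-lc-001"
  url: string                                         // report or leaderboard link
  description: string
  sourceType: "internal" | "external" | "cooperative" // matches the schema's source types
  benchmarkName: string                               // e.g. "MMLU, HellaSwag, ARC-Challenge"
  metrics: string
  score: string
  version: string
  taskVariants: string
  customFields: Record<string, string>
}
```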
public/evaluations/gpt-4-turbo.json CHANGED
@@ -1,12 +1,17 @@
1
  {
2
  "id": "gpt-4-turbo-2024",
3
  "systemName": "GPT-4 Turbo",
 
4
  "provider": "OpenAI",
5
  "version": "gpt-4-turbo-2024-04-09",
6
- "modality": "text-to-text",
7
- "evaluationDate": "2024-01-15",
8
- "deploymentContext": "Production API",
9
- "evaluator": "AI Safety Research Team",
 
 
 
 
10
  "selectedCategories": [
11
  "language-communication",
12
  "problem-solving",
@@ -33,40 +38,40 @@
33
  "totalApplicable": 20,
34
  "capabilityApplicable": 9,
35
  "riskApplicable": 11,
36
- "completenessScore": 92,
37
  "strongCategories": [
38
  "language-communication",
39
- "problem-solving",
40
  "creativity-innovation",
41
  "learning-memory",
42
- "harmful-content",
43
  "information-integrity",
44
- "privacy-data"
45
  ],
46
  "adequateCategories": [
47
  "social-intelligence",
48
  "perception-vision",
49
  "metacognition",
 
50
  "bias-fairness",
 
51
  "security-robustness",
52
- "human-ai-interaction",
53
- "governance-accountability"
54
  ],
55
  "weakCategories": [
56
- "physical-manipulation",
57
- "robotic-intelligence",
58
  "dangerous-capabilities",
59
- "environmental-impact",
60
- "economic-displacement"
61
  ],
62
  "insufficientCategories": [
63
- "value-chain"
64
- ],
65
- "priorityAreas": [
66
  "physical-manipulation",
67
  "robotic-intelligence",
 
 
 
 
 
68
  "dangerous-capabilities",
69
- "value-chain"
 
70
  ]
71
  },
72
  "categoryEvaluations": {
@@ -90,29 +95,29 @@
90
  "benchmarkSources": {
91
  "A1": [
92
  {
93
- "id": "bench-meem78d1-spb1r7",
94
  "url": "https://openai.com/research/gpt-4",
95
- "description": "GPT-4 technical report showing performance on language understanding benchmarks",
96
  "sourceType": "external",
97
- "benchmarkName": "MMLU, HellaSwag, ARC",
98
- "metrics": "Accuracy, F1-score",
99
- "score": "87.4% on MMLU",
100
- "version": "",
101
- "taskVariants": "",
102
  "customFields": {}
103
  }
104
  ],
105
  "A2": [
106
  {
107
- "id": "bench-meem78d1-9tw9cj",
108
- "url": "https://openai.com/safety/gpt-4",
109
- "description": "Safety evaluation results meeting regulatory standards for language models",
110
  "sourceType": "internal",
111
- "benchmarkName": "",
112
- "metrics": "",
113
- "score": "",
114
- "version": "",
115
- "taskVariants": "",
116
  "customFields": {}
117
  }
118
  ],
 
1
  {
2
  "id": "gpt-4-turbo-2024",
3
  "systemName": "GPT-4 Turbo",
4
+ "url": "https://openai.com/gpt-4",
5
  "provider": "OpenAI",
6
  "version": "gpt-4-turbo-2024-04-09",
7
+ "modelTag": "gpt-4-turbo-2024-04-09",
8
+ "knowledgeCutoff": "2024-04-01",
9
+ "modelType": "foundational",
10
+ "inputModalities": ["Text", "Image"],
11
+ "outputModalities": ["Text"],
12
+ "deploymentContexts": ["Public/Consumer-Facing", "Internal/Enterprise Use", "High-Risk Applications"],
13
+ "evaluationDate": "2024-05-15",
14
+ "evaluator": "OpenAI Safety & Alignment Team",
15
  "selectedCategories": [
16
  "language-communication",
17
  "problem-solving",
 
38
  "totalApplicable": 20,
39
  "capabilityApplicable": 9,
40
  "riskApplicable": 11,
41
+ "completenessScore": 94,
42
  "strongCategories": [
43
  "language-communication",
44
+ "problem-solving",
45
  "creativity-innovation",
46
  "learning-memory",
 
47
  "information-integrity",
48
+ "governance-accountability"
49
  ],
50
  "adequateCategories": [
51
  "social-intelligence",
52
  "perception-vision",
53
  "metacognition",
54
+ "harmful-content",
55
  "bias-fairness",
56
+ "privacy-data",
57
  "security-robustness",
58
+ "human-ai-interaction"
 
59
  ],
60
  "weakCategories": [
 
 
61
  "dangerous-capabilities",
62
+ "environmental-impact"
 
63
  ],
64
  "insufficientCategories": [
 
 
 
65
  "physical-manipulation",
66
  "robotic-intelligence",
67
+ "value-chain",
68
+ "economic-displacement"
69
+ ],
70
+ "priorityAreas": [
71
+ "bias-fairness",
72
  "dangerous-capabilities",
73
+ "environmental-impact",
74
+ "economic-displacement"
75
  ]
76
  },
77
  "categoryEvaluations": {
 
95
  "benchmarkSources": {
96
  "A1": [
97
  {
98
+ "id": "bench-gpt4t-lc-001",
99
  "url": "https://openai.com/research/gpt-4",
100
+ "description": "GPT-4 Turbo performance on comprehensive language understanding and generation benchmarks",
101
  "sourceType": "external",
102
+ "benchmarkName": "MMLU, HellaSwag, ARC-Challenge, WinoGrande",
103
+ "metrics": "Accuracy across multiple choice and generation tasks",
104
+ "score": "87.4% MMLU, 95.3% HellaSwag, 96.3% ARC-C, 87.5% WinoGrande",
105
+ "version": "v2024.04",
106
+ "taskVariants": "academic-knowledge, commonsense-reasoning, reading-comprehension",
107
  "customFields": {}
108
  }
109
  ],
110
  "A2": [
111
  {
112
+ "id": "bench-gpt4t-lc-002",
113
+ "url": "https://openai.com/safety/gpt-4-system-card",
114
+ "description": "Safety evaluation demonstrating appropriate refusal of harmful content while maintaining helpfulness",
115
  "sourceType": "internal",
116
+ "benchmarkName": "OpenAI Safety Eval Suite, TruthfulQA",
117
+ "metrics": "Refusal rate on harmful prompts, truthfulness score",
118
+ "score": "99.1% harmful refusal rate, 83% truthfulness",
119
+ "version": "v1.2",
120
+ "taskVariants": "safety-alignment, truthfulness",
121
  "customFields": {}
122
  }
123
  ],
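Because these evaluation records sit under public/evaluations/, the app can fetch them as static JSON at /evaluations/&lt;id&gt;.json. A minimal client-side sketch follows; the helper name and error handling are illustrative, not the repo's actual loader.

```ts
// Sketch only: illustrative static-JSON fetch; the function name is hypothetical.
async function fetchEvaluation(id: string): Promise<unknown> {
  const res = await fetch(`/evaluations/${id}.json`)
  if (!res.ok) {
    throw new Error(`Failed to load evaluation "${id}": HTTP ${res.status}`)
  }
  return res.json()
}
```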
schema/category-details.json ADDED
@@ -0,0 +1,125 @@
1
+ {
2
+ "version": "1.0",
3
+ "categories": {
4
+ "language-communication": {
5
+ "name": "Language & Communication",
6
+ "description": "Evaluates the system's ability to understand, generate, and engage in natural language communication across various contexts, languages, and communication styles.",
7
+ "type": "capability",
8
+ "detailedGuidance": "Key Benchmarks to Look For:\nGeneral: MMLU, HellaSwag, ARC, WinoGrande\nReading Comprehension: SQuAD, QuAC, CoQA\nLanguage Generation: BLEU, ROUGE, BERTScore\nMultilingual: XTREME, XGLUE, mBERT evaluation\nReasoning: GSM8K, BBH (BIG-Bench Hard)\nInstruction Following: Alpaca Eval, MT-Bench\n\nEvaluation Focus:\n• Semantic understanding across languages\n• Text generation quality and coherence\n• Reasoning and logical inference\n• Context retention in long conversations\n• Factual accuracy and knowledge recall\n\nCommon Risk Areas:\n• Hallucination and misinformation generation\n• Bias in language generation\n• Inconsistent performance across languages"
9
+ },
10
+ "social-intelligence": {
11
+ "name": "Social Intelligence & Interaction",
12
+ "description": "Assesses the system's capacity to understand social contexts, interpret human emotions and intentions, and engage appropriately in social interactions.",
13
+ "type": "capability",
14
+ "detailedGuidance": "Key Benchmarks to Look For:\nTheory of Mind: ToMi, FaINoM, SOTOPIA\nEmotional Intelligence: EmoBench, EQBench\nSocial Reasoning: Social IQa, CommonsenseQA\nDialogue: PersonaChat, BlendedSkillTalk\nPsychology: Psychometrics Benchmark for LLMs\n\nEvaluation Focus:\n• Understanding social cues and context\n• Appropriate emotional responses\n• Maintaining consistent personality\n• Theory of mind reasoning\n• Cultural sensitivity and awareness\n\nCommon Risk Areas:\n• Inappropriate anthropomorphization\n• Cultural bias and insensitivity\n• Lack of emotional regulation\n• Manipulation potential"
15
+ },
16
+ "problem-solving": {
17
+ "name": "Problem Solving",
18
+ "description": "Measures the system's ability to analyze complex problems, develop solutions, and apply reasoning across various domains and contexts.",
19
+ "type": "capability",
20
+ "detailedGuidance": "Key Benchmarks to Look For:\nMathematical: GSM8K, MATH, FrontierMath, AIME\nLogical Reasoning: LogiQA, ReClor, FOLIO\nProgramming: HumanEval, MBPP, SWE-bench\nScientific: SciQ, ScienceQA\nMulti-step: StrategyQA, DROP, QuALITY\n\nEvaluation Focus:\n• Multi-step reasoning capability\n• Mathematical and logical problem solving\n• Code generation and debugging\n• Scientific and analytical thinking\n• Planning and strategy development\n\nCommon Risk Areas:\n• Reasoning errors in complex problems\n• Inconsistent problem-solving approaches\n• Inability to show work or explain reasoning"
21
+ },
22
+ "creativity-innovation": {
23
+ "name": "Creativity & Innovation",
24
+ "description": "Evaluates the system's capacity for creative thinking, generating novel ideas, and producing original content across different creative domains.",
25
+ "type": "capability",
26
+ "detailedGuidance": "Key Benchmarks to Look For:\nCreative Writing: CREAM, Creative Story Generation\nVisual Creativity: FIQ (Figural Interpretation Quest)\nAlternative Uses: AUT (Alternative Uses Task)\nArtistic Generation: Aesthetic and originality scoring\nInnovation: Novel solution generation tasks\n\nEvaluation Focus:\n• Originality and novelty of outputs\n• Artistic and creative quality\n• Ability to combine concepts innovatively\n• Divergent thinking capabilities\n• Value and usefulness of creative outputs\n\nCommon Risk Areas:\n• Copyright and IP infringement\n• Lack of genuine creativity vs. recombination\n• Inappropriate or harmful creative content"
27
+ },
28
+ "learning-memory": {
29
+ "name": "Learning & Memory",
30
+ "description": "Assesses the system's ability to acquire new knowledge, retain information, and adapt behavior based on experience and feedback.",
31
+ "type": "capability",
32
+ "detailedGuidance": "Key Benchmarks to Look For:\nFew-shot Learning: Omniglot, miniImageNet, Meta-Dataset\nTransfer Learning: VTAB, BigTransfer\nIn-context Learning: ICL benchmarks across domains\nKnowledge Retention: Long-term memory tests\nContinual Learning: CORe50, Split-CIFAR\n\nEvaluation Focus:\n• Few-shot and zero-shot learning ability\n• Knowledge transfer across domains\n• Memory retention and recall\n• Adaptation to new tasks\n• Learning efficiency and speed\n\nCommon Risk Areas:\n• Catastrophic forgetting\n• Overfitting to limited examples\n• Inability to generalize learned concepts"
33
+ },
34
+ "perception-vision": {
35
+ "name": "Perception & Vision",
36
+ "description": "Measures the system's capability to process, interpret, and understand visual information, images, and spatial relationships.",
37
+ "type": "capability",
38
+ "detailedGuidance": "Key Benchmarks to Look For:\nObject Recognition: ImageNet, COCO, Open Images\nScene Understanding: ADE20K, Cityscapes\nRobustness: ImageNet-C, ImageNet-A\nMultimodal: VQA, CLIP benchmarks\n3D Understanding: NYU Depth, KITTI\n\nEvaluation Focus:\n• Object detection and classification\n• Scene understanding and segmentation\n• Robustness to visual variations\n• Integration with language understanding\n• Real-world deployment performance\n\nCommon Risk Areas:\n• Adversarial vulnerability\n• Bias in image recognition\n• Poor performance on edge cases"
39
+ },
40
+ "physical-manipulation": {
41
+ "name": "Physical Manipulation & Motor Skills",
42
+ "description": "Evaluates the system's ability to control physical actuators, manipulate objects, and perform motor tasks in physical environments.",
43
+ "type": "capability",
44
+ "detailedGuidance": "Key Benchmarks to Look For:\nGrasping: YCB Object Set, Functional Grasping\nManipulation: RoboCAS, FMB (Functional Manipulation)\nAssembly: NIST Assembly Task Boards\nNavigation: Habitat, AI2-THOR challenges\nDexterity: Dexterous manipulation benchmarks\n\nEvaluation Focus:\n• Grasping and manipulation accuracy\n• Adaptability to object variations\n• Force control and delicate handling\n• Spatial reasoning and planning\n• Real-world deployment robustness\n\nCommon Risk Areas:\n• Safety in human environments\n• Damage to objects or environment\n• Inconsistent performance across conditions"
45
+ },
46
+ "metacognition": {
47
+ "name": "Metacognition & Self-Awareness",
48
+ "description": "Assesses the system's ability to understand its own capabilities, limitations, and reasoning processes, including self-reflection and meta-learning.",
49
+ "type": "capability",
50
+ "detailedGuidance": "Key Benchmarks to Look For:\nConfidence Calibration: Calibration metrics, ECE\nUncertainty Quantification: UQ benchmarks\nSelf-Assessment: Metacognitive accuracy tests\nKnow-Unknown: Known Unknowns benchmarks\nError Detection: Self-correction capabilities\n\nEvaluation Focus:\n• Confidence calibration accuracy\n• Uncertainty expression and quantification\n• Self-monitoring and error detection\n• Knowledge boundary awareness\n• Adaptive reasoning based on confidence\n\nCommon Risk Areas:\n• Overconfidence in incorrect responses\n• Poor uncertainty quantification\n• Inability to recognize knowledge limits"
51
+ },
52
+ "robotic-intelligence": {
53
+ "name": "Robotic Intelligence & Autonomy",
54
+ "description": "Measures autonomous decision-making, navigation, and task execution in physical environments with minimal human intervention.",
55
+ "type": "capability",
56
+ "detailedGuidance": "Key Benchmarks to Look For:\nIntegrated Tasks: RoboCup, DARPA challenges\nNavigation: Habitat challenges, real-world navigation\nManipulation: Integrated pick-and-place scenarios\nHuman-Robot Interaction: HRI benchmarks\nAutonomy: Long-horizon task completion\n\nEvaluation Focus:\n• Integrated sensorimotor capabilities\n• Autonomous decision-making\n• Adaptability to dynamic environments\n• Human-robot collaboration\n• Long-term task execution\n\nCommon Risk Areas:\n• Safety in unstructured environments\n• Unpredictable autonomous behavior\n• Failure to handle edge cases"
57
+ },
58
+ "harmful-content": {
59
+ "name": "Harmful Content Generation",
60
+ "description": "Evaluates risks related to generating harmful, offensive, illegal, or inappropriate content that could cause psychological, social, or physical harm.",
61
+ "type": "risk",
62
+ "detailedGuidance": "Key Evaluations to Look For:\nSafety Benchmarks: AIR-Bench, MLCommons AI Safety\nRed Teaming: Anthropic Constitutional AI, HarmBench\nContent Filtering: Jigsaw Toxic Comments, HASOC\nAdversarial: Jailbreaking attempts, prompt injection\nRegulatory: NIST AI RMF compliance\n\nEvaluation Focus:\n• Refusal to generate harmful content\n• Robustness against adversarial prompts\n• Content filtering effectiveness\n• Detection of subtle harmful content\n• Consistency across different prompt styles\n\nCritical Risk Areas:\n• Violence and self-harm content\n• Hate speech and discrimination\n• Illegal activity instructions\n• NSFW and inappropriate content"
63
+ },
64
+ "information-integrity": {
65
+ "name": "Information Integrity & Misinformation",
66
+ "description": "Assesses risks of generating false, misleading, or manipulated information that could undermine trust in information systems and decision-making.",
67
+ "type": "risk",
68
+ "detailedGuidance": "Key Evaluations to Look For:\nFactuality: TruthfulQA, FEVER, HaluEval\nHallucination Detection: SelfCheckGPT, FActScore\nMisinformation: LIAR dataset, fake news detection\nCitation Accuracy: Citation verification benchmarks\nSource Attribution: Provenance tracking tests\n\nEvaluation Focus:\n• Factual accuracy of generated content\n• Hallucination rate and detection\n• Proper source attribution\n• Misinformation resistance\n• Consistency across related queries\n\nCritical Risk Areas:\n• Medical misinformation\n• Political disinformation\n• False historical claims\n• Fabricated citations"
69
+ },
70
+ "privacy-data": {
71
+ "name": "Privacy & Data Protection",
72
+ "description": "Evaluates risks to personal privacy, data security, and unauthorized access to or misuse of sensitive personal information.",
73
+ "type": "risk",
74
+ "detailedGuidance": "Key Evaluations to Look For:\nMembership Inference: MIA benchmarks, CopyMark\nData Extraction: Training data extraction tests\nPII Detection: Personal information leakage tests\nAnonymization: De-identification benchmarks\nGDPR Compliance: Right to be forgotten tests\n\nEvaluation Focus:\n• Training data memorization\n• PII leakage prevention\n• Membership inference resistance\n• Data anonymization effectiveness\n• Compliance with privacy regulations\n\nCritical Risk Areas:\n• Personal information exposure\n• Training data memorization\n• Inference of sensitive attributes\n• Non-consensual data use"
75
+ },
76
+ "bias-fairness": {
77
+ "name": "Bias & Fairness",
78
+ "description": "Assesses risks of discriminatory outcomes, unfair treatment of different groups, and perpetuation of societal biases and inequalities.",
79
+ "type": "risk",
80
+ "detailedGuidance": "Key Evaluations to Look For:\nBias Benchmarks: Winogender, CrowS-Pairs, BOLD\nFairness Metrics: AI Fairness 360, Fairlearn\nDemographic Bias: Representation across groups\nIntersectional: Multi-dimensional bias analysis\nAllocative Fairness: Resource distribution equity\n\nEvaluation Focus:\n• Demographic representation fairness\n• Performance equity across groups\n• Intersectional bias analysis\n• Harmful stereotype perpetuation\n• Allocative fairness in decisions\n\nCritical Risk Areas:\n• Employment discrimination\n• Healthcare disparities\n• Educational bias\n• Criminal justice bias"
81
+ },
82
+ "security-robustness": {
83
+ "name": "Security & Robustness",
84
+ "description": "Evaluates vulnerabilities to adversarial attacks, system manipulation, and failure modes that could compromise system integrity and reliability.",
85
+ "type": "risk",
86
+ "detailedGuidance": "Key Evaluations to Look For:\nAdversarial Robustness: AdvBench, RobustBench\nPrompt Injection: AgentDojo, prompt injection tests\nModel Extraction: Model theft resistance\nBackdoor Detection: Trojaned model detection\nOWASP LLM Top 10: Security vulnerability assessment\n\nEvaluation Focus:\n• Adversarial attack resistance\n• Prompt injection robustness\n• Model extraction protection\n• Backdoor and trojan detection\n• Input validation effectiveness\n\nCritical Risk Areas:\n• Prompt injection attacks\n• Model theft and extraction\n• Adversarial examples\n• Supply chain attacks"
87
+ },
88
+ "dangerous-capabilities": {
89
+ "name": "Dangerous Capabilities & Misuse",
90
+ "description": "Assesses risks from capabilities that could be misused for harmful purposes, including dual-use applications and potential for weaponization.",
91
+ "type": "risk",
92
+ "detailedGuidance": "Key Evaluations to Look For:\nCBRN Assessment: WMD information evaluation\nDual-Use: Misuse potential analysis\nCyber Capabilities: Offensive cyber evaluation\nWeapons Information: Dangerous instruction filtering\nGovernment Protocols: AISI, NIST evaluation standards\n\nEvaluation Focus:\n• CBRN information filtering\n• Dual-use technology assessment\n• Offensive capability evaluation\n• Dangerous instruction refusal\n• Misuse potential quantification\n\nCritical Risk Areas:\n• Chemical/biological weapons info\n• Cyber attack capabilities\n• Physical harm instructions\n• Illegal activity facilitation"
93
+ },
94
+ "human-ai-interaction": {
95
+ "name": "Human-AI Interaction Risks",
96
+ "description": "Evaluates risks arising from human-AI interaction patterns, including over-reliance, manipulation, and degradation of human skills and autonomy.",
97
+ "type": "risk",
98
+ "detailedGuidance": "Key Evaluations to Look For:\nTrust Calibration: Trust-LLM, reliance calibration metrics\nManipulation Detection: Emotional manipulation detection benchmarks\nAnthropomorphism: Human-likeness perception studies\nSafety in Dialogue: HAX, RealToxicityPrompts\nUser Guidance: Task adherence and guidance clarity tests\n\nEvaluation Focus:\n• Preventing over-reliance on AI\n• Avoiding deceptive or manipulative responses\n• Maintaining transparency about capabilities and limitations\n• Providing safe, non-coercive interactions\n• Ensuring user agency and decision-making control\n\nCritical Risk Areas:\n• Emotional manipulation\n• Excessive trust leading to poor decisions\n• Misrepresentation of capabilities\n• Encouraging harmful behaviors"
99
+ },
100
+ "environmental-impact": {
101
+ "name": "Environmental & Resource Impact",
102
+ "description": "Evaluates environmental costs of AI development and deployment, including energy consumption, carbon footprint, and resource utilization.",
103
+ "type": "risk",
104
+ "detailedGuidance": "Key Evaluations to Look For:\nEnergy Usage: Carbon footprint estimation tools\nSustainability Metrics: Green AI benchmarks\nModel Efficiency: Inference cost evaluations\nHardware Utilization: Resource optimization tests\nLifecycle Assessment: Full training-to-deployment impact analysis\n\nEvaluation Focus:\n• Measuring carbon footprint and energy use\n• Optimizing for efficiency without performance loss\n• Assessing environmental trade-offs\n• Promoting sustainable deployment strategies\n\nCritical Risk Areas:\n• High carbon emissions from training\n• Excessive energy use in inference\n• Lack of transparency in environmental reporting"
105
+ },
106
+ "economic-displacement": {
107
+ "name": "Economic & Labor Displacement",
108
+ "description": "Evaluates potential economic disruption, job displacement, and impacts on labor markets and economic inequality from AI deployment.",
109
+ "type": "risk",
110
+ "detailedGuidance": "Key Evaluations to Look For:\nJob Impact Studies: Task automation potential assessments\nMarket Disruption: Industry-specific displacement projections\nEconomic Modeling: Macro and microeconomic simulations\nSkill Shift Analysis: Required workforce retraining benchmarks\nSocietal Impact: Equitable distribution of economic benefits\n\nEvaluation Focus:\n• Predicting job displacement risks\n• Identifying emerging job opportunities\n• Understanding shifts in skill demand\n• Balancing automation benefits with societal costs\n\nCritical Risk Areas:\n• Large-scale unemployment\n• Wage suppression\n• Economic inequality"
111
+ },
112
+ "governance-accountability": {
113
+ "name": "Governance & Accountability",
114
+ "description": "Assesses risks related to lack of oversight, unclear responsibility structures, and insufficient governance mechanisms for AI systems.",
115
+ "type": "risk",
116
+ "detailedGuidance": "Key Evaluations to Look For:\nTransparency: Model card completeness, datasheet reporting\nAuditability: Traceability of decisions\nOversight Mechanisms: Compliance with governance frameworks\nResponsibility Assignment: Clear chain of accountability\nStandards Compliance: ISO, IEEE AI standards adherence\n\nEvaluation Focus:\n• Establishing clear accountability\n• Ensuring decision traceability\n• Meeting compliance and ethical guidelines\n• Maintaining transparency across lifecycle\n\nCritical Risk Areas:\n• Lack of oversight\n• Unclear responsibility in failures\n• Insufficient transparency"
117
+ },
118
+ "value-chain": {
119
+ "name": "Value Chain & Supply Chain Risks",
120
+ "description": "Evaluates risks throughout the AI development and deployment pipeline, including data sourcing, model training, and third-party dependencies.",
121
+ "type": "risk",
122
+ "detailedGuidance": "Key Evaluations to Look For:\nProvenance Tracking: Dataset and component origin verification\nThird-Party Risk Assessment: Vendor dependency evaluations\nSupply Chain Security: Software and hardware integrity checks\nIntegration Testing: Risk assessment in system integration\nTraceability: End-to-end component documentation\n\nEvaluation Focus:\n• Managing third-party dependencies\n• Verifying component provenance\n• Securing the supply chain\n• Mitigating integration risks\n\nCritical Risk Areas:\n• Compromised third-party components\n• Data provenance issues\n• Vendor lock-in and dependency risks"
123
+ }
124
+ }
125
+ }
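The detailedGuidance values above are newline-delimited plain text, so a page can split and render them directly. A hedged sketch of a lookup helper (the import path and type are assumptions, not code from this commit):

```ts
// Sketch only: assumes the JSON can be imported via the app's "@/" alias.
import categoryDetails from "@/schema/category-details.json"

interface CategoryDetail {
  name: string
  description: string
  type: "capability" | "risk"
  detailedGuidance: string
}

export function getCategoryDetail(categoryId: string): CategoryDetail | undefined {
  const categories = categoryDetails.categories as Record<string, CategoryDetail>
  return categories[categoryId]
}

// e.g. getCategoryDetail("bias-fairness")?.detailedGuidance.split("\n")
```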
schema/evaluation-schema.json ADDED
@@ -0,0 +1,121 @@
1
+ {
2
+ "version": "1.0",
3
+ "categories": [
4
+ {"id":"language-communication","name":"Language & Communication","type":"capability"},
5
+ {"id":"social-intelligence","name":"Social Intelligence & Interaction","type":"capability"},
6
+ {"id":"problem-solving","name":"Problem Solving","type":"capability"},
7
+ {"id":"creativity-innovation","name":"Creativity & Innovation","type":"capability"},
8
+ {"id":"learning-memory","name":"Learning & Memory","type":"capability"},
9
+ {"id":"perception-vision","name":"Perception & Vision","type":"capability"},
10
+ {"id":"physical-manipulation","name":"Physical Manipulation & Motor Skills","type":"capability"},
11
+ {"id":"metacognition","name":"Metacognition & Self-Awareness","type":"capability"},
12
+ {"id":"robotic-intelligence","name":"Robotic Intelligence & Autonomy","type":"capability"},
13
+ {"id":"harmful-content","name":"Harmful Content Generation","type":"risk"},
14
+ {"id":"information-integrity","name":"Information Integrity & Misinformation","type":"risk"},
15
+ {"id":"privacy-data","name":"Privacy & Data Protection","type":"risk"},
16
+ {"id":"bias-fairness","name":"Bias & Fairness","type":"risk"},
17
+ {"id":"security-robustness","name":"Security & Robustness","type":"risk"},
18
+ {"id":"dangerous-capabilities","name":"Dangerous Capabilities & Misuse","type":"risk"},
19
+ {"id":"human-ai-interaction","name":"Human-AI Interaction Risks","type":"risk"},
20
+ {"id":"environmental-impact","name":"Environmental & Resource Impact","type":"risk"},
21
+ {"id":"economic-displacement","name":"Economic & Labor Displacement","type":"risk"},
22
+ {"id":"governance-accountability","name":"Governance & Accountability","type":"risk"},
23
+ {"id":"value-chain","name":"Value Chain & Supply Chain Risks","type":"risk"}
24
+ ],
25
+ "benchmarkQuestions": [
26
+ {
27
+ "id":"A1",
28
+ "text":"Has the system been run on recognized, category-specific benchmarks?",
29
+ "tooltip":"Expect: Benchmark/dataset names & versions, task variants, metric definitions, who ran them (internal/external).",
30
+ "hint":"List benchmarks, dataset versions and who executed them."
31
+ },
32
+ {
33
+ "id":"A2",
34
+ "text":"Does the system meet pre-set quantitative thresholds for acceptable performance under applicable regulations?",
35
+ "tooltip":"Expect: Numeric scores vs. regulatory/compliance thresholds (e.g., hiring fairness, medical accuracy), source of regulatory requirements, compliance determination.",
36
+ "hint":"Provide numeric scores and regulatory mappings."
37
+ },
38
+ {
39
+ "id":"A3",
40
+ "text":"How does performance compare to baselines, SOTA, previous versions, and other comparable systems?",
41
+ "tooltip":"Expect: Side-by-side comparisons with SOTA models, previous versions, and similar systems under matched conditions, significance tests or confidence intervals for deltas.",
42
+ "hint":"Provide comparative scores and targets."
43
+ },
44
+ {
45
+ "id":"A4",
46
+ "text":"How does the system perform under adversarial inputs, extreme loads, distribution shift?",
47
+ "tooltip":"Expect: Test types (attack/shift/load), rates of failure/degradation, robustness metrics.",
48
+ "hint":"Describe stress tests and observed failure rates."
49
+ },
50
+ {
51
+ "id":"A5",
52
+ "text":"Is performance measured in the wild with automated monitors?",
53
+ "tooltip":"Expect: Live metrics tracked (e.g., error rates, drift, latency), sampling cadence, alert thresholds.",
54
+ "hint":"List live metrics and alerting rules."
55
+ },
56
+ {
57
+ "id":"A6",
58
+ "text":"Have you quantified train–test overlap or leakage risks that could inflate results?",
59
+ "tooltip":"Expect: Procedure (e.g., n‑gram/fuzzy overlap, URL hashing), contamination rate estimates, mitigations taken.",
60
+ "hint":"Report contamination checks and mitigation steps."
61
+ }
62
+ ],
63
+ "processQuestions": [
64
+ {
65
+ "id":"B1",
66
+ "text":"What capability/risk claims is this category evaluating and why it's applicable?",
67
+ "tooltip":"Expect: Clear scope, success/failure definitions, hypotheses the evaluation is testing.",
68
+ "hint":"Define scope and hypotheses."
69
+ },
70
+ {
71
+ "id":"B2",
72
+ "text":"Can others reproduce the results?",
73
+ "tooltip":"Expect: Public or access-controlled release of code/configs, prompts, seeds, decoding settings, dataset IDs/versions, hardware notes; if not shareable, documented proxies.",
74
+ "hint":"Point to code, data or proxies to reproduce results."
75
+ },
76
+ {
77
+ "id":"B3",
78
+ "text":"Have domain experts/affected users reviewed interpretations of results?",
79
+ "tooltip":"Expect: Who reviewed, what feedback changed, unresolved disagreements and rationale.",
80
+ "hint":"List reviewers and key feedback."
81
+ },
82
+ {
83
+ "id":"B4",
84
+ "text":"Do figures communicate results without distortion and with uncertainty/context?",
85
+ "tooltip":"Expect: Uncertainty shown (CI/SE, multi-seed variance), full/consistent axes, sample sizes, like-for-like comparisons, raw tables available, disclosure of selection criteria.",
86
+ "hint":"Ensure figures include uncertainty disclosures."
87
+ },
88
+ {
89
+ "id":"B5",
90
+ "text":"Standards & Compliance Alignment - Are evaluation practices aligned with relevant organizational, industry, or regulatory standards?",
91
+ "tooltip":"Expect: References to applicable standards/regulations, mapping of evaluation practices to those standards, any gaps or exemptions noted, and plan to address misalignment.",
92
+ "hint":"Map evaluation practices to standards and note gaps."
93
+ },
94
+ {
95
+ "id":"B6",
96
+ "text":"Is there a process to re-run/adapt evals as models, data, or risks change, including mitigation and retest procedures?",
97
+ "tooltip":"Expect: Triggers (model updates, drift, incidents), versioned eval specs, scheduled re-assessment cadence, audit trail of changes, mitigation protocols when issues are found, and systematic retest procedures after fixes.",
98
+ "hint":"Describe triggers and re-evaluation procedures."
99
+ }
100
+ ],
101
+ "benchmarkSourceFields": [
102
+ {"name":"benchmarkName","label":"Benchmark/Dataset Name","type":"text","placeholder":"e.g., MMLU v1"},
103
+ {"name":"version","label":"Version","type":"text","placeholder":"e.g., v1.2"},
104
+ {"name":"taskVariants","label":"Task Variants","type":"text","placeholder":"e.g., multiple choice, generation"},
105
+ {"name":"metrics","label":"Metrics","type":"text","placeholder":"e.g., accuracy, F1"},
106
+ {"name":"url","label":"URL","type":"text","placeholder":"https://..."},
107
+ {"name":"description","label":"Description","type":"textarea","placeholder":"Describe the benchmark or test"},
108
+ {"name":"sourceType","label":"Source Type","type":"radio","options":[{"value":"internal","label":"Internal"},{"value":"external","label":"External"},{"value":"cooperative","label":"Cooperative"}]},
109
+ {"name":"score","label":"Score","type":"text","placeholder":"e.g., 85%"},
110
+ {"name":"confidenceInterval","label":"Confidence Interval","type":"text","placeholder":"e.g., 95% CI [90,94]"}
111
+ ],
112
+ "processSourceFields": [
113
+ {"name":"url","label":"URL","type":"text","placeholder":"https://..."},
114
+ {"name":"documentType","label":"Document Type","type":"text","placeholder":"e.g., Policy, Procedure"},
115
+ {"name":"title","label":"Title","type":"text","placeholder":"Document title"},
116
+ {"name":"author","label":"Author","type":"text","placeholder":"Author or owner"},
117
+ {"name":"organization","label":"Organization","type":"text","placeholder":"Owning org"},
118
+ {"name":"date","label":"Date","type":"text","placeholder":"YYYY-MM-DD"},
119
+ {"name":"description","label":"Description","type":"textarea","placeholder":"Describe the documentation"}
120
+ ]
121
+ }
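Centralizing categories and questions in this file means the UI lists can be derived from it rather than hard-coded. A hedged sketch of such a re-export (identifier names and import paths here are illustrative):

```ts
// Sketch only: illustrative re-export of the centralized schema.
import schema from "@/schema/evaluation-schema.json"

export interface Category { id: string; name: string; type: "capability" | "risk" }
export interface Question { id: string; text: string; tooltip: string; hint: string }

export const CATEGORIES = schema.categories as Category[]
export const BENCHMARK_QUESTIONS = schema.benchmarkQuestions as Question[]
export const PROCESS_QUESTIONS = schema.processQuestions as Question[]
```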
schema/form-hints.json ADDED
@@ -0,0 +1,167 @@
1
+ {
2
+ "sourceTypes": {
3
+ "internal": {
4
+ "label": "Internal",
5
+ "description": "Evaluations conducted by the organization developing or deploying the AI system using internal resources, teams, and methodologies."
6
+ },
7
+ "external": {
8
+ "label": "External",
9
+ "description": "Independent evaluations conducted by third-party organizations, academic institutions, or external auditors without direct involvement from the developing organization."
10
+ },
11
+ "cooperative": {
12
+ "label": "Cooperative",
13
+ "description": "Collaborative evaluations involving multiple stakeholders, including the developing organization, external experts, affected communities, and regulatory bodies working together."
14
+ }
15
+ },
16
+ "additionalAspectsSection": {
17
+ "id": "C",
18
+ "title": "Additional Evaluation Aspects",
19
+ "description": "Document any other evaluation aspects for this category that may not have been captured by the structured questions above. This section will not be scored but will be visible in the final documentation."
20
+ },
21
+ "categoryHints": {
22
+ "language-communication": {
23
+ "benchmark": "Hint: mention benchmarks for language understanding/generation, prompt settings, multilingual splits, and whether factuality checks were performed.",
24
+ "process": "Hint: note consulted linguists or annotators, dataset provenance concerns, and any applicable content/regulatory considerations."
25
+ },
26
+ "social-intelligence": {
27
+ "benchmark": "Hint: mention emotion/social reasoning benchmarks used, annotator protocols, and demographic coverage.",
28
+ "process": "Hint: list consulted domain experts (psychologists, sociologists), user study details, and consent/ethics notes."
29
+ },
30
+ "problem-solving": {
31
+ "benchmark": "Hint: list math/programming/reasoning benchmarks, scoring rules, and seed/temperature settings.",
32
+ "process": "Hint: note expert reviewers, validation of solutions, and how ambiguous answers were adjudicated."
33
+ },
34
+ "creativity-innovation": {
35
+ "benchmark": "Hint: mention creative evaluation setups, human rating protocols, and diversity of prompts/tasks.",
36
+ "process": "Hint: note creative experts or juries consulted, copyright/IP checks, and content filtering policies."
37
+ },
38
+ "learning-memory": {
39
+ "benchmark": "Hint: indicate few-shot/transfer benchmarks, replay/continual learning setups, and sample sizes.",
40
+ "process": "Hint: describe retention tests, dataset refresh cadence, and any contamination checks performed."
41
+ },
42
+ "perception-vision": {
43
+ "benchmark": "Hint: list vision datasets, augmentation/robustness tests, and evaluation resolutions/settings.",
44
+ "process": "Hint: note labelling protocols, demographic coverage of imagery, and reviewer/ethical considerations."
45
+ },
46
+ "physical-manipulation": {
47
+ "benchmark": "Hint: mention robotics tasks, real/sim evaluation conditions, and safety/collision metrics.",
48
+ "process": "Hint: include safety review notes, field test observers, and incident mitigation procedures."
49
+ },
50
+ "metacognition": {
51
+ "benchmark": "Hint: report calibration metrics, uncertainty quantification methods, and multi-seed variance.",
52
+ "process": "Hint: list reviewers who evaluated uncertainty reporting and any user-facing confidence disclosures."
53
+ },
54
+ "robotic-intelligence": {
55
+ "benchmark": "Hint: note integrated task suites, sim-to-real gaps, and hardware/configuration details.",
56
+ "process": "Hint: document safety reviews, human-in-the-loop safeguards, and autonomy limits."
57
+ },
58
+ "harmful-content": {
59
+ "benchmark": "Hint: describe toxicity/harm benchmarks, prompt hardening, and red-team scenarios used.",
60
+ "process": "Hint: list safety reviewers, incident response plans, and content moderation policies referenced."
61
+ },
62
+ "information-integrity": {
63
+ "benchmark": "Hint: mention fact-checking datasets, prompt calibrations, and hallucination detection metrics.",
64
+ "process": "Hint: note expert fact-checkers consulted, provenance practices, and external audit reports."
65
+ },
66
+ "privacy-data": {
67
+ "benchmark": "Hint: include privacy tests, membership inference/MI defenses, and redaction results.",
68
+ "process": "Hint: list privacy officers consulted, data handling policies, and any regulatory mappings (e.g., GDPR)."
69
+ },
70
+ "bias-fairness": {
71
+ "benchmark": "Hint: indicate fairness metrics, subgroup breakdowns, and statistical significance of gaps.",
72
+ "process": "Hint: document which stakeholder groups and domain experts were engaged and mitigation steps taken."
73
+ },
74
+ "security-robustness": {
75
+ "benchmark": "Hint: report adversarial tests, perturbation strengths, and failure rates under attack.",
76
+ "process": "Hint: include red-team summaries, security reviewers, and incident response procedures."
77
+ },
78
+ "dangerous-capabilities": {
79
+ "benchmark": "Hint: describe tests for dual-use behaviors and misuse scenarios evaluated.",
80
+ "process": "Hint: note external safety reviews, legal counsel input, and controls/mitigations in place."
81
+ },
82
+ "human-ai-interaction": {
83
+ "benchmark": "Hint: list usability/UX tasks, user study protocols, and measures of over-reliance or deception.",
84
+ "process": "Hint: capture which user groups were involved, consent procedures, and human factors reviewers."
85
+ },
86
+ "environmental-impact": {
87
+ "benchmark": "Hint: report energy/perf tradeoff tests, FLOPs/throughput, and measured carbon estimates.",
88
+ "process": "Hint: include sustainability reviewers, lifecycle assessment notes, and mitigation plans."
89
+ },
90
+ "economic-displacement": {
91
+ "benchmark": "Hint: mention labor-impact scenarios evaluated and economic modeling assumptions used.",
92
+ "process": "Hint: document stakeholder consultations, affected worker groups engaged, and mitigation strategies."
93
+ },
94
+ "governance-accountability": {
95
+ "benchmark": "Hint: N/A for benchmarking; focus on process evidence instead.",
96
+ "process": "Hint: cite governance frameworks used, responsible owners, and escalation/audit trails."
97
+ },
98
+ "value-chain": {
99
+ "benchmark": "Hint: include supply-chain dependency tests, third-party component assessments if applicable.",
100
+ "process": "Hint: note vendor audits, data sourcing reviews, and contractual safeguards."
101
+ }
102
+ },
103
+ "categoryQuestionHints": {
104
+ "language-communication": {
105
+ "A1": { "benchmark": "List exact language benchmarks, dataset versions, prompt templates, split (train/val/test), and evaluation conditions." },
106
+ "A2": { "benchmark": "State numeric thresholds and which regulatory or domain thresholds apply (e.g., accuracy, FPR/FNR targets)." },
107
+ "A3": { "benchmark": "Provide side-by-side comparisons vs. baselines/SOTA, significance tests, and matched prompt/hyperparams." },
108
+ "A4": { "benchmark": "Describe adversarial or distribution-shift tests (prompt perturbations, paraphrase attacks) and failure rates." },
109
+ "A5": { "benchmark": "Explain live monitoring metrics (latency, error rate, hallucination rate), sampling cadence, and alert rules." },
110
+ "A6": { "benchmark": "Document overlap checks (n‑gram, URL hashing), contamination rates, and mitigation steps taken." },
111
+ "B1": { "process": "Define scope, claims being evaluated, success criteria (e.g., BLEU/F1 cutoffs), and evaluation hypotheses." },
112
+ "B2": { "process": "List reproducibility artifacts (code, prompts, seeds), availability level, and proxies if materials are restricted." },
113
+ "B3": { "process": "Name reviewers (linguists, annotators), review protocol, and how feedback was incorporated or adjudicated." },
114
+ "B4": { "process": "Show how figures present uncertainty (CI, SE), axes choices, sample sizes, and raw tables for transparency." },
115
+ "B5": { "process": "Reference any applicable standards (e.g., ISO, domain regs), mapping to practices, and noted gaps." },
116
+ "B6": { "process": "Describe re-eval triggers (model updates, drift), versioned specs, audit trails, and retest procedures." }
117
+ }
118
+ },
119
+ "recommendedBenchmarks": {
120
+ "language-communication": "e.g., MMLU, BBH, SuperGLUE",
121
+ "social-intelligence": "e.g., SocialIQA, EmoBench, PersonaChat (human-eval)",
122
+ "problem-solving": "e.g., GSM8K, MATH, HumanEval",
123
+ "creativity-innovation": "e.g., human preference studies, CREAM (human-eval)",
124
+ "learning-memory": "e.g., few-shot transfer suites, continual-learning benchmarks",
125
+ "perception-vision": "e.g., ImageNet, COCO, VQA",
126
+ "physical-manipulation": "e.g., RoboSuite, YCB benchmarks, real/sim task suites",
127
+ "metacognition": "e.g., calibration datasets (ECE), uncertainty benchmarks",
128
+ "robotic-intelligence": "e.g., Habitat, AI2-THOR, DARPA challenge tasks",
129
+ "harmful-content": "e.g., toxicity/harm benchmarks like ToxicBERT evals, red-team suites",
130
+ "information-integrity": "e.g., FEVER, fact-checking datasets, hallucination benchmarks",
131
+ "privacy-data": "e.g., membership-inference tests, MI challenge datasets",
132
+ "bias-fairness": "e.g., fairness benchmark suites (subgroup metrics), demographic breakdown tests",
133
+ "security-robustness": "e.g., adversarial robustness suites, attack-replay benchmarks",
134
+ "dangerous-capabilities": "e.g., dual-use/red-team evaluation suites (internal or published)",
135
+ "human-ai-interaction": "e.g., user-study protocols, SUS, human preference tests",
136
+ "environmental-impact": "e.g., FLOPs/energy measurement reports, carbon accounting tests",
137
+ "economic-displacement": "e.g., scenario/projection models, labor-impact analyses",
138
+ "governance-accountability": "e.g., audit logs, governance checklists (process evidence)",
139
+ "value-chain": "e.g., third-party audit reports, supply-chain assessments"
140
+ },
141
+ "recommendedMetrics": {
142
+ "language-communication": "e.g., accuracy, F1, BLEU, ROUGE, BERTScore",
143
+ "social-intelligence": "e.g., human rating scores, agreement rates, F1 for intent detection",
144
+ "problem-solving": "e.g., exact-match, pass@k, accuracy, solution correctness percentage",
145
+ "creativity-innovation": "e.g., human preference %, novelty/diversity scores",
146
+ "learning-memory": "e.g., few-shot accuracy, retention rate, forgetting metric",
147
+ "perception-vision": "e.g., mAP, IoU, top-1/top-5 accuracy",
148
+ "physical-manipulation": "e.g., success rate, collision rate, completion time",
149
+ "metacognition": "e.g., ECE, calibration error, confidence-accuracy correlation",
150
+ "robotic-intelligence": "e.g., task success rate, path efficiency, failure modes count",
151
+ "harmful-content": "e.g., toxicity rate, harmful-response rate, false negative rate for filters",
152
+ "information-integrity": "e.g., precision/recall of fact-checking, citation accuracy",
153
+ "privacy-data": "e.g., membership inference advantage, reconstruction error rates",
154
+ "bias-fairness": "e.g., subgroup parity gaps, disparate impact ratios, statistical significance",
155
+ "security-robustness": "e.g., attack success rate, robustness delta under perturbation",
156
+ "dangerous-capabilities": "e.g., misuse rate under red-team prompts, severity counts",
157
+ "human-ai-interaction": "e.g., SUS, task completion rate, user satisfaction scores",
158
+ "environmental-impact": "e.g., energy per inference, carbon per training run",
159
+ "economic-displacement": "e.g., projected job impact metrics, economic sensitivity metrics",
160
+ "governance-accountability": "e.g., audit coverage %, policy alignment scoring",
161
+ "value-chain": "e.g., vendor risk scores, dependency vulnerability counts"
162
+ },
163
+ "defaultHints": {
164
+ "benchmark": "Hint: include relevant benchmark settings, scoring rules, and notable limitations.",
165
+ "process": "Hint: mention reviewers consulted, applicable standards/regulations, and scope limitations."
166
+ }
167
+ }
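Per-category hints fall back to defaultHints when a category has no specific entry. A small hedged sketch of that lookup (helper name and import path are assumptions):

```ts
// Sketch only: hint lookup with fallback to defaultHints.
import formHints from "@/schema/form-hints.json"

type HintKind = "benchmark" | "process"

export function hintFor(categoryId: string, kind: HintKind): string {
  const perCategory =
    (formHints.categoryHints as Record<string, Partial<Record<HintKind, string>>>)[categoryId]
  return perCategory?.[kind] ?? formHints.defaultHints[kind]
}
```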
schema/output-schema.json ADDED
@@ -0,0 +1,79 @@
1
+ {
2
+ "id": "string",
3
+ "systemName": "string",
4
+ "url": "string",
5
+ "provider": "string",
6
+ "version": "string",
7
+ "modelTag": "string",
8
+ "knowledgeCutoff": "YYYY-MM-DD",
9
+ "modelType": "foundational | fine-tuned | na",
10
+ "inputModalities": ["string"],
11
+ "outputModalities": ["string"],
12
+ "deploymentContexts": ["string"],
13
+ "evaluationDate": "YYYY-MM-DD",
14
+ "evaluator": "string",
15
+ "selectedCategories": [
16
+ "language-communication",
17
+ "problem-solving"
18
+ ],
19
+ "categoryEvaluations": {
20
+ "language-communication": {
21
+ "benchmarkAnswers": {
22
+ "A1": "yes",
23
+ "A2": "no",
24
+ "A3": "na",
25
+ "A4": "yes",
26
+ "A5": "no",
27
+ "A6": "yes"
28
+ },
29
+ "processAnswers": {
30
+ "B1": "yes",
31
+ "B2": "no",
32
+ "B3": "yes",
33
+ "B4": "no",
34
+ "B5": "yes",
35
+ "B6": "na"
36
+ },
37
+ "benchmarkSources": {
38
+ "A1": [
39
+ {
40
+ "id": "1",
41
+ "benchmarkName": "MMLU",
42
+ "version": "2023-05",
43
+ "taskVariants": "multiple choice",
44
+ "metrics": "accuracy",
45
+ "url": "https://example.org",
46
+ "description": "string",
47
+ "sourceType": "external",
48
+ "score": "86.4%",
49
+ "confidenceInterval": "95% CI [85,88]",
50
+ "customFields": {}
51
+ }
52
+ ]
53
+ },
54
+ "processSources": {
55
+ "B1": [
56
+ {
57
+ "id": "7",
58
+ "url": "https://example.org/doc",
59
+ "description": "string",
60
+ "sourceType": "internal",
61
+ "documentType": "Research Paper",
62
+ "title": "string",
63
+ "author": "string",
64
+ "organization": "string",
65
+ "date": "YYYY-MM-DD",
66
+ "customFields": {}
67
+ }
68
+ ]
69
+ },
70
+ "additionalAspects": "string",
71
+ "score": {
72
+ "benchmarkScore": 0,
73
+ "processScore": 0,
74
+ "totalScore": 0,
75
+ "status": "strong"
76
+ }
77
+ }
78
+ }
79
+ }
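This output shape lends itself to a light consistency check, for example confirming that every selected category actually has an evaluation block. A hedged sketch (types abbreviated from the shape above, not a full JSON Schema validation):

```ts
// Sketch only: minimal consistency check against the output shape.
type AnswerValue = "yes" | "no" | "na"

interface CategoryEvaluation {
  benchmarkAnswers: Record<string, AnswerValue>
  processAnswers: Record<string, AnswerValue>
  additionalAspects?: string
  score?: { benchmarkScore: number; processScore: number; totalScore: number; status: string }
}

interface EvaluationOutput {
  id: string
  systemName: string
  selectedCategories: string[]
  categoryEvaluations: Record<string, CategoryEvaluation>
}

export function missingCategoryEvaluations(doc: EvaluationOutput): string[] {
  return doc.selectedCategories.filter((c) => !(c in doc.categoryEvaluations))
}
```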
schema/system-info-schema.json ADDED
@@ -0,0 +1,53 @@
1
+ {
2
+ "systemInfo": {
3
+ "id": "string",
4
+ "systemName": "string",
5
+ "url": "string",
6
+ "provider": "string",
7
+ "version": "string",
8
+ "modelTag": "string",
9
+ "knowledgeCutoff": "YYYY-MM-DD",
10
+ "modelType": "foundational | fine-tuned | na",
11
+ "inputModalities": ["string"],
12
+ "outputModalities": ["string"],
13
+ "systemTypes": ["string"],
14
+ "deploymentContexts": ["string"],
15
+ "deploymentContext": "string",
16
+ "evaluationDate": "YYYY-MM-DD",
17
+ "evaluator": "string"
18
+ },
19
+ "formOptions": {
20
+ "systemTypes": [
21
+ "Text-to-Text (e.g., chatbots, language models)",
22
+ "Text-to-Image (e.g., image generation)",
23
+ "Image-to-Text (e.g., image captioning, OCR)",
24
+ "Image-to-Image (e.g., image editing, style transfer)",
25
+ "Audio/Speech (e.g., speech recognition, text-to-speech)",
26
+ "Video (e.g., video generation, analysis)",
27
+ "Multimodal",
28
+ "Robotic/Embodied AI",
29
+ "Other"
30
+ ],
31
+ "modalities": [
32
+ "Text",
33
+ "Image",
34
+ "Audio",
35
+ "Video",
36
+ "Tabular",
37
+ "Robotics/Action",
38
+ "Other"
39
+ ],
40
+ "deploymentContexts": [
41
+ "Research/Academic",
42
+ "Internal/Enterprise Use",
43
+ "Public/Consumer-Facing",
44
+ "High-Risk Applications",
45
+ "Other"
46
+ ],
47
+ "modelTypes": [
48
+ { "value": "foundational", "label": "Foundational Model" },
49
+ { "value": "fine-tuned", "label": "Fine-tuned Model" },
50
+ { "value": "na", "label": "Doesn't apply" }
51
+ ]
52
+ }
53
+ }
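The formOptions block can drive the system-info form controls directly, for example the model-type selector. A hedged sketch of a client component doing so (component name, props, and import path are illustrative):

```tsx
// Sketch only: illustrative select driven by formOptions.modelTypes (client component).
import systemInfoSchema from "@/schema/system-info-schema.json"

export function ModelTypeSelect(props: { value: string; onChange: (value: string) => void }) {
  return (
    <select value={props.value} onChange={(e) => props.onChange(e.target.value)}>
      {systemInfoSchema.formOptions.modelTypes.map((opt) => (
        <option key={opt.value} value={opt.value}>
          {opt.label}
        </option>
      ))}
    </select>
  )
}
```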