Avijit Ghosh committed on
Commit
928f843
·
1 Parent(s): a9a31f7

even prettier cards

Browse files
app/evaluation/[id]/page.client.tsx CHANGED
@@ -11,7 +11,9 @@ import { ArrowLeft, Download, Eye, EyeOff, Info, Database, Globe, Calendar, User
11
  // Capability icons
12
  MessageCircle, Heart, Brain, Lightbulb, BookOpen, Camera, Hand, Search, Bot,
13
  // Risk icons
14
- Skull, AlertCircle, Lock, Zap, Gavel, Users, Leaf, TrendingDown, Scale, Factory } from "lucide-react"
 
 
15
  import { getAllCategories, getCategoryById, getBenchmarkQuestions, getProcessQuestions } from "@/lib/schema"
16
  import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from "@/components/ui/tooltip"
17
  import { naReasonForCategoryFromEval } from "@/lib/na-utils"
@@ -41,6 +43,19 @@ const loadEvaluationDetails = async (id: string) => {
41
  return null
42
  }
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  // Category icon mapping
45
  const getCategoryIcon = (categoryId: string) => {
46
  const iconMap: Record<string, any> = {
@@ -79,6 +94,7 @@ export default function EvaluationDetailsPage() {
79
 
80
  const [evaluation, setEvaluation] = useState<any>(null)
81
  const [loading, setLoading] = useState(true)
 
82
  const [expandedAreas, setExpandedAreas] = useState<Record<string, boolean>>({})
83
  const toggleArea = (area: string) => setExpandedAreas((p) => ({ ...p, [area]: !p[area] }))
84
  const [expandedNegatives, setExpandedNegatives] = useState<Record<string, boolean>>({})
@@ -132,6 +148,33 @@ export default function EvaluationDetailsPage() {
132
  return naReasonForCategoryFromEval(catEval, benchmarkQs, processQs)
133
  }
134
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  // Compute overall stats from evaluation data dynamically
136
  const computedStats = (() => {
137
  const strongCategories: string[] = []
@@ -178,8 +221,12 @@ export default function EvaluationDetailsPage() {
178
 
179
  useEffect(() => {
180
  const loadData = async () => {
181
- const data = await loadEvaluationDetails(evaluationId)
182
- setEvaluation(data)
 
 
 
 
183
  setLoading(false)
184
  }
185
  loadData()
@@ -211,7 +258,8 @@ export default function EvaluationDetailsPage() {
211
  }
212
 
213
  return (
214
- <div className="container mx-auto px-4 py-8 max-w-4xl">
 
215
  <div className="mb-6">
216
  <div className="flex items-center justify-between">
217
  <Button onClick={() => router.push("/")} variant="outline" size="sm">
@@ -778,34 +826,49 @@ export default function EvaluationDetailsPage() {
778
  onClick={() => toggleNegatives(key)}
779
  className="p-4 hover:bg-muted/20 cursor-pointer transition-colors"
780
  >
781
- <div className="flex items-start justify-between gap-4">
782
- <div className="flex-1 min-w-0">
783
- <div className="flex items-center gap-2 mb-2">
784
- <span className="font-semibold text-sm bg-blue-50 dark:bg-blue-950 px-2 py-1 rounded text-blue-700 dark:text-blue-300">
785
- {questionId}
786
- </span>
787
- {hasYes ? (
788
- <Badge variant="default" className="bg-green-100 text-green-700 hover:bg-green-100">
789
- Yes
790
- </Badge>
791
- ) : hasNo ? (
792
- <Badge variant="destructive">
793
- No
794
- </Badge>
795
- ) : (
796
- <Badge variant="secondary" className="bg-gray-100 text-gray-600">
797
- N/A
798
- </Badge>
799
- )}
800
- </div>
801
  <div className="text-sm text-muted-foreground leading-relaxed">{qText}</div>
802
- {hasYes && (
803
- <div className="mt-2 text-xs text-blue-600 dark:text-blue-400">
804
- Click to {expandedNegatives[key] ? 'hide' : 'view'} benchmark details
805
- </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
806
  )}
807
  </div>
808
  </div>
 
 
 
 
 
809
  </div>
810
 
811
  {hasYes && expandedNegatives[key] && (() => {
@@ -953,42 +1016,125 @@ export default function EvaluationDetailsPage() {
953
  role="button"
954
  tabIndex={0}
955
  onClick={() => toggleNegatives(key)}
956
- className="flex items-center gap-2 mb-2 justify-between cursor-pointer"
957
  >
958
- <div className="flex items-center gap-3">
959
- <span className="font-medium">{questionId}:</span>
960
- <div className="text-sm">{qText}</div>
961
- </div>
962
- <div className="flex items-center gap-2">
963
- {hasYes ? (
964
- <span className="inline-flex items-center rounded-md px-2 py-0.5 text-xs font-medium bg-green-100 text-green-700">yes</span>
965
- ) : hasNo ? (
966
- <span className="inline-flex items-center rounded-md px-2 py-0.5 text-xs font-medium bg-red-100 text-red-700">no</span>
967
- ) : (
968
- <span className="inline-flex items-center rounded-md px-2 py-0.5 text-xs font-medium bg-muted/20 text-muted-foreground">n/a</span>
969
- )}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
970
  </div>
 
 
 
 
 
971
  </div>
972
 
973
- {hasYes && expandedNegatives[key] && (
974
- <div className="mt-3 space-y-3">
975
- {(sources || []).map((src: any, i: number) => (
976
- <div key={i} className="p-3 bg-muted rounded">
977
- <div className="grid grid-cols-1 gap-2 text-sm">
978
- <div>
979
- <span className="text-muted-foreground">URL:</span> {src?.url || '—'}
 
 
 
 
980
  </div>
981
- <div>
982
- <span className="text-muted-foreground">Document Type:</span> {src?.documentType || src?.sourceType || '—'}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
983
  </div>
 
 
 
 
 
 
 
984
  </div>
985
- {src?.description && (
986
- <div className="mt-2 text-sm">
987
- <span className="text-muted-foreground">Description:</span> {src.description}
988
- </div>
989
- )}
990
- </div>
991
- ))}
992
  </div>
993
  )}
994
 
@@ -1016,5 +1162,6 @@ export default function EvaluationDetailsPage() {
1016
  )
1017
  })}
1018
  </div>
 
1019
  )
1020
  }
 
11
  // Capability icons
12
  MessageCircle, Heart, Brain, Lightbulb, BookOpen, Camera, Hand, Search, Bot,
13
  // Risk icons
14
+ Skull, AlertCircle, Lock, Zap, Gavel, Users, Leaf, TrendingDown, Scale, Factory,
15
+ // Process question icons
16
+ FileText, CheckCircle, XCircle, Minus, ChevronUp, ChevronDown } from "lucide-react"
17
  import { getAllCategories, getCategoryById, getBenchmarkQuestions, getProcessQuestions } from "@/lib/schema"
18
  import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from "@/components/ui/tooltip"
19
  import { naReasonForCategoryFromEval } from "@/lib/na-utils"
 
43
  return null
44
  }
45
 
46
/**
 * Fetches the form-hint definitions served at `/schema/form-hints.json`.
 *
 * Best-effort: any failure (network error, non-OK status, or a body that is
 * not valid JSON) is logged and reported as `null` rather than thrown.
 *
 * @returns The parsed JSON payload, or `null` when it could not be loaded.
 */
const loadFormHints = async () => {
  try {
    const response = await fetch('/schema/form-hints.json')
    if (!response.ok) {
      throw new Error(`Failed to load form hints: ${response.statusText}`)
    }
    // Await here so a rejected json() promise (malformed body) is handled by
    // the catch below instead of escaping to the caller.
    return await response.json()
  } catch (error) {
    console.error('Failed to load form hints:', error)
    return null
  }
}
58
+
59
  // Category icon mapping
60
  const getCategoryIcon = (categoryId: string) => {
61
  const iconMap: Record<string, any> = {
 
94
 
95
  const [evaluation, setEvaluation] = useState<any>(null)
96
  const [loading, setLoading] = useState(true)
97
+ const [formHints, setFormHints] = useState<any>(null)
98
  const [expandedAreas, setExpandedAreas] = useState<Record<string, boolean>>({})
99
  const toggleArea = (area: string) => setExpandedAreas((p) => ({ ...p, [area]: !p[area] }))
100
  const [expandedNegatives, setExpandedNegatives] = useState<Record<string, boolean>>({})
 
148
  return naReasonForCategoryFromEval(catEval, benchmarkQs, processQs)
149
  }
150
 
151
/**
 * Resolves the explanatory hint for one question, from most to least specific:
 * the per-question entry for the category, then the category-wide entry,
 * then the global default. Yields `undefined` while form hints are not loaded
 * or when no level supplies a hint for `type`.
 */
const getQuestionHint = (categoryId: string, questionId: string, type: 'benchmark' | 'process'): string | undefined => {
  if (formHints) {
    const perQuestion = formHints.categoryQuestionHints?.[categoryId]?.[questionId]
    const perCategory = formHints.categoryHints?.[categoryId]
    // First truthy hint wins; the default is returned as-is when the others are absent.
    return perQuestion?.[type] || perCategory?.[type] || formHints.defaultHints?.[type]
  }
  return undefined
}
170
+
171
/**
 * Converts a raw hint string into the text shown in a question tooltip.
 *
 * Surrounding whitespace is trimmed, a leading "Hint:" label (any casing) is
 * dropped, and the result is prefixed with "Explainer: ".
 *
 * @param hint - Raw hint text, e.g. "Hint: describe the benchmark".
 * @returns The tooltip text, e.g. "Explainer: describe the benchmark".
 */
const formatHintForTooltip = (hint: string): string => {
  // Trim first: the pattern is anchored at the start, so leading whitespace
  // would otherwise leave the "Hint:" label in the tooltip.
  const cleanHint = hint.trim().replace(/^Hint:\s*/i, '')
  return `Explainer: ${cleanHint}`
}
177
+
178
  // Compute overall stats from evaluation data dynamically
179
  const computedStats = (() => {
180
  const strongCategories: string[] = []
 
221
 
222
  useEffect(() => {
223
  const loadData = async () => {
224
+ const [evalData, hintsData] = await Promise.all([
225
+ loadEvaluationDetails(evaluationId),
226
+ loadFormHints()
227
+ ])
228
+ setEvaluation(evalData)
229
+ setFormHints(hintsData)
230
  setLoading(false)
231
  }
232
  loadData()
 
258
  }
259
 
260
  return (
261
+ <TooltipProvider>
262
+ <div className="container mx-auto px-4 py-8 max-w-4xl">
263
  <div className="mb-6">
264
  <div className="flex items-center justify-between">
265
  <Button onClick={() => router.push("/")} variant="outline" size="sm">
 
826
  onClick={() => toggleNegatives(key)}
827
  className="p-4 hover:bg-muted/20 cursor-pointer transition-colors"
828
  >
829
+ <div className="flex items-center justify-between gap-4">
830
+ <div className="flex items-center gap-3">
831
+ <Tooltip>
832
+ <TooltipTrigger asChild>
833
+ <div className="text-sm font-semibold text-blue-700 dark:text-blue-300 bg-blue-100 dark:bg-blue-900 px-2 py-1 rounded-full border border-blue-300 dark:border-blue-700 min-w-[40px] text-center cursor-help">
834
+ {questionId}
835
+ </div>
836
+ </TooltipTrigger>
837
+ <TooltipContent className="max-w-sm">
838
+ <p className="text-sm">
839
+ {(() => {
840
+ const hint = getQuestionHint(categoryId, questionId, 'benchmark') || q.tooltip
841
+ return hint ? formatHintForTooltip(hint) : 'Explainer: No specific guidance available for this question.'
842
+ })()}
843
+ </p>
844
+ </TooltipContent>
845
+ </Tooltip>
 
 
 
846
  <div className="text-sm text-muted-foreground leading-relaxed">{qText}</div>
847
+ </div>
848
+ <div className="flex items-center gap-2">
849
+ {hasYes ? (
850
+ <Badge variant="outline" className="border-green-300 bg-green-50 text-green-700 dark:border-green-700 dark:bg-green-950 dark:text-green-300">
851
+ <CheckCircle className="h-3 w-3 mr-1" />
852
+ Yes
853
+ </Badge>
854
+ ) : hasNo ? (
855
+ <Badge variant="outline" className="border-red-300 bg-red-50 text-red-700 dark:border-red-700 dark:bg-red-950 dark:text-red-300">
856
+ <XCircle className="h-3 w-3 mr-1" />
857
+ No
858
+ </Badge>
859
+ ) : (
860
+ <Badge variant="outline" className="border-gray-300 bg-gray-50 text-gray-700 dark:border-gray-700 dark:bg-gray-950 dark:text-gray-300">
861
+ <Minus className="h-3 w-3 mr-1" />
862
+ N/A
863
+ </Badge>
864
  )}
865
  </div>
866
  </div>
867
+ {hasYes && (
868
+ <div className="mt-2 text-xs text-blue-600 dark:text-blue-400">
869
+ Click to {expandedNegatives[key] ? 'hide' : 'view'} benchmark details
870
+ </div>
871
+ )}
872
  </div>
873
 
874
  {hasYes && expandedNegatives[key] && (() => {
 
1016
  role="button"
1017
  tabIndex={0}
1018
  onClick={() => toggleNegatives(key)}
1019
+ className="cursor-pointer"
1020
  >
1021
+ <div className="flex items-center justify-between gap-4">
1022
+ <div className="flex items-center gap-3">
1023
+ <Tooltip>
1024
+ <TooltipTrigger asChild>
1025
+ <div className="text-sm font-semibold text-purple-700 dark:text-purple-300 bg-purple-100 dark:bg-purple-900 px-2 py-1 rounded-full border border-purple-300 dark:border-purple-700 min-w-[40px] text-center cursor-help">
1026
+ {questionId}
1027
+ </div>
1028
+ </TooltipTrigger>
1029
+ <TooltipContent className="max-w-sm">
1030
+ <p className="text-sm">
1031
+ {(() => {
1032
+ const hint = getQuestionHint(categoryId, questionId, 'process') || q.tooltip
1033
+ return hint ? formatHintForTooltip(hint) : 'Explainer: No specific guidance available for this question.'
1034
+ })()}
1035
+ </p>
1036
+ </TooltipContent>
1037
+ </Tooltip>
1038
+ <div className="text-sm text-muted-foreground leading-relaxed">{qText}</div>
1039
+ </div>
1040
+ <div className="flex items-center gap-2">
1041
+ {hasYes ? (
1042
+ <Badge variant="outline" className="border-green-300 bg-green-50 text-green-700 dark:border-green-700 dark:bg-green-950 dark:text-green-300">
1043
+ <CheckCircle className="h-3 w-3 mr-1" />
1044
+ Yes
1045
+ </Badge>
1046
+ ) : hasNo ? (
1047
+ <Badge variant="outline" className="border-red-300 bg-red-50 text-red-700 dark:border-red-700 dark:bg-red-950 dark:text-red-300">
1048
+ <XCircle className="h-3 w-3 mr-1" />
1049
+ No
1050
+ </Badge>
1051
+ ) : (
1052
+ <Badge variant="outline" className="border-gray-300 bg-gray-50 text-gray-700 dark:border-gray-700 dark:bg-gray-950 dark:text-gray-300">
1053
+ <Minus className="h-3 w-3 mr-1" />
1054
+ N/A
1055
+ </Badge>
1056
+ )}
1057
+ </div>
1058
  </div>
1059
+ {hasYes && sources.length > 0 && (
1060
+ <div className="mt-2 text-xs text-purple-600 dark:text-purple-400">
1061
+ Click to {expandedNegatives[key] ? 'hide' : 'view'} {sources.length} documentation source{sources.length !== 1 ? 's' : ''}
1062
+ </div>
1063
+ )}
1064
  </div>
1065
 
1066
+ {hasYes && expandedNegatives[key] && sources.length > 0 && (
1067
+ <div className="mt-3">
1068
+ <div className="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-3 gap-4">
1069
+ {sources.map((src: any, i: number) => (
1070
+ <div key={i} className="bg-gradient-to-br from-white to-purple-50 dark:from-gray-900 dark:to-purple-950 border border-purple-200 dark:border-purple-800 rounded-lg p-4 hover:shadow-sm transition-shadow">
1071
+ {/* Header */}
1072
+ <div className="mb-3">
1073
+ <div className="text-xs font-medium text-purple-600 dark:text-purple-400 bg-purple-100 dark:bg-purple-900 px-2 py-1 rounded-full inline-flex items-center gap-1">
1074
+ <FileText className="h-3 w-3" />
1075
+ {src.documentType || 'Documentation'}
1076
+ </div>
1077
  </div>
1078
+
1079
+ {/* Title */}
1080
+ {src.title && (
1081
+ <div className="mb-3">
1082
+ <h4 className="font-semibold text-lg text-foreground leading-tight">{src.title}</h4>
1083
+ </div>
1084
+ )}
1085
+
1086
+ {/* Details */}
1087
+ <div className="space-y-2 text-sm">
1088
+ {src.author && (
1089
+ <div className="flex items-center gap-2">
1090
+ <span className="text-muted-foreground font-medium min-w-[60px]">Author:</span>
1091
+ <span className="text-foreground">{src.author}</span>
1092
+ </div>
1093
+ )}
1094
+
1095
+ {src.organization && (
1096
+ <div className="flex items-center gap-2">
1097
+ <span className="text-muted-foreground font-medium min-w-[60px]">Org:</span>
1098
+ <span className="text-foreground">{src.organization}</span>
1099
+ </div>
1100
+ )}
1101
+
1102
+ {src.date && (
1103
+ <div className="flex items-center gap-2">
1104
+ <span className="text-muted-foreground font-medium min-w-[60px]">Date:</span>
1105
+ <span className="text-foreground">{src.date}</span>
1106
+ </div>
1107
+ )}
1108
+
1109
+ <div className="flex items-start gap-2">
1110
+ <span className="text-muted-foreground font-medium min-w-[60px]">Source:</span>
1111
+ <div className="flex-1 min-w-0">
1112
+ {src.url ? (
1113
+ <a
1114
+ className="text-primary hover:text-primary/80 underline decoration-primary/30 hover:decoration-primary/60 transition-colors break-all"
1115
+ href={src.url}
1116
+ target="_blank"
1117
+ rel="noreferrer"
1118
+ title={src.url}
1119
+ >
1120
+ {src.url.length > 50 ? `${src.url.substring(0, 50)}...` : src.url}
1121
+ </a>
1122
+ ) : (
1123
+ <span className="text-muted-foreground">—</span>
1124
+ )}
1125
+ </div>
1126
+ </div>
1127
  </div>
1128
+
1129
+ {/* Description */}
1130
+ {src.description && (
1131
+ <div className="mt-3 p-3 bg-muted/30 dark:bg-muted/10 rounded-lg">
1132
+ <p className="text-sm text-muted-foreground leading-relaxed">{src.description}</p>
1133
+ </div>
1134
+ )}
1135
  </div>
1136
+ ))}
1137
+ </div>
 
 
 
 
 
1138
  </div>
1139
  )}
1140
 
 
1162
  )
1163
  })}
1164
  </div>
1165
+ </TooltipProvider>
1166
  )
1167
  }
public/evaluations/claude-3-sonnet.json CHANGED
@@ -191,29 +191,55 @@
191
  "B2": [
192
  {
193
  "id": "proc-meem78cs-563xw4",
194
- "url": "https://www.anthropic.com/research",
195
- "description": "Research publications and evaluation methodologies",
196
- "sourceType": "",
197
- "documentType": "Research Papers",
198
- "customFields": {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
  }
200
  ],
201
  "B5": [
202
  {
203
- "id": "proc-meem78cs-0dwm24",
204
- "url": "https://www.anthropic.com/safety",
205
- "description": "External safety researcher review of language capabilities",
206
- "sourceType": "",
207
- "documentType": "Safety Assessment",
208
- "customFields": {}
 
 
209
  },
210
  {
211
- "id": "proc-meem78cs-o0usw6",
212
- "url": "https://www.anthropic.com/safety/continuous-improvement",
213
- "description": "Continuous evaluation and safety improvement process",
214
- "sourceType": "",
215
- "documentType": "Process Documentation",
216
- "customFields": {}
 
 
217
  }
218
  ],
219
  "B6": [
 
191
  "B2": [
192
  {
193
  "id": "proc-meem78cs-563xw4",
194
+ "url": "https://www.anthropic.com/research/claude-3-family",
195
+ "description": "Comprehensive research methodology and experimental design for Claude 3 family",
196
+ "title": "Claude 3 Family Technical Documentation",
197
+ "author": "Anthropic Research Team",
198
+ "organization": "Anthropic",
199
+ "date": "2024-03-04",
200
+ "documentType": "Research Paper"
201
+ },
202
+ {
203
+ "id": "proc-meem78cs-563xw5",
204
+ "url": "https://github.com/anthropics/anthropic-cookbook",
205
+ "description": "Open cookbook with evaluation prompts and reproducible examples",
206
+ "title": "Anthropic Evaluation Cookbook",
207
+ "author": "Developer Relations Team",
208
+ "organization": "Anthropic",
209
+ "date": "2024-03-15",
210
+ "documentType": "Code Repository"
211
+ },
212
+ {
213
+ "id": "proc-meem78cs-563xw6",
214
+ "url": "https://www.anthropic.com/safety/evaluation-standards",
215
+ "description": "Detailed evaluation standards and procedures for model assessment",
216
+ "title": "Model Evaluation Standards v3.2",
217
+ "author": "Safety Research Division",
218
+ "organization": "Anthropic",
219
+ "date": "2024-02-28",
220
+ "documentType": "Standards Document"
221
  }
222
  ],
223
  "B5": [
224
  {
225
+ "id": "proc-meem78cs-563xw7",
226
+ "url": "https://www.anthropic.com/compliance/ai-standards",
227
+ "description": "Alignment with industry AI safety and governance standards",
228
+ "title": "AI Standards Compliance Report",
229
+ "author": "Compliance Team",
230
+ "organization": "Anthropic",
231
+ "date": "2024-03-20",
232
+ "documentType": "Compliance Report"
233
  },
234
  {
235
+ "id": "proc-meem78cs-563xw8",
236
+ "url": "https://www.anthropic.com/research/constitutional-ai",
237
+ "description": "Constitutional AI methodology and regulatory alignment documentation",
238
+ "title": "Constitutional AI: Regulatory Alignment Framework",
239
+ "author": "AI Safety Research Team",
240
+ "organization": "Anthropic",
241
+ "date": "2024-01-15",
242
+ "documentType": "Framework Document"
243
  }
244
  ],
245
  "B6": [
public/evaluations/gpt-4-turbo.json CHANGED
@@ -193,20 +193,54 @@
193
  {
194
  "id": "proc-meem78d1-ur2bqa",
195
  "url": "https://github.com/openai/evals",
196
- "description": "Open-source evaluation framework and prompts",
197
- "sourceType": "",
198
- "documentType": "Code Repository",
199
- "customFields": {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
  }
201
  ],
202
  "B3": [
203
  {
204
- "id": "proc-meem78d1-so2i6q",
205
- "url": "https://openai.com/research/gpt-4-system-card",
206
- "description": "External expert review of language capabilities",
207
- "sourceType": "",
208
- "documentType": "System Card",
209
- "customFields": {}
 
 
 
 
 
 
 
 
 
 
 
 
210
  }
211
  ],
212
  "B4": [
 
193
  {
194
  "id": "proc-meem78d1-ur2bqa",
195
  "url": "https://github.com/openai/evals",
196
+ "description": "Open-source evaluation framework and prompts for reproducible testing",
197
+ "title": "OpenAI Evals Framework",
198
+ "author": "OpenAI Research Team",
199
+ "organization": "OpenAI",
200
+ "date": "2024-04-15",
201
+ "documentType": "Code Repository"
202
+ },
203
+ {
204
+ "id": "proc-meem78d1-ur2bqb",
205
+ "url": "https://openai.com/research/gpt-4-technical-report",
206
+ "description": "Technical report containing detailed methodologies and experimental procedures",
207
+ "title": "GPT-4 Technical Report - Reproducibility Section",
208
+ "author": "OpenAI Safety Team",
209
+ "organization": "OpenAI",
210
+ "date": "2024-03-20",
211
+ "documentType": "Technical Report"
212
+ },
213
+ {
214
+ "id": "proc-meem78d1-ur2bqc",
215
+ "url": "https://openai.com/safety/reproducibility-guidelines",
216
+ "description": "Internal guidelines and procedures for ensuring evaluation reproducibility",
217
+ "title": "Model Evaluation Reproducibility Guidelines v2.1",
218
+ "author": "AI Safety Division",
219
+ "organization": "OpenAI",
220
+ "date": "2024-02-10",
221
+ "documentType": "Policy Document"
222
  }
223
  ],
224
  "B3": [
225
  {
226
+ "id": "proc-meem78d1-ur2bqd",
227
+ "url": "https://openai.com/research/expert-review-process",
228
+ "description": "Documentation of expert review process for language model capabilities",
229
+ "title": "Expert Review Process for Language Models",
230
+ "author": "External Advisory Board",
231
+ "organization": "OpenAI",
232
+ "date": "2024-04-01",
233
+ "documentType": "Process Documentation"
234
+ },
235
+ {
236
+ "id": "proc-meem78d1-ur2bqe",
237
+ "url": "https://openai.com/safety/feedback-incorporation",
238
+ "description": "Summary of expert feedback and how it was incorporated into final evaluations",
239
+ "title": "Expert Feedback Integration Report",
240
+ "author": "Safety Research Team",
241
+ "organization": "OpenAI",
242
+ "date": "2024-04-12",
243
+ "documentType": "Review Report"
244
  }
245
  ],
246
  "B4": [