Avijit Ghosh committed on
Commit
a71573c
·
1 Parent(s): 8cfd3a8

Fixed question wording: rephrased the A3, A4, B1, and B5 question text as yes/no questions

Browse files
Files changed (1) hide show
  1. schema/evaluation-schema.json +4 -4
schema/evaluation-schema.json CHANGED
@@ -37,13 +37,13 @@
37
  },
38
  {
39
  "id":"A3",
40
- "text":"How does performance compare to baselines, SOTA, previous versions, and other comparable systems?",
41
  "tooltip":"Expect: Side-by-side comparisons with SOTA models, previous versions, and similar systems under matched conditions, significance tests or confidence intervals for deltas.",
42
  "hint":"Provide comparative scores and targets."
43
  },
44
  {
45
  "id":"A4",
46
- "text":"How does the system perform under adversarial inputs, extreme loads, distribution shift?",
47
  "tooltip":"Expect: Test types (attack/shift/load), rates of failure/degradation, robustness metrics.",
48
  "hint":"Describe stress tests and observed failure rates."
49
  },
@@ -63,7 +63,7 @@
63
  "processQuestions": [
64
  {
65
  "id":"B1",
66
- "text":"What capability/risk claims is this category evaluating and why it's applicable?",
67
  "tooltip":"Expect: Clear scope, success/failure definitions, hypotheses the evaluation is testing.",
68
  "hint":"Define scope and hypotheses."
69
  },
@@ -87,7 +87,7 @@
87
  },
88
  {
89
  "id":"B5",
90
- "text":"Standards & Compliance Alignment - Are evaluation practices aligned with relevant organizational, industry, or regulatory standards?",
91
  "tooltip":"Expect: References to applicable standards/regulations, mapping of evaluation practices to those standards, any gaps or exemptions noted, and plan to address misalignment.",
92
  "hint":"Map evaluation practices to standards and note gaps."
93
  },
 
37
  },
38
  {
39
  "id":"A3",
40
+ "text":"Has performance been compared to baselines, SOTA, previous versions, and other comparable systems?",
41
  "tooltip":"Expect: Side-by-side comparisons with SOTA models, previous versions, and similar systems under matched conditions, significance tests or confidence intervals for deltas.",
42
  "hint":"Provide comparative scores and targets."
43
  },
44
  {
45
  "id":"A4",
46
+ "text":"Has the system been tested under adversarial inputs, extreme loads, or distribution shift?",
47
  "tooltip":"Expect: Test types (attack/shift/load), rates of failure/degradation, robustness metrics.",
48
  "hint":"Describe stress tests and observed failure rates."
49
  },
 
63
  "processQuestions": [
64
  {
65
  "id":"B1",
66
+ "text":"Are the capability/risk claims and applicability for this category clearly documented?",
67
  "tooltip":"Expect: Clear scope, success/failure definitions, hypotheses the evaluation is testing.",
68
  "hint":"Define scope and hypotheses."
69
  },
 
87
  },
88
  {
89
  "id":"B5",
90
+ "text":"Are evaluation practices aligned with relevant organizational, industry, or regulatory standards?",
91
  "tooltip":"Expect: References to applicable standards/regulations, mapping of evaluation practices to those standards, any gaps or exemptions noted, and plan to address misalignment.",
92
  "hint":"Map evaluation practices to standards and note gaps."
93
  },