Spaces:
Running
Running
Avijit Ghosh
committed on
Commit
·
a71573c
1
Parent(s):
8cfd3a8
fixed some bugs
Browse files
schema/evaluation-schema.json
CHANGED
|
@@ -37,13 +37,13 @@
|
|
| 37 |
},
|
| 38 |
{
|
| 39 |
"id":"A3",
|
| 40 |
-
"text":"
|
| 41 |
"tooltip":"Expect: Side-by-side comparisons with SOTA models, previous versions, and similar systems under matched conditions, significance tests or confidence intervals for deltas.",
|
| 42 |
"hint":"Provide comparative scores and targets."
|
| 43 |
},
|
| 44 |
{
|
| 45 |
"id":"A4",
|
| 46 |
-
"text":"
|
| 47 |
"tooltip":"Expect: Test types (attack/shift/load), rates of failure/degradation, robustness metrics.",
|
| 48 |
"hint":"Describe stress tests and observed failure rates."
|
| 49 |
},
|
|
@@ -63,7 +63,7 @@
|
|
| 63 |
"processQuestions": [
|
| 64 |
{
|
| 65 |
"id":"B1",
|
| 66 |
-
"text":"
|
| 67 |
"tooltip":"Expect: Clear scope, success/failure definitions, hypotheses the evaluation is testing.",
|
| 68 |
"hint":"Define scope and hypotheses."
|
| 69 |
},
|
|
@@ -87,7 +87,7 @@
|
|
| 87 |
},
|
| 88 |
{
|
| 89 |
"id":"B5",
|
| 90 |
-
"text":"
|
| 91 |
"tooltip":"Expect: References to applicable standards/regulations, mapping of evaluation practices to those standards, any gaps or exemptions noted, and plan to address misalignment.",
|
| 92 |
"hint":"Map evaluation practices to standards and note gaps."
|
| 93 |
},
|
|
|
|
| 37 |
},
|
| 38 |
{
|
| 39 |
"id":"A3",
|
| 40 |
+
"text":"Has performance been compared to baselines, SOTA, previous versions, and other comparable systems?",
|
| 41 |
"tooltip":"Expect: Side-by-side comparisons with SOTA models, previous versions, and similar systems under matched conditions, significance tests or confidence intervals for deltas.",
|
| 42 |
"hint":"Provide comparative scores and targets."
|
| 43 |
},
|
| 44 |
{
|
| 45 |
"id":"A4",
|
| 46 |
+
"text":"Has the system been tested under adversarial inputs, extreme loads, or distribution shift?",
|
| 47 |
"tooltip":"Expect: Test types (attack/shift/load), rates of failure/degradation, robustness metrics.",
|
| 48 |
"hint":"Describe stress tests and observed failure rates."
|
| 49 |
},
|
|
|
|
| 63 |
"processQuestions": [
|
| 64 |
{
|
| 65 |
"id":"B1",
|
| 66 |
+
"text":"Are the capability/risk claims and applicability for this category clearly documented?",
|
| 67 |
"tooltip":"Expect: Clear scope, success/failure definitions, hypotheses the evaluation is testing.",
|
| 68 |
"hint":"Define scope and hypotheses."
|
| 69 |
},
|
|
|
|
| 87 |
},
|
| 88 |
{
|
| 89 |
"id":"B5",
|
| 90 |
+
"text":"Are evaluation practices aligned with relevant organizational, industry, or regulatory standards?",
|
| 91 |
"tooltip":"Expect: References to applicable standards/regulations, mapping of evaluation practices to those standards, any gaps or exemptions noted, and plan to address misalignment.",
|
| 92 |
"hint":"Map evaluation practices to standards and note gaps."
|
| 93 |
},
|