Fix enumeration in yourbench_task.py
yourbench_space/lighteval_task/yourbench_task.py
CHANGED
@@ -56,10 +56,10 @@ JUDGE_ANSWER_SYSTEM_PROMPT = """You will be provided with the summary of a docum
 4. **Ground Truth Answer Understanding**:
 - Understand the provided ground truth answer, identifying its key points.
 
-
+5. **Answer Understanding**:
 - Examine the Model Answer, identifying key points and assessing accuracy and factuality.
 
-
+6. **Final Answer**:
 - 0 or 1 (0 if the model answer is incorrect, 1 if it is correct).
 
 # Output Format
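After the renumbering, the visible portion of JUDGE_ANSWER_SYSTEM_PROMPT reads as a contiguous rubric. The snippet below reproduces only the lines shown in this hunk; steps 1-3 and the body of the output-format section are not part of the diff, so they are elided, and the variable name is just a label for the excerpt, not an identifier from the Space.

```python
# Excerpt of the judge rubric as it reads after this commit; steps 1-3 and the
# "# Output Format" body are elided because they are not shown in the diff.
PROMPT_EXCERPT = """\
4. **Ground Truth Answer Understanding**:
- Understand the provided ground truth answer, identifying its key points.

5. **Answer Understanding**:
- Examine the Model Answer, identifying key points and assessing accuracy and factuality.

6. **Final Answer**:
- 0 or 1 (0 if the model answer is incorrect, 1 if it is correct).

# Output Format
"""
```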
@@ -151,7 +151,7 @@ def get_judge_prompt(question: str, answer: str, gold: str, **kwargs):
     chunk = kwargs.get("chunks", "")
     summary = kwargs.get("documents", "")
 
-
+    prompt = [
         {"role": "system", "content": JUDGE_ANSWER_SYSTEM_PROMPT},
         {
             "role": "user",
@@ -161,6 +161,8 @@ def get_judge_prompt(question: str, answer: str, gold: str, **kwargs):
         },
     ]
 
+    return prompt
+
 
 def process_judge_response_yourbench(response):
     # extract the final answer using regex from the response xml
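With the `prompt = [` assignment above and the new `return prompt`, get_judge_prompt now builds and returns a two-message chat prompt. A sketch of the full function under that reading follows; only the lines visible in the diff come from the Space, and the user-message body (file lines 158-160, not shown here) is an assumed placeholder rather than the actual template.

```python
# Sketch of get_judge_prompt after this change; the user-message content is an
# assumption, since those lines are not part of the diff.
JUDGE_ANSWER_SYSTEM_PROMPT = "..."  # full rubric defined earlier in yourbench_task.py


def get_judge_prompt(question: str, answer: str, gold: str, **kwargs):
    chunk = kwargs.get("chunks", "")
    summary = kwargs.get("documents", "")

    prompt = [
        {"role": "system", "content": JUDGE_ANSWER_SYSTEM_PROMPT},
        {
            "role": "user",
            # Assumed placeholder: the real template presumably interpolates the
            # document summary, chunk, question, gold answer and model answer.
            "content": (
                f"Summary: {summary}\nChunk: {chunk}\n"
                f"Question: {question}\nGround truth: {gold}\nModel answer: {answer}"
            ),
        },
    ]

    return prompt
```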
@@ -175,13 +177,16 @@ def process_judge_response_yourbench(response):
 class JudgeLLMYourBench(JudgeLLM):
     def __init__(self):
         super().__init__(
-            judge_model_name="
+            judge_model_name="Qwen/QwQ-32B",
             template=get_judge_prompt,
             process_judge_response=process_judge_response_yourbench,
-            judge_backend="
+            judge_backend="hf-inference",
             short_judge_name="yourbench_judge",
+            hf_provider="novita",
+            max_tokens=2048,
         )
 
+
     def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[Doc]) -> list[dict[str, float]]:
         # If we are evaluating a multiturn task, we need to have specific field in the formatted doc
         questions = [formatted_doc.specific["question"] for formatted_doc in formatted_docs]
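The `process_judge_response=process_judge_response_yourbench` argument points at the parser whose only visible parts in this diff are its signature and the comment "extract the final answer using regex from the response xml". A minimal sketch of what such a parser could look like is below, assuming the judge wraps its 0/1 verdict in an `<answer>` tag; the tag name, the fallback, and the body as a whole are assumptions, not code from the Space.

```python
import re


def process_judge_response_sketch(response: str) -> int:
    # Assumed behaviour: pull the 0/1 verdict out of an XML-style tag in the
    # judge's reply; the actual tag used by yourbench_task.py is not shown here.
    match = re.search(r"<answer>\s*([01])\s*</answer>", response, re.IGNORECASE | re.DOTALL)
    if match:
        return int(match.group(1))
    # Assumed fallback: treat a reply with no parsable verdict as incorrect.
    return 0
```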
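The four added constructor arguments route judging to Qwen/QwQ-32B through lighteval's `hf-inference` backend, served by the `novita` provider, with the judge's completions capped at 2048 tokens. Outside of lighteval, the same routing can be exercised directly with `huggingface_hub` as a quick sanity check; this snippet is not part of the Space and needs a valid Hugging Face token with Inference Providers access in the environment.

```python
from huggingface_hub import InferenceClient

# Standalone sanity check for the judge configuration added in this commit:
# same model, provider and token budget, called outside of lighteval.
# Requires HF_TOKEN (or a cached login) for Inference Providers access.
client = InferenceClient(provider="novita")
completion = client.chat.completions.create(
    model="Qwen/QwQ-32B",
    messages=[{"role": "user", "content": "Reply with the single digit 1."}],
    max_tokens=2048,
)
print(completion.choices[0].message.content)
```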