Fix enumeration in yourbench_task.py
yourbench_space/lighteval_task/yourbench_task.py
CHANGED
@@ -56,10 +56,10 @@ JUDGE_ANSWER_SYSTEM_PROMPT = """You will be provided with the summary of a docum
 4. **Ground Truth Answer Understanding**:
 - Understand the provided ground truth answer, identifying its key points.
 
-
+5. **Answer Understanding**:
 - Examine the Model Answer, identifying key points and assessing accuracy and factuality.
 
-
+6. **Final Answer**:
 - 0 or 1 (0 if the model answer is incorrect, 1 if it is correct).
 
 # Output Format
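After the renumbering, the visible portion of JUDGE_ANSWER_SYSTEM_PROMPT reads as a contiguous rubric. The snippet below reproduces only the lines shown in this hunk; steps 1-3 and the body of the output-format section are not part of the diff, so they are elided, and the variable name is just a label for the excerpt, not an identifier from the Space.

```python
# Excerpt of the judge rubric as it reads after this commit; steps 1-3 and the
# "# Output Format" body are elided because they are not shown in the diff.
PROMPT_EXCERPT = """\
4. **Ground Truth Answer Understanding**:
- Understand the provided ground truth answer, identifying its key points.

5. **Answer Understanding**:
- Examine the Model Answer, identifying key points and assessing accuracy and factuality.

6. **Final Answer**:
- 0 or 1 (0 if the model answer is incorrect, 1 if it is correct).

# Output Format
"""
```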
@@ -151,7 +151,7 @@ def get_judge_prompt(question: str, answer: str, gold: str, **kwargs):
     chunk = kwargs.get("chunks", "")
     summary = kwargs.get("documents", "")
 
-
+    prompt = [
         {"role": "system", "content": JUDGE_ANSWER_SYSTEM_PROMPT},
         {
             "role": "user",
@@ -161,6 +161,8 @@ def get_judge_prompt(question: str, answer: str, gold: str, **kwargs):
         },
     ]
 
+    return prompt
+
 
 def process_judge_response_yourbench(response):
     # extract the final answer using regex from the response xml
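With the `prompt = [` assignment above and the new `return prompt`, get_judge_prompt now builds and returns a two-message chat prompt. A sketch of the full function under that reading follows; only the lines visible in the diff come from the Space, and the user-message body (file lines 158-160, not shown here) is an assumed placeholder rather than the actual template.

```python
# Sketch of get_judge_prompt after this change; the user-message content is an
# assumption, since those lines are not part of the diff.
JUDGE_ANSWER_SYSTEM_PROMPT = "..."  # full rubric defined earlier in yourbench_task.py


def get_judge_prompt(question: str, answer: str, gold: str, **kwargs):
    chunk = kwargs.get("chunks", "")
    summary = kwargs.get("documents", "")

    prompt = [
        {"role": "system", "content": JUDGE_ANSWER_SYSTEM_PROMPT},
        {
            "role": "user",
            # Assumed placeholder: the real template presumably interpolates the
            # document summary, chunk, question, gold answer and model answer.
            "content": (
                f"Summary: {summary}\nChunk: {chunk}\n"
                f"Question: {question}\nGround truth: {gold}\nModel answer: {answer}"
            ),
        },
    ]

    return prompt
```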
@@ -175,13 +177,16 @@ def process_judge_response_yourbench(response):
 class JudgeLLMYourBench(JudgeLLM):
     def __init__(self):
         super().__init__(
-            judge_model_name="
+            judge_model_name="Qwen/QwQ-32B",
             template=get_judge_prompt,
             process_judge_response=process_judge_response_yourbench,
-            judge_backend="
+            judge_backend="hf-inference",
             short_judge_name="yourbench_judge",
+            hf_provider="novita",
+            max_tokens=2048,
         )
 
+
     def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[Doc]) -> list[dict[str, float]]:
         # If we are evaluating a multiturn task, we need to have specific field in the formatted doc
         questions = [formatted_doc.specific["question"] for formatted_doc in formatted_docs]
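The `process_judge_response=process_judge_response_yourbench` argument points at the parser whose only visible parts in this diff are its signature and the comment "extract the final answer using regex from the response xml". A minimal sketch of what such a parser could look like is below, assuming the judge wraps its 0/1 verdict in an `<answer>` tag; the tag name, the fallback, and the body as a whole are assumptions, not code from the Space.

```python
import re


def process_judge_response_sketch(response: str) -> int:
    # Assumed behaviour: pull the 0/1 verdict out of an XML-style tag in the
    # judge's reply; the actual tag used by yourbench_task.py is not shown here.
    match = re.search(r"<answer>\s*([01])\s*</answer>", response, re.IGNORECASE | re.DOTALL)
    if match:
        return int(match.group(1))
    # Assumed fallback: treat a reply with no parsable verdict as incorrect.
    return 0
```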
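The four added constructor arguments route judging to Qwen/QwQ-32B through lighteval's `hf-inference` backend, served by the `novita` provider, with the judge's completions capped at 2048 tokens. Outside of lighteval, the same routing can be exercised directly with `huggingface_hub` as a quick sanity check; this snippet is not part of the Space and needs a valid Hugging Face token with Inference Providers access in the environment.

```python
from huggingface_hub import InferenceClient

# Standalone sanity check for the judge configuration added in this commit:
# same model, provider and token budget, called outside of lighteval.
# Requires HF_TOKEN (or a cached login) for Inference Providers access.
client = InferenceClient(provider="novita")
completion = client.chat.completions.create(
    model="Qwen/QwQ-32B",
    messages=[{"role": "user", "content": "Reply with the single digit 1."}],
    max_tokens=2048,
)
print(completion.choices[0].message.content)
```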