Commit · d0e8be9
1 Parent(s): a70555b
ruff format everything
Files changed:
- app.py +42 -39
- src/display/css_html_js.py +1 -1
- src/display/utils.py +3 -2
- src/gen/gen_answer.py +54 -46
- src/gen/gen_judgment.py +23 -22
- src/gen/show_result.py +49 -38
- src/gen/utils.py +25 -44
- src/leaderboard/build_leaderboard.py +34 -18
- src/leaderboard/filter_models.py +5 -6
- src/leaderboard/read_evals.py +27 -29
- src/populate.py +1 -3
- src/scripts/create_request_file.py +1 -1
- src/scripts/update_all_request_files.py +2 -2
- src/submission/check_validity.py +1 -1
- src/submission/submit.py +2 -22
- src/tools/plots.py +1 -1
app.py CHANGED

@@ -24,39 +24,33 @@ from src.envs import (
 )
 from src.leaderboard.build_leaderboard import build_leadearboard_df
 
+os.environ["GRADIO_ANALYTICS_ENABLED"] = "false"
 
 # Configure logging
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 
 # Start ephemeral Spaces on PRs (see config in README.md)
 enable_space_ci()
 
+
 def restart_space():
     API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
 
 
 def build_demo():
-    demo = gr.Blocks(
-        title = "Chatbot Arena Leaderboard",
-        css=custom_css
-    )
+    demo = gr.Blocks(title="Chatbot Arena Leaderboard", css=custom_css)
     leaderboard_df = build_leadearboard_df()
     with demo:
         gr.HTML(TITLE)
         gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
+        with gr.Tabs(elem_classes="tab-buttons"):
             with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+                Leaderboard(
                     value=leaderboard_df,
                     datatype=[c.type for c in fields(AutoEvalColumn)],
                     select_columns=SelectColumns(
-                        default_selection=[
-                            c.name
-                            for c in fields(AutoEvalColumn)
-                            if c.displayed_by_default
-                        ],
+                        default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
                        cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.dummy],
                        label="Select Columns to Display:",
                    ),

@@ -67,50 +61,59 @@ def build_demo():
                 ],
             )
 
-            #with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=1):
+            # with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=1):
             #     gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-            #with gr.TabItem("❗FAQ", elem_id="llm-benchmark-tab-table", id=2):
+            # with gr.TabItem("❗FAQ", elem_id="llm-benchmark-tab-table", id=2):
             #     gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
 
             with gr.TabItem("🚀 Submit ", elem_id="llm-benchmark-tab-table", id=3):
                 with gr.Row():
                     gr.Markdown("# ✨ Submit your model here!", elem_classes="markdown-text")
 
                 with gr.Column():
+                    model_name_textbox = gr.Textbox(label="Model name")
+
+                    def upload_file(file):
+                        file_path = file.name.split("/")[-1] if "/" in file.name else file.name
+                        logging.info("New submition: file saved to %s", file_path)
+                        API.upload_file(
+                            path_or_fileobj=file.name,
+                            path_in_repo="./external/" + file_path,
+                            repo_id="Vikhrmodels/openbench-eval",
+                            repo_type="dataset",
+                        )
+                        os.environ[RESET_JUDGEMENT_ENV] = "1"
+                        return file.name
+
+                    if model_name_textbox:
+                        file_output = gr.File()
+                        upload_button = gr.UploadButton(
+                            "Click to Upload & Submit Answers", file_types=["*"], file_count="single"
+                        )
+                        upload_button.upload(upload_file, upload_button, file_output)
+
     return demo
+
+
 # print(os.system('cd src/gen && ../../.venv/bin/python gen_judgment.py'))
 # print(os.system('cd src/gen/ && python show_result.py --output'))
+
+
 def update_board():
     need_reset = os.environ.get(RESET_JUDGEMENT_ENV)
+    if need_reset != "1":
         return
+    os.environ[RESET_JUDGEMENT_ENV] = "0"
+    subprocess.run(["python", "src/gen/gen_judgement.py"], check=False)
+    subprocess.Popen("python3.src/gen/show_result.py --output")
 
 
 if __name__ == "__main__":
+    os.environ[RESET_JUDGEMENT_ENV] = "1"
+
     scheduler = BackgroundScheduler()
+    scheduler.add_job(update_board, "interval", minutes=10)
     scheduler.start()
+
     demo_app = build_demo()
     demo_app.launch(debug=True)
src/display/css_html_js.py CHANGED

@@ -88,4 +88,4 @@ get_window_url_params = """
     url_params = Object.fromEntries(params);
     return url_params;
     }
-"""
+"""
src/display/utils.py CHANGED

@@ -7,7 +7,8 @@ import pandas as pd
 
 
 # Configure logging
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+
 
 def parse_datetime(datetime_str):
     formats = [

@@ -25,6 +26,7 @@ def parse_datetime(datetime_str):
     logging.error(f"No valid date format found for: {datetime_str}")
     return datetime(1970, 1, 1)
 
+
 def load_json_data(file_path):
     """Safely load JSON data from a file."""
     try:

@@ -98,7 +100,6 @@ auto_eval_column_dict.append(["score", ColumnContent, ColumnContent("score", "nu
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
 
-
 @dataclass(frozen=True)
 class EvalQueueColumn:  # Queue column
     model = ColumnContent("model", "markdown", True)
src/gen/gen_answer.py CHANGED

@@ -33,7 +33,14 @@ from utils import (
 
 
 def get_answer(
+    question: dict,
+    model: str,
+    endpoint_info: dict,
+    num_choices: int,
+    max_tokens: int,
+    temperature: float,
+    answer_file: str,
+    api_dict: dict,
 ):
     if question["category"] in temperature_config:
         temperature = temperature_config[question["category"]]

@@ -54,49 +61,56 @@ def get_answer(
         for j in range(len(question["turns"])):
             conv.append({"role": "user", "content": question["turns"][j]["content"]})
             if api_type == "anthropic":
+                output = chat_completion_anthropic(
+                    model=endpoint_info["model_name"], messages=conv, temperature=temperature, max_tokens=max_tokens
+                )
             elif api_type == "mistral":
+                output = chat_completion_mistral(
+                    model=endpoint_info["model_name"], messages=conv, temperature=temperature, max_tokens=max_tokens
+                )
             elif api_type == "yandex":
+                output = chat_completion_yandex(
+                    model=endpoint_info["model_name"],
+                    messages=conv,
+                    temperature=temperature,
+                    max_tokens=max_tokens,
+                    api_dict=api_dict,
+                )
             elif api_type == "gigachat":
+                output = chat_completion_gigachat(
+                    model=endpoint_info["model_name"],
+                    messages=conv,
+                    temperature=temperature,
+                    max_tokens=max_tokens,
+                    api_dict=api_dict,
+                )
             elif api_type == "gemini":
+                output = chat_completion_gemini(
+                    model=endpoint_info["model_name"],
+                    messages=question["turns"][j]["content"],
+                    temperature=temperature,
+                    max_tokens=max_tokens,
+                )
             elif api_type == "azure":
+                output = chat_completion_openai_azure(
+                    model=endpoint_info["model_name"],
+                    messages=conv,
+                    temperature=temperature,
+                    max_tokens=max_tokens,
+                    api_dict=api_dict,
+                )
             elif api_type == "cohere":
+                output = chat_completion_cohere(
+                    model=endpoint_info["model_name"], messages=conv, temperature=temperature, max_tokens=max_tokens
+                )
             else:
+                output = chat_completion_openai(
+                    model=endpoint_info["model_name"],
+                    messages=conv,
+                    temperature=temperature,
+                    max_tokens=max_tokens,
+                    api_dict=api_dict,
+                )
             conv.append({"role": "assistant", "content": output})
 
             turns.append({"content": output, "token_len": len(encoding.encode(output))})

@@ -118,12 +132,8 @@ def get_answer(
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--endpoint-file", type=str, default="config/api_config.yaml"
-    )
+    parser.add_argument("--setting-file", type=str, default="config/gen_answer_config.yaml")
+    parser.add_argument("--endpoint-file", type=str, default="config/api_config.yaml")
     args = parser.parse_args()
 
     settings = make_config(args.setting_file)

@@ -187,9 +197,7 @@ if __name__ == "__main__":
             futures.append(future)
         if count > 0:
             print(f"{count} number of existing answers")
-        for future in tqdm.tqdm(
-            concurrent.futures.as_completed(futures), total=len(futures)
-        ):
+        for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            future.result()
 
    reorg_answer_file(answer_file)
src/gen/gen_judgment.py CHANGED

@@ -55,12 +55,7 @@ def judgment(**args):
 
     num_games = 2 if configs["pairwise"] else 1
 
-    output = {
-        "question_id":question["question_id"],
-        "model":answer["model_id"],
-        "judge": model,
-        "games":[]
-    }
+    output = {"question_id": question["question_id"], "model": answer["model_id"], "judge": model, "games": []}
 
     for game in range(num_games):
         conv = [{"role": "system", "content": configs["system_prompt"]}]

@@ -73,7 +68,7 @@ def judgment(**args):
             base = 1
 
             if baseline:
-                if game % 2 == 1:
+                if game % 2 == 1:  # swap position
                     temp = baseline
                     baseline = answer
                     answer = temp

@@ -103,7 +98,7 @@ def judgment(**args):
                 args["endpoint_dict"],
             )
 
+            judgment += "\n" + new_judgment
 
             score, try_again = get_score(judgment, args["regex_pattern"])

@@ -112,18 +107,21 @@ def judgment(**args):
             if not try_again:
                 break
 
+            conv.append(
+                {"role": "user", "content": "continue your judgment and finish by outputting a final verdict label"}
+            )
 
-        result = {
-            "user_prompt": conv[1]["content"],
-            "judgment": judgment,
-            "score":score
-        }
+        result = {"user_prompt": conv[1]["content"], "judgment": judgment, "score": score}
         output["games"].append(result)
 
     with open(output_file, "a") as f:
         f.write(json.dumps(output, ensure_ascii=False) + "\n")
+    huggingface_hub.HfApi().upload_file(
+        output_file,
+        path_in_repo=f'model_judgment/{configs['judge_model']}/{output_file.split('/')[-1]}',
+        repo_id="Vikhrmodels/openbench-eval",
+        repo_type="dataset",
+    )
 
 
 if __name__ == "__main__":

@@ -136,8 +134,10 @@ if __name__ == "__main__":
     configs = make_config(args.setting_file)
     endpoint_list = make_config(args.endpoint_file)
 
+    print(
+        f'judge model: {configs["judge_model"]}, baseline: {configs["baseline"]}, baseline model: {configs["baseline_model"]}, reference: {configs["reference"]}, '
+        + f'reference models: {configs["ref_model"]}, temperature: {configs["temperature"]}, max tokens: {configs["max_tokens"]}, pairwise: {configs["pairwise"]}'
+    )
 
     if configs["regex_pattern"]:
         pattern = re.compile(configs["regex_pattern"])

@@ -150,12 +150,15 @@ if __name__ == "__main__":
     questions = load_questions(question_file)
     model_answers_external = load_model_answers(external_dir)
     model_answers_internal = load_model_answers(internal_dir)
+
     # internal has priority
     model_answers = {**model_answers_external, **model_answers_internal}
 
     # if user choose a set of models, only judge those models
+    models = [
+        model.split("/")[-1].split(".")[0]
+        for model in glob.glob("./data/arena-hard-v0.1/model_answer/external/*.jsonl")
+    ]
 
     ref_answers = None
     if configs["reference"]:

@@ -214,7 +217,5 @@ if __name__ == "__main__":
         if count > 0:
             print(f"{count} number of existing judgments")
 
-        for future in tqdm(
-            concurrent.futures.as_completed(futures), total=len(futures)
-        ):
+        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            future.result()
src/gen/show_result.py CHANGED

@@ -2,7 +2,6 @@ import pandas as pd
 import numpy as np
 import plotly.express as px
 
-import tiktoken
 import datetime
 import argparse
 import os

@@ -15,6 +14,7 @@ from sklearn.linear_model import LogisticRegression
 from collections import defaultdict
 from utils import load_model_answers
 
+
 def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
     models = pd.concat([df["model_a"], df["model_b"]]).unique()
     models = pd.Series(np.arange(len(models)), index=models)

@@ -35,18 +35,18 @@ def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
     # one tie => one A win + one B win
     # find tie + tie (both bad) index
     tie_idx = (df["winner"] == "tie") | (df["winner"] == "tie (bothbad)")
-    tie_idx[len(tie_idx)//2:] = False
+    tie_idx[len(tie_idx) // 2 :] = False
     Y[tie_idx] = 1.0
 
     lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-8)
-    lr.fit(X,Y)
+    lr.fit(X, Y)
 
     elo_scores = SCALE * lr.coef_[0] + INIT_RATING
 
     # set anchor as gpt-3.5-turbo-0125 = 1000
     if "gpt-3.5-turbo-0125" in models.index:
         elo_scores += 1000 - elo_scores[models["gpt-3.5-turbo-0125"]]
+    return pd.Series(elo_scores, index=models.index).sort_values(ascending=False)

@@ -58,9 +58,14 @@ def get_bootstrap_result(battles, func_compute_elo, num_round):
 
 
 def preety_print_two_ratings(ratings_1, ratings_2, column_names):
+    df = (
+        pd.DataFrame(
+            [[n, ratings_1[n], ratings_2[n]] for n in ratings_1.keys()],
+            columns=["Model", column_names[0], column_names[1]],
+        )
+        .sort_values(column_names[0], ascending=False)
+        .reset_index(drop=True)
+    )
     df[column_names[0]] = (df[column_names[0]] + 0.5).astype(int)
     df[column_names[1]] = (df[column_names[1]] + 0.5).astype(int)
     df.index = df.index + 1

@@ -68,18 +73,24 @@ def preety_print_two_ratings(ratings_1, ratings_2, column_names):
 
 
 def visualize_bootstrap_scores(df, title):
+    bars = (
+        pd.DataFrame(dict(lower=df.quantile(0.025), rating=df.quantile(0.5), upper=df.quantile(0.975)))
+        .reset_index(names="model")
+        .sort_values("rating", ascending=False)
+    )
+    bars["error_y"] = bars["upper"] - bars["rating"]
+    bars["error_y_minus"] = bars["rating"] - bars["lower"]
+    bars["rating_rounded"] = np.round(bars["rating"], 2)
+    fig = px.scatter(
+        bars,
+        x="model",
+        y="rating",
+        error_y="error_y",
+        error_y_minus="error_y_minus",
+        text="rating_rounded",
+        title=title,
+    )
+    fig.update_layout(xaxis_title="Model", yaxis_title="Rating", height=600)
     return fig

@@ -92,10 +103,7 @@ def predict_win_rate(elo_ratings, SCALE=400, BASE=10, INIT_RATING=1000):
             wins[a][b] = ea
             wins[b][a] = 1 - ea
 
-    data = {
-        a: [wins[a][b] if a != b else np.NAN for b in names]
-        for a in names
-    }
+    data = {a: [wins[a][b] if a != b else np.NAN for b in names] for a in names}
 
     df = pd.DataFrame(data, index=names)
     df.index.name = "model_a"

@@ -121,9 +129,7 @@ def get_battles_from_judgment(judge_name, first_game_only=False, WEIGHT=3):
 
     for _, row in df.iterrows():
         # game 1
-        output = {"question_id": row["question_id"],
-                  "model_a": "gpt-3.5-turbo-0125",
-                  "model_b": row["model"]}
+        output = {"question_id": row["question_id"], "model_a": "gpt-3.5-turbo-0125", "model_b": row["model"]}
 
         game = row["games"][0]

@@ -148,9 +154,7 @@ def get_battles_from_judgment(judge_name, first_game_only=False, WEIGHT=3):
 
         if not first_game_only:
             # game 2
-            output = {"question_id": row["question_id"],
-                      "model_a": "gpt-3.5-turbo-0125",
-                      "model_b": row["model"]}
+            output = {"question_id": row["question_id"], "model_a": "gpt-3.5-turbo-0125", "model_b": row["model"]}
 
             game = row["games"][1]

@@ -190,7 +194,9 @@ if __name__ == "__main__":
     parser.add_argument("--first-game-only", action="store_true")
     args = parser.parse_args()
     print(args)
+    assert not args.load_bootstrap or (
+        args.load_battles and args.load_bootstrap
+    ), "If loading prexisting bootstrapping data, you must also load preexisting battles."
 
     answer_dir = os.path.join("data", args.bench_name, "model_answer/external")
     model_answers = load_model_answers(answer_dir)

@@ -203,7 +209,6 @@ if __name__ == "__main__":
 
     bootstrap_online_elo = compute_mle_elo(battles)
 
-
     if args.load_bootstrap:
         bootstrap_elo_lu = pd.read_json("data/bootstrapping_results.jsonl", lines=True)
     else:

@@ -213,7 +218,7 @@ if __name__ == "__main__":
 
     stats = pd.DataFrame()
     stats["results"] = None
+    stats["results"] = stats["results"].astype("object")
 
     for i, model in enumerate(bootstrap_online_elo.index):
         assert model in bootstrap_elo_lu.columns

@@ -241,18 +246,24 @@ if __name__ == "__main__":
         decimal = 1
     else:
         decimal = 0
+    stats = stats.astype({"score": int, "lower": int, "upper": int})
 
     stats.sort_values(by="score", ascending=False, inplace=True)
     for _, row in stats.iterrows():
+        interval = str((round(row["lower"] - row["score"], decimal), round(row["upper"] - row["score"], decimal)))
+        print(
+            f"{row['model'] : <30} | score: {round(row['score'], decimal) : ^5} | 95% CI: {interval : ^12} | average #tokens: {int(row['avg_tokens'])}"
+        )
 
     if args.output:
         cur_date = datetime.datetime.now()
         date_str = cur_date.strftime("%Y%m%d")
         stats.to_json(f"arena_hard_leaderboard_{date_str}.json", orient="records", indent=4)
         import huggingface_hub
+
+        huggingface_hub.HfApi().upload_file(
+            path_or_fileobj=f"arena_hard_leaderboard_{date_str}.json",
+            path_in_repo="evals/upd.json",
+            repo_id="Vikhrmodels/openbench-eval",
+            repo_type="dataset",
+        )
src/gen/utils.py CHANGED

@@ -77,9 +77,7 @@ def get_endpoint(endpoint_list):
         return None
     assert endpoint_list is not None
     # randomly pick one
-    api_dict = random.choices(
-        endpoint_list
-    )[0]
+    api_dict = random.choices(endpoint_list)[0]
     return api_dict

@@ -91,9 +89,11 @@ def make_config(config_file: str) -> dict:
 
     return config_kwargs
 
+
 def chat_completion_gigachat(model, messages, temperature, max_tokens, api_dict=None):
     from gigachat import GigaChat
     from gigachat.models import Chat, Messages
+
     assert api_dict is not None, "no api settings provided!"
     auth_token = api_dict.get("auth_token", os.environ.get(api_dict["auth_token"], ""))
     client = GigaChat(credentials=auth_token, model=model, verify_ssl_certs=False)

@@ -115,15 +115,13 @@ def chat_completion_gigachat(model, messages, temperature, max_tokens, api_dict=
 
     return output
 
+
 def chat_completion_yandex(model, messages, temperature, max_tokens, api_dict=None):
     from yandex_gpt import YandexGPT, YandexGPTConfigManagerForIAMToken
+
     assert api_dict is not None, "no api settings provided!"
     iam_token = api_dict.get("iam_token", os.environ.get(api_dict["iam_token_ENV"], ""))
-    config = YandexGPTConfigManagerForIAMToken(
-        model_type=model,
-        catalog_id=api_dict["catalog_id"],
-        iam_token=iam_token
-    )
+    config = YandexGPTConfigManagerForIAMToken(model_type=model, catalog_id=api_dict["catalog_id"], iam_token=iam_token)
     client = YandexGPT(config_manager=config)
 
     messages = [{"role": m["role"], "text": m["content"]} for m in messages]

@@ -147,6 +145,7 @@ def chat_completion_yandex(model, messages, temperature, max_tokens, api_dict=No
 
 def chat_completion_openai(model, messages, temperature, max_tokens, api_dict=None):
     import openai
+
     api_key = api_dict.get("api_key", os.environ.get(api_dict["api_key_ENV"], ""))
     if api_dict:
         client = openai.OpenAI(

@@ -165,8 +164,8 @@ def chat_completion_openai(model, messages, temperature, max_tokens, api_dict=No
                 messages=messages,
                 temperature=temperature,
                 max_tokens=max_tokens,
-                stop=["</s>", "<eos>", "<|eot_id|>"]
+                stop=["</s>", "<eos>", "<|eot_id|>"],
+            )
             output = completion.choices[0].message.content
             break
         except openai.RateLimitError as e:

@@ -175,7 +174,7 @@ def chat_completion_openai(model, messages, temperature, max_tokens, api_dict=No
         except openai.BadRequestError as e:
             print(messages)
             print(type(e), e)
-        except KeyError:
+        except KeyError as e:
             print(type(e), e)
             break

@@ -189,11 +188,7 @@ def chat_completion_openai_azure(model, messages, temperature, max_tokens, api_d
     api_base = api_dict["api_base"]
     api_key = api_dict.get("api_key", os.environ.get(api_dict["api_key_ENV"], ""))
     client = AzureOpenAI(
+        azure_endpoint=api_base, api_key=api_key, api_version=api_dict["api_version"], timeout=240, max_retries=2
     )
 
     output = API_ERROR_OUTPUT

@@ -215,7 +210,7 @@ def chat_completion_openai_azure(model, messages, temperature, max_tokens, api_d
         except openai.BadRequestError as e:
             print(type(e), e)
             break
-        except KeyError:
+        except KeyError as e:
             print(type(e), e)
             break

@@ -246,7 +241,7 @@ def chat_completion_anthropic(model, messages, temperature, max_tokens, api_dict
                 stop_sequences=[anthropic.HUMAN_PROMPT],
                 max_tokens=max_tokens,
                 temperature=temperature,
-                system=sys_msg
+                system=sys_msg,
             )
             output = response.content[0].text
             break

@@ -286,25 +281,14 @@ def chat_completion_mistral(model, messages, temperature, max_tokens):
 
 def chat_completion_gemini(model, messages, temperature, max_tokens):
     import google.generativeai as genai
+
     genai.configure(api_key=os.environ["GEMINI_API_KEY"])
 
     safety_settings = [
-        {
-            "category": "HARM_CATEGORY_HATE_SPEECH",
-            "threshold": "BLOCK_NONE"
-        },
-        {
-            "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
-            "threshold": "BLOCK_NONE"
-        },
-        {
-            "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
-            "threshold": "BLOCK_NONE"
-        },
+        {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
+        {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
+        {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
+        {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
     ]
 
     # Set up the model

@@ -319,9 +303,8 @@ def chat_completion_gemini(model, messages, temperature, max_tokens):
     for _ in range(API_MAX_RETRY):
         try:
             gemini = genai.GenerativeModel(
+                model_name=model, generation_config=generation_config, safety_settings=safety_settings
+            )
 
             convo = gemini.start_chat(history=[])

@@ -344,9 +327,7 @@ def chat_completion_cohere(model, messages, temperature, max_tokens):
     co = cohere.Client(os.environ["COHERE_API_KEY"])
     assert len(messages) > 0
 
-    template_map = {"system":"SYSTEM",
-                    "assistant":"CHATBOT",
-                    "user":"USER"}
+    template_map = {"system": "SYSTEM", "assistant": "CHATBOT", "user": "USER"}
 
     assert messages[-1]["role"] == "user"
     prompt = messages[-1]["content"]

@@ -354,7 +335,7 @@ def chat_completion_cohere(model, messages, temperature, max_tokens):
     if len(messages) > 1:
         history = []
         for message in messages[:-1]:
-            history.append({"role":template_map[message["role"]], "message":message["content"]})
+            history.append({"role": template_map[message["role"]], "message": message["content"]})
     else:
         history = None

@@ -384,9 +365,9 @@ def reorg_answer_file(answer_file):
     """Sort by question id and de-duplication"""
     answers = {}
     with open(answer_file, "r") as fin:
+        for line in fin:
+            qid = json.loads(line)["question_id"]
+            answers[qid] = line
 
     qids = sorted(list(answers.keys()))
     with open(answer_file, "w") as fout:
src/leaderboard/build_leaderboard.py CHANGED

@@ -1,4 +1,3 @@
-
 import json
 import logging
 import os

@@ -11,7 +10,8 @@ from huggingface_hub import snapshot_download
 from src.envs import EVAL_RESULTS_PATH
 
 # Configure logging
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+
 
 def time_diff_wrapper(func):
     def wrapper(*args, **kwargs):

@@ -21,15 +21,17 @@ def time_diff_wrapper(func):
         diff = end_time - start_time
         logging.info(f"Time taken for {func.__name__}: {diff} seconds")
         return result
+
     return wrapper
 
+
 @time_diff_wrapper
 def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, backoff_factor=1.5):
     """Download dataset with exponential backoff retries."""
     attempt = 0
     while attempt < max_attempts:
         try:
+            logging.info("Downloading %s to %s", repo_id, local_dir)
             snapshot_download(
                 repo_id=repo_id,
                 local_dir=local_dir,

@@ -42,27 +44,41 @@ def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, ba
             logging.info("Download successful")
             return
         except Exception as e:
-            wait_time = backoff_factor
+            wait_time = backoff_factor**attempt
             logging.error(f"Error downloading {repo_id}: {e}, retrying in {wait_time}s")
             time.sleep(wait_time)
             attempt += 1
     logging.error(f"Failed to download {repo_id} after {max_attempts} attempts")
 
+
 def build_leadearboard_df():
     """Initializes the application space, loading only necessary data."""
-    # Check ENV LEADERBOARD_DOWNLOAD if wee need to download the leaderboard
-    if os.getenv("LEADERBOARD_DOWNLOAD", "True") == "True":
-        # These downloads only occur on full initialization
-        # try:
-        #     download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
-        #     download_dataset(DYNAMIC_INFO_REPO, DYNAMIC_INFO_PATH)
-        download_dataset("Vikhrmodels/openbench-eval", EVAL_RESULTS_PATH)
-        # print(subprocess.Popen('ls src'))
-        subprocess.run(['rsync', '-avzP', '--ignore-existing', f'{EVAL_RESULTS_PATH[2:]}/external/*', 'src/gen/data/arena-hard-v0.1/model_answer/'], check=False)
-        subprocess.run(['rsync', '-avzP', '--ignore-existing', f'{EVAL_RESULTS_PATH[2:]}/model_judgment/*', 'src/gen/data/arena-hard-v0.1/model_judgement/'], check=False)
-        # except Exception:
-        #     restart_space()
 
-    #
+    # download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
+    # download_dataset(DYNAMIC_INFO_REPO, DYNAMIC_INFO_PATH)
+    download_dataset("Vikhrmodels/openbench-eval", EVAL_RESULTS_PATH)
+    # print(subprocess.Popen('ls src'))
+    subprocess.run(
+        [
+            "rsync",
+            "-avzP",
+            "--ignore-existing",
+            f"{EVAL_RESULTS_PATH}/external/*",
+            "src/gen/data/arena-hard-v0.1/model_answer/",
+        ],
+        check=False,
+    )
+    subprocess.run(
+        [
+            "rsync",
+            "-avzP",
+            "--ignore-existing",
+            f"{EVAL_RESULTS_PATH}/model_judgment/*",
+            "src/gen/data/arena-hard-v0.1/model_judgement/",
+        ],
+        check=False,
+    )
+
+    # Retrieve the leaderboard DataFrame
+    leaderboard_df = pd.DataFrame.from_records(json.load(open("eval-results/evals/upd.json", "r")))
     return leaderboard_df.copy()
src/leaderboard/filter_models.py CHANGED

@@ -137,9 +137,9 @@ def flag_models(leaderboard_data: list[dict]):
         if model_data[AutoEvalColumn.not_flagged.name]:
             flag_key = model_data[AutoEvalColumn.fullname.name]
         else:
+            # Merges and moes are flagged
             flag_key = "merged"
+
         # Reverse the logic: Check for non-flagged models instead
         if flag_key in FLAGGED_MODELS:
             issue_num = FLAGGED_MODELS[flag_key].split("/")[-1]

@@ -147,9 +147,9 @@ def flag_models(leaderboard_data: list[dict]):
                 FLAGGED_MODELS[flag_key],
                 f"See discussion #{issue_num}",
             )
+            model_data[
+                AutoEvalColumn.model.name
+            ] = f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
             model_data[AutoEvalColumn.not_flagged.name] = False
         else:
             model_data[AutoEvalColumn.not_flagged.name] = True

@@ -171,4 +171,3 @@ def remove_forbidden_models(leaderboard_data: list[dict]):
 def filter_models_flags(leaderboard_data: list[dict]):
     leaderboard_data = remove_forbidden_models(leaderboard_data)
     flag_models(leaderboard_data)
-
src/leaderboard/read_evals.py
CHANGED
|
@@ -16,36 +16,36 @@ from src.display.formatting import make_clickable_model
|
|
| 16 |
from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType, parse_datetime
|
| 17 |
|
| 18 |
# Configure logging
|
| 19 |
-
logging.basicConfig(level=logging.INFO, format=
|
|
|
|
| 20 |
|
| 21 |
@dataclass
|
| 22 |
class EvalResult:
|
| 23 |
# Also see src.display.utils.AutoEvalColumn for what will be displayed.
|
| 24 |
-
eval_name: str
|
| 25 |
-
full_model: str
|
| 26 |
org: Optional[str]
|
| 27 |
model: str
|
| 28 |
-
revision: str
|
| 29 |
results: Dict[str, float]
|
| 30 |
precision: Precision = Precision.Unknown
|
| 31 |
-
model_type: ModelType = ModelType.Unknown
|
| 32 |
weight_type: WeightType = WeightType.Original
|
| 33 |
-
architecture: str = "Unknown"
|
| 34 |
license: str = "?"
|
| 35 |
likes: int = 0
|
| 36 |
num_params: int = 0
|
| 37 |
-
date: str = ""
|
| 38 |
still_on_hub: bool = True
|
| 39 |
is_merge: bool = False
|
| 40 |
not_flagged: bool = False
|
| 41 |
status: str = "FINISHED"
|
| 42 |
# List of tags, initialized to a new empty list for each instance to avoid the pitfalls of mutable default arguments.
|
| 43 |
tags: List[str] = field(default_factory=list)
|
| 44 |
-
|
| 45 |
-
|
| 46 |
@classmethod
|
| 47 |
-
def init_from_json_file(cls, json_filepath: str) ->
|
| 48 |
-
with open(json_filepath,
|
| 49 |
data = json.load(fp)
|
| 50 |
|
| 51 |
config = data.get("config_general", {})
|
|
@@ -72,7 +72,7 @@ class EvalResult:
|
|
| 72 |
model=model,
|
| 73 |
results=results,
|
| 74 |
precision=precision,
|
| 75 |
-
revision=config.get("model_sha", "")
|
| 76 |
)
|
| 77 |
|
| 78 |
@staticmethod
|
|
@@ -118,9 +118,8 @@ class EvalResult:
|
|
| 118 |
|
| 119 |
mean_acc = np.mean(accs) * 100.0
|
| 120 |
results[task.benchmark] = mean_acc
|
| 121 |
-
|
| 122 |
-
return results
|
| 123 |
|
|
|
|
| 124 |
|
| 125 |
def update_with_request_file(self, requests_path):
|
| 126 |
"""Finds the relevant request file for the current model and updates info with it."""
|
|
@@ -130,17 +129,17 @@ class EvalResult:
|
|
| 130 |
logging.warning(f"No request file for {self.org}/{self.model}")
|
| 131 |
self.status = "FAILED"
|
| 132 |
return
|
| 133 |
-
|
| 134 |
with open(request_file, "r") as f:
|
| 135 |
request = json.load(f)
|
| 136 |
-
|
| 137 |
self.model_type = ModelType.from_str(request.get("model_type", "Unknown"))
|
| 138 |
self.weight_type = WeightType[request.get("weight_type", "Original")]
|
| 139 |
self.num_params = int(request.get("params", 0)) # Ensuring type safety
|
| 140 |
self.date = request.get("submitted_time", "")
|
| 141 |
self.architecture = request.get("architectures", "Unknown")
|
| 142 |
self.status = request.get("status", "FAILED")
|
| 143 |
-
|
| 144 |
except FileNotFoundError:
|
| 145 |
self.status = "FAILED"
|
| 146 |
logging.error(f"Request file: {request_file} not found for {self.org}/{self.model}")
|
|
@@ -154,7 +153,6 @@ class EvalResult:
|
|
| 154 |
self.status = "FAILED"
|
| 155 |
logging.error(f"Unexpected error {e} for {self.org}/{self.model}")
|
| 156 |
|
| 157 |
-
|
| 158 |
def update_with_dynamic_file_dict(self, file_dict):
|
| 159 |
"""Update object attributes based on the provided dictionary, with error handling for missing keys and type validation."""
|
| 160 |
# Default values set for optional or potentially missing keys.
|
|
@@ -162,11 +160,10 @@ class EvalResult:
|
|
| 162 |
self.likes = int(file_dict.get("likes", 0)) # Ensure likes is treated as an integer
|
| 163 |
self.still_on_hub = file_dict.get("still_on_hub", False) # Default to False if key is missing
|
| 164 |
self.tags = file_dict.get("tags", [])
|
| 165 |
-
|
| 166 |
# Calculate `flagged` only if 'tags' is not empty and avoid calculating each time
|
| 167 |
self.not_flagged = not (any("flagged" in tag for tag in self.tags))
|
| 168 |
|
| 169 |
-
|
| 170 |
def to_dict(self):
|
| 171 |
"""Converts the Eval Result to a dict compatible with our dataframe display"""
|
| 172 |
average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
|
|
@@ -185,8 +182,10 @@ class EvalResult:
|
|
| 185 |
AutoEvalColumn.likes.name: self.likes,
|
| 186 |
AutoEvalColumn.params.name: self.num_params,
|
| 187 |
AutoEvalColumn.still_on_hub.name: self.still_on_hub,
|
| 188 |
-
AutoEvalColumn.merged.name: not(
|
| 189 |
-
AutoEvalColumn.moe.name: not (
|
|
|
|
|
|
|
| 190 |
AutoEvalColumn.not_flagged.name: self.not_flagged,
|
| 191 |
}
|
| 192 |
|
|
@@ -194,16 +193,16 @@ class EvalResult:
|
|
| 194 |
data_dict[task.value.col_name] = self.results[task.value.benchmark]
|
| 195 |
|
| 196 |
return data_dict
|
| 197 |
-
|
| 198 |
|
| 199 |
def get_request_file_for_model(requests_path, model_name, precision):
|
| 200 |
"""Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
|
| 201 |
requests_path = Path(requests_path)
|
| 202 |
pattern = f"{model_name}_eval_request_*.json"
|
| 203 |
-
|
| 204 |
# Using pathlib to find files matching the pattern
|
| 205 |
request_files = list(requests_path.glob(pattern))
|
| 206 |
-
|
| 207 |
# Sort the files by name in descending order to mimic 'reverse=True'
|
| 208 |
request_files.sort(reverse=True)
|
| 209 |
|
|
@@ -214,7 +213,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
|
|
| 214 |
req_content = json.load(f)
|
| 215 |
if req_content["status"] == "FINISHED" and req_content["precision"] == precision.split(".")[-1]:
|
| 216 |
request_file = str(request_file)
|
| 217 |
-
|
| 218 |
# Return empty string if no file found that matches criteria
|
| 219 |
return request_file
|
| 220 |
|
|
@@ -223,9 +222,9 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: st
|
|
| 223 |
"""From the path of the results folder root, extract all needed info for results"""
|
| 224 |
with open(dynamic_path) as f:
|
| 225 |
dynamic_data = json.load(f)
|
| 226 |
-
|
| 227 |
results_path = Path(results_path)
|
| 228 |
-
model_files = list(results_path.rglob(
|
| 229 |
model_files.sort(key=lambda file: parse_datetime(file.stem.removeprefix("results_")))
|
| 230 |
|
| 231 |
eval_results = {}
|
|
@@ -260,4 +259,3 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: st
|
|
| 260 |
continue
|
| 261 |
|
| 262 |
return results
|
| 263 |
-
|
|
|
|
| 16 | from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType, parse_datetime
| 17 |
| 18 | # Configure logging
| 19 | + logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
| 20 | +
| 21 |
| 22 | @dataclass
| 23 | class EvalResult:
| 24 |     # Also see src.display.utils.AutoEvalColumn for what will be displayed.
| 25 | +   eval_name: str  # org_model_precision (uid)
| 26 | +   full_model: str  # org/model (path on hub)
| 27 |     org: Optional[str]
| 28 |     model: str
| 29 | +   revision: str  # commit hash, "" if main
| 30 |     results: Dict[str, float]
| 31 |     precision: Precision = Precision.Unknown
| 32 | +   model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
| 33 |     weight_type: WeightType = WeightType.Original
| 34 | +   architecture: str = "Unknown"  # From config file
| 35 |     license: str = "?"
| 36 |     likes: int = 0
| 37 |     num_params: int = 0
| 38 | +   date: str = ""  # submission date of request file
| 39 |     still_on_hub: bool = True
| 40 |     is_merge: bool = False
| 41 |     not_flagged: bool = False
| 42 |     status: str = "FINISHED"
| 43 |     # List of tags, initialized to a new empty list for each instance to avoid the pitfalls of mutable default arguments.
| 44 |     tags: List[str] = field(default_factory=list)
| 45 | +
| 46 |     @classmethod
| 47 | +   def init_from_json_file(cls, json_filepath: str) -> "EvalResult":
| 48 | +       with open(json_filepath, "r") as fp:
| 49 |             data = json.load(fp)
| 50 |
| 51 |         config = data.get("config_general", {})

| 72 |             model=model,
| 73 |             results=results,
| 74 |             precision=precision,
| 75 | +           revision=config.get("model_sha", ""),
| 76 |         )
| 77 |
| 78 |     @staticmethod

| 118 |
| 119 |             mean_acc = np.mean(accs) * 100.0
| 120 |             results[task.benchmark] = mean_acc
| 121 |
| 122 | +       return results
| 123 |
| 124 |     def update_with_request_file(self, requests_path):
| 125 |         """Finds the relevant request file for the current model and updates info with it."""

| 129 |                 logging.warning(f"No request file for {self.org}/{self.model}")
| 130 |                 self.status = "FAILED"
| 131 |                 return
| 132 | +
| 133 |             with open(request_file, "r") as f:
| 134 |                 request = json.load(f)
| 135 | +
| 136 |             self.model_type = ModelType.from_str(request.get("model_type", "Unknown"))
| 137 |             self.weight_type = WeightType[request.get("weight_type", "Original")]
| 138 |             self.num_params = int(request.get("params", 0))  # Ensuring type safety
| 139 |             self.date = request.get("submitted_time", "")
| 140 |             self.architecture = request.get("architectures", "Unknown")
| 141 |             self.status = request.get("status", "FAILED")
| 142 | +
| 143 |         except FileNotFoundError:
| 144 |             self.status = "FAILED"
| 145 |             logging.error(f"Request file: {request_file} not found for {self.org}/{self.model}")

| 153 |             self.status = "FAILED"
| 154 |             logging.error(f"Unexpected error {e} for {self.org}/{self.model}")
| 155 |
| 156 |     def update_with_dynamic_file_dict(self, file_dict):
| 157 |         """Update object attributes based on the provided dictionary, with error handling for missing keys and type validation."""
| 158 |         # Default values set for optional or potentially missing keys.

| 160 |         self.likes = int(file_dict.get("likes", 0))  # Ensure likes is treated as an integer
| 161 |         self.still_on_hub = file_dict.get("still_on_hub", False)  # Default to False if key is missing
| 162 |         self.tags = file_dict.get("tags", [])
| 163 | +
| 164 |         # Calculate `flagged` only if 'tags' is not empty and avoid calculating each time
| 165 |         self.not_flagged = not (any("flagged" in tag for tag in self.tags))
| 166 |
| 167 |     def to_dict(self):
| 168 |         """Converts the Eval Result to a dict compatible with our dataframe display"""
| 169 |         average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)

| 182 |             AutoEvalColumn.likes.name: self.likes,
| 183 |             AutoEvalColumn.params.name: self.num_params,
| 184 |             AutoEvalColumn.still_on_hub.name: self.still_on_hub,
| 185 | +           AutoEvalColumn.merged.name: not ("merge" in self.tags if self.tags else False),
| 186 | +           AutoEvalColumn.moe.name: not (
| 187 | +               ("moe" in self.tags if self.tags else False) or "moe" in self.full_model.lower()
| 188 | +           ),
| 189 |             AutoEvalColumn.not_flagged.name: self.not_flagged,
| 190 |         }
| 191 |

| 193 |             data_dict[task.value.col_name] = self.results[task.value.benchmark]
| 194 |
| 195 |         return data_dict
| 196 | +
| 197 |
| 198 | def get_request_file_for_model(requests_path, model_name, precision):
| 199 |     """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
| 200 |     requests_path = Path(requests_path)
| 201 |     pattern = f"{model_name}_eval_request_*.json"
| 202 | +
| 203 |     # Using pathlib to find files matching the pattern
| 204 |     request_files = list(requests_path.glob(pattern))
| 205 | +
| 206 |     # Sort the files by name in descending order to mimic 'reverse=True'
| 207 |     request_files.sort(reverse=True)
| 208 |

| 213 |             req_content = json.load(f)
| 214 |             if req_content["status"] == "FINISHED" and req_content["precision"] == precision.split(".")[-1]:
| 215 |                 request_file = str(request_file)
| 216 | +
| 217 |     # Return empty string if no file found that matches criteria
| 218 |     return request_file
| 219 |

| 222 |     """From the path of the results folder root, extract all needed info for results"""
| 223 |     with open(dynamic_path) as f:
| 224 |         dynamic_data = json.load(f)
| 225 | +
| 226 |     results_path = Path(results_path)
| 227 | +   model_files = list(results_path.rglob("results_*.json"))
| 228 |     model_files.sort(key=lambda file: parse_datetime(file.stem.removeprefix("results_")))
| 229 |
| 230 |     eval_results = {}

| 259 |             continue
| 260 |
| 261 |     return results
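Note (not part of this commit): the comment on the `tags` field above refers to the classic mutable-default pitfall. A minimal sketch of why `field(default_factory=list)` is used instead of a plain `tags: List[str] = []` — the class name here is illustrative only:

from dataclasses import dataclass, field
from typing import List


@dataclass
class WithFactory:
    # Each instance gets its own fresh list, as EvalResult.tags does.
    tags: List[str] = field(default_factory=list)


# A bare mutable default (tags: List[str] = []) is rejected by dataclasses at class
# creation time, precisely because every instance would otherwise share one list object.
a, b = WithFactory(), WithFactory()
a.tags.append("flagged")
assert b.tags == []  # b is unaffected; no state leaks between instances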
src/populate.py
CHANGED
@@ -1,5 +1,3 @@
| 1 | - import json
| 2 | - import os
| 3 | import pathlib
| 4 | import pandas as pd
| 5 | from src.display.formatting import has_no_nan_values, make_clickable_model

@@ -21,7 +19,7 @@ def get_evaluation_queue_df(save_path, cols):
| 21 |     save_path = pathlib.Path(save_path)
| 22 |     all_evals = []
| 23 |
| 24 | -   for path in save_path.rglob(
| 25 |         data = load_json_data(path)
| 26 |         if data:
| 27 |             all_evals.append(_process_model_data(data))

| 1 | import pathlib
| 2 | import pandas as pd
| 3 | from src.display.formatting import has_no_nan_values, make_clickable_model

| 19 |     save_path = pathlib.Path(save_path)
| 20 |     all_evals = []
| 21 |
| 22 | +   for path in save_path.rglob("*.json"):
| 23 |         data = load_json_data(path)
| 24 |         if data:
| 25 |             all_evals.append(_process_model_data(data))
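Note (not part of the diff): `get_evaluation_queue_df` simply walks the requests folder recursively and keeps every JSON file it can parse. A self-contained sketch of that pattern — the directory name and the `status` filter are assumptions for illustration:

import json
import pathlib


def load_json_data(path: pathlib.Path):
    """Return parsed JSON, or None if the file is missing or invalid."""
    try:
        return json.loads(path.read_text())
    except (OSError, json.JSONDecodeError):
        return None


save_path = pathlib.Path("eval-queue")  # hypothetical local checkout of the requests repo
pending = []
for path in save_path.rglob("*.json"):  # same recursive glob as above
    data = load_json_data(path)
    if data and data.get("status") == "PENDING":  # illustrative filter
        pending.append(data)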
src/scripts/create_request_file.py
CHANGED
@@ -47,7 +47,7 @@ def main():
| 47 |     eval_entry = {
| 48 |         "model": model_name,
| 49 |         "base_model": base_model,
| 50 | -       "revision": model_info.sha,
| 51 |         "private": False,
| 52 |         "precision": precision,
| 53 |         "weight_type": weight_type,

| 47 |     eval_entry = {
| 48 |         "model": model_name,
| 49 |         "base_model": base_model,
| 50 | +       "revision": model_info.sha,  # force to use the exact model commit
| 51 |         "private": False,
| 52 |         "precision": precision,
| 53 |         "weight_type": weight_type,
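Note (not part of the diff): the added comment documents why `revision` is pinned to `model_info.sha` rather than a branch name. A hedged sketch of where that value comes from, using the public `huggingface_hub` API (the repo id is a placeholder):

from huggingface_hub import HfApi

info = HfApi().model_info("org/model")  # placeholder repo id
# Pinning the exact commit hash keeps the eval request reproducible:
# later pushes to the repository cannot silently change what gets evaluated.
eval_entry = {"model": "org/model", "revision": info.sha, "private": False}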
src/scripts/update_all_request_files.py
CHANGED
@@ -91,6 +91,6 @@ def update_models(file_path, models_on_the_hub):
| 91 |
| 92 | def update_dynamic_files():
| 93 |     # from gen import gen_answer,gen_judgment\
| 94 | -   subprocess.Popen(
| 95 |
| 96 | -   subprocess.Popen(

| 91 |
| 92 | def update_dynamic_files():
| 93 |     # from gen import gen_answer,gen_judgment\
| 94 | +   subprocess.Popen("python3 ../gen/gen_judgement.py")
| 95 |
| 96 | +   subprocess.Popen("python3 ../gen/show_result.py --output")
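Note (not part of this formatting commit): on POSIX, `subprocess.Popen` treats a single string as the name of one executable unless `shell=True` is passed, so the one-string calls above are unlikely to spawn anything. A safer equivalent, as a sketch:

import subprocess

# Argument-list form: no shell involved, arguments are passed through unambiguously.
subprocess.Popen(["python3", "../gen/gen_judgement.py"])
subprocess.Popen(["python3", "../gen/show_result.py", "--output"])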
src/submission/check_validity.py
CHANGED
@@ -49,7 +49,7 @@ def is_model_on_hub(
| 49 |         )  # , force_download=True)
| 50 |         if test_tokenizer:
| 51 |             try:
| 52 | -
| 53 |                     model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
| 54 |                 )
| 55 |             except ValueError as e:

| 49 |         )  # , force_download=True)
| 50 |         if test_tokenizer:
| 51 |             try:
| 52 | +               AutoTokenizer.from_pretrained(
| 53 |                     model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
| 54 |                 )
| 55 |             except ValueError as e:
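Note (not part of the diff): the restored `AutoTokenizer.from_pretrained(...)` call appears to serve purely as a smoke test — if the tokenizer cannot be instantiated, the submission check fails. A minimal sketch of that idea, assuming `transformers` is installed (the helper name and repo id are illustrative):

from typing import Optional

from transformers import AutoTokenizer


def tokenizer_loads(model_name: str, revision: str = "main", token: Optional[str] = None) -> bool:
    """Return True if the tokenizer can be instantiated from the Hub."""
    try:
        AutoTokenizer.from_pretrained(model_name, revision=revision, token=token, trust_remote_code=False)
        return True
    except Exception:  # missing tokenizer files, gated repos, config errors, ...
        return False


# tokenizer_loads("org/model")  # placeholder repo id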
src/submission/submit.py
CHANGED
@@ -1,21 +1,4 @@
| 1 | - import
| 2 | - import os
| 3 | - from datetime import datetime, timezone
| 4 | -
| 5 | - from huggingface_hub import snapshot_download
| 6 | -
| 7 | - from src.display.formatting import styled_error, styled_message, styled_warning
| 8 | - from src.envs import (
| 9 | -     API,
| 10 | -     DYNAMIC_INFO_FILE_PATH,
| 11 | -     DYNAMIC_INFO_PATH,
| 12 | -     DYNAMIC_INFO_REPO,
| 13 | -     EVAL_REQUESTS_PATH,
| 14 | -     H4_TOKEN,
| 15 | -     QUEUE_REPO,
| 16 | -     RATE_LIMIT_PERIOD,
| 17 | -     RATE_LIMIT_QUOTA,
| 18 | - )
| 19 | # from src.leaderboard.filter_models import DO_NOT_SUBMIT_MODELS
| 20 | # from src.submission.check_validity import (
| 21 | #     already_submitted_models,

@@ -38,7 +21,6 @@ def add_new_eval(
| 38 |     # if not REQUESTED_MODELS:
| 39 |     #     REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
| 40 |
| 41 | -
| 42 |     # user_name = ""
| 43 |     # model_path = model
| 44 |     # if "/" in model:

@@ -186,6 +168,4 @@ def add_new_eval(
| 186 |     # # Remove the local file
| 187 |     # os.remove(out_path)
| 188 |
| 189 | -   return styled_message(
| 190 | -       "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour."
| 191 | -   )

| 1 | + from src.display.formatting import styled_message
| 2 | # from src.leaderboard.filter_models import DO_NOT_SUBMIT_MODELS
| 3 | # from src.submission.check_validity import (
| 4 | #     already_submitted_models,

| 21 |     # if not REQUESTED_MODELS:
| 22 |     #     REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
| 23 |
| 24 |     # user_name = ""
| 25 |     # model_path = model
| 26 |     # if "/" in model:

| 168 |     # # Remove the local file
| 169 |     # os.remove(out_path)
| 170 |
| 171 | +   return styled_message("Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour.")
src/tools/plots.py
CHANGED
@@ -3,7 +3,7 @@ import pandas as pd
| 3 | import plotly.express as px
| 4 | from plotly.graph_objs import Figure
| 5 |
| 6 | - from src.display.utils import
| 7 | from src.display.utils import human_baseline_row as HUMAN_BASELINE
| 8 | from src.leaderboard.filter_models import FLAGGED_MODELS
| 9 | from src.leaderboard.read_evals import EvalResult

| 3 | import plotly.express as px
| 4 | from plotly.graph_objs import Figure
| 5 |
| 6 | + from src.display.utils import AutoEvalColumn, Task, Tasks
| 7 | from src.display.utils import human_baseline_row as HUMAN_BASELINE
| 8 | from src.leaderboard.filter_models import FLAGGED_MODELS
| 9 | from src.leaderboard.read_evals import EvalResult