Spaces: Runtime error

Commit bb3c194
Nathan Habib committed
1 Parent(s): 56f8b5d

Files changed:
- app.py +91 -0
- requirements.txt +1 -0
- utils.py +114 -0
app.py ADDED
@@ -0,0 +1,91 @@
+import gradio as gr
+import pandas as pd
+import os
+from utils import construct_dataframe, MODELS, get_scores
+
+HF_TOKEN = os.getenv("HF_TOKEN")
+DATAFRAME: pd.DataFrame = construct_dataframe()
+MAX_LINES = 500
+MIN_LINES = 10
+
+
+def get_from_question_id_turn_2(model, question_id: int):
+    new = DATAFRAME.loc[question_id]
+    new = new[new["turn"] == 1]
+    new = new[new["model"] == model]
+
+    prompt_lighteval = new["prompt"].values[0]
+    response_lighteval = new["response"].values[0]
+    judgement_prompt_lighteval = new["judgement_prompt"].values[0]
+    judgement_lighteval = new["judgment"].values[0]
+    score_lighteval = new["score"].values[0]
+
+    return prompt_lighteval, response_lighteval, judgement_prompt_lighteval[1]["content"], judgement_lighteval, score_lighteval
+
+
+def get_from_question_id_turn_1(model, question_id: int):
+    new = DATAFRAME.loc[question_id]
+    new = new[new["turn"] == 0]
+    new = new[new["model"] == model]
+
+    prompt_lighteval = new["prompt"].values[0]
+    response_lighteval = new["response"].values[0]
+    judgement_prompt_lighteval = new["judgement_prompt"].values[0]
+    judgement_lighteval = new["judgment"].values[0]
+    score_lighteval = new["score"].values[0]
+
+    return prompt_lighteval, response_lighteval, judgement_prompt_lighteval[1]["content"], judgement_lighteval, score_lighteval
+
+
+with gr.Blocks() as demo:
+    with gr.Row():
+        model = gr.Dropdown([model.split("__")[1] for model in MODELS], label="Model")
+        index = gr.Dropdown(set(DATAFRAME.index.values.tolist()), label="Index", value=DATAFRAME.index.values.tolist()[0])
+
+    with gr.Row():
+        gr.DataFrame(get_scores(DATAFRAME).reset_index(), interactive=False, )
+
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown("## Turn 1")
+            score_lighteval = gr.Number(label="Score", interactive=False)
+            prompt_lighteval = gr.Textbox(
+                label="Prompt", interactive=False, max_lines=MAX_LINES, lines=MIN_LINES
+            )
+            response_lighteval = gr.Textbox(label="Response", interactive=False, max_lines=MAX_LINES, lines=MIN_LINES)
+            judgement_prompt_lighteval = gr.Textbox(label="Judgement Prompt", interactive=False, max_lines=MAX_LINES, lines=MIN_LINES)
+            judgement_lighteval = gr.Textbox(label="Judgement", interactive=False, max_lines=MAX_LINES, lines=MIN_LINES)
+        with gr.Column():
+            gr.Markdown("## Turn 2")
+            score_lighteval_2 = gr.Number(label="Score", interactive=False)
+            prompt_lighteval_2 = gr.Textbox(
+                label="Prompt", interactive=False, max_lines=MAX_LINES, lines=MIN_LINES
+            )
+            response_lighteval_2 = gr.Textbox(label="Response", interactive=False, max_lines=MAX_LINES, lines=MIN_LINES)
+            judgement_prompt_lighteval_2 = gr.Textbox(label="Judgement Prompt", interactive=False, max_lines=MAX_LINES, lines=MIN_LINES)
+            judgement_lighteval_2 = gr.Textbox(label="Judgement", interactive=False, max_lines=MAX_LINES, lines=MIN_LINES)
+
+
+    index.change(
+        fn=get_from_question_id_turn_1,
+        inputs=[model, index],
+        outputs=[prompt_lighteval, response_lighteval, judgement_prompt_lighteval, judgement_lighteval, score_lighteval],
+    )
+
+    index.change(
+        fn=get_from_question_id_turn_2,
+        inputs=[model, index],
+        outputs=[prompt_lighteval_2, response_lighteval_2, judgement_prompt_lighteval_2, judgement_lighteval_2, score_lighteval_2],
+    )
+    model.change(
+        fn=get_from_question_id_turn_2,
+        inputs=[model, index],
+        outputs=[prompt_lighteval_2, response_lighteval_2, judgement_prompt_lighteval_2, judgement_lighteval_2, score_lighteval_2],
+    )
+    model.change(
+        fn=get_from_question_id_turn_1,
+        inputs=[model, index],
+        outputs=[prompt_lighteval, response_lighteval, judgement_prompt_lighteval, judgement_lighteval, score_lighteval,],
+    )
+
+demo.launch()
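Not part of the commit: a minimal sketch of the lookup the two dropdown callbacks perform, assuming HF_TOKEN grants access to the private details datasets; the question id and model name below are placeholders taken from the defaults in MODELS.

    from utils import construct_dataframe

    df = construct_dataframe()
    qid = df.index[0]                     # a real question_id, as chosen in the Index dropdown
    rows = df.loc[[qid]]                  # every (turn, model) row for that question
    turn1 = rows[(rows["turn"] == 0) & (rows["model"] == "Mistral-7B-Instruct-v0.2")]
    print(turn1["prompt"].values[0])      # prompt shown in the "Turn 1" column
    print(turn1["score"].values[0])       # judge score shown in the Score box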
requirements.txt ADDED
@@ -0,0 +1 @@
+plotly
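Only plotly is pinned here, presumably because utils.py sets pd.options.plotting.backend = "plotly"; the Spaces image is assumed to already bundle gradio, pandas, and datasets. A fuller, hypothetical requirements.txt for running the app outside Spaces might look like:

    gradio
    pandas
    datasets
    plotly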
utils.py ADDED
@@ -0,0 +1,114 @@
+import pandas as pd
+from datasets import load_dataset
+import os
+import json
+from pprint import pprint
+pd.options.plotting.backend = "plotly"
+
+MODELS = [
+    "mistralai__Mistral-7B-Instruct-v0.2",
+    # "HuggingFaceH4__zephyr-7b-beta",
+    # "meta-llama__Llama-2-7b-chat-hf",
+    # "01-ai__Yi-34B-Chat",
+]
+
+HF_TOKEN = os.getenv("HF_TOKEN")
+
+score_turn = {
+    1: "multi_turn",
+    0: "single_turn",
+}
+
+def get_dataframe_lighteval() -> pd.DataFrame:
+    samples = []
+    scores = []
+    for model in MODELS:
+        details_lighteval = load_dataset(
+            f"SaylorTwift/details_{model}_private",
+            "extended_mt_bench_0",
+            split="latest",
+            token=HF_TOKEN,
+        )
+
+        for d in details_lighteval:
+            judement_prompt = d["judement_prompt"]
+            judgement = d["judgement"]
+            predictions = d["predictions"][0]
+            prompts = d["full_prompt"]
+
+            turns = []
+            for turn in range(len(predictions)):
+                if turn == 1:
+                    prompt = prompts[turn].format(model_response=predictions[turn - 1])
+                else:
+                    prompt = prompts[turn]
+
+                turns.append([])
+                turns[turn].append(prompt)
+                turns[turn].append(predictions[turn])
+                turns[turn].append(judement_prompt[turn])
+                turns[turn].append(judgement[turn])
+
+            for i, turn in enumerate(turns):
+                samples.append(
+                    {
+                        "model": model,
+                        "turn": i,
+                        "prompt": turn[0],
+                        "response": turn[1],
+                        "judgement_prompt": turn[2],
+                        "judgment": turn[3],
+                        "score": d["metrics"][score_turn[i]],
+                        "question_id": d["specifics"]["id"],
+                    }
+                )
+
+    dataframe_all_samples = pd.DataFrame(samples)
+
+    return dataframe_all_samples
+
+
+
+
+def construct_dataframe() -> pd.DataFrame:
+    """
+    Construct a dataframe from the data in the data folder
+    """
+    lighteval = get_dataframe_lighteval()
+    lighteval["model"] = lighteval["model"].apply(lambda x: x.split("__")[1])
+    lighteval = lighteval.set_index(["question_id", "turn", "model"])
+    all_samples = lighteval.reset_index()
+    all_samples = all_samples.set_index("question_id")
+
+    return all_samples.dropna()
+
+
+def create_plot(model: str, dataframe: pd.DataFrame):
+    new = dataframe[dataframe["model"] == model].dropna()
+    new = new[new["turn"] == 1]
+    new["score_lighteval"] = new["score_lighteval"].astype(int)
+    new["score_mt_bench"] = new["score_mt_bench"].astype(int)
+    new = new[['score_lighteval', 'score_mt_bench']]
+    new.index = new.index.astype(str)
+
+    fig = new.plot.bar(title="Scores", labels={"index": "Index", "value": "Score"}, barmode="group")
+
+    return fig
+
+
+def get_scores(dataframe):
+    dataframe = dataframe.dropna()
+    dataframe["score"] = dataframe["score"].astype(int)
+    new = dataframe[['score', "turn", "model"]]
+    new = new.groupby(["model", "turn"]).mean()
+    new = new.groupby(["model"]).mean()
+    return new
+
+if __name__ == "__main__":
+    df = construct_dataframe()
+    from pprint import pprint
+    pprint(df)
+    #print(df.iloc[130])
+    # model = "zephyr-7b-beta"
+    # fig = create_plot(model, df)
+    # fig.show()
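Not part of the commit: a short sketch of the aggregation that app.py renders as its score table, assuming HF_TOKEN can read the SaylorTwift/details_* datasets used above.

    from utils import construct_dataframe, get_scores

    df = construct_dataframe()          # one row per (question_id, turn, model)
    scores = get_scores(df)             # mean score per model, averaged over turns
    print(scores)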