Spaces: Runtime error

Commit bb3c194
Nathan Habib committed
1 Parent(s): 56f8b5d

Files changed:
- app.py +91 -0
- requirements.txt +1 -0
- utils.py +114 -0
app.py ADDED
@@ -0,0 +1,91 @@
+import gradio as gr
+import pandas as pd
+import os
+from utils import construct_dataframe, MODELS, get_scores
+
+HF_TOKEN = os.getenv("HF_TOKEN")
+DATAFRAME: pd.DataFrame = construct_dataframe()
+MAX_LINES = 500
+MIN_LINES = 10
+
+
+def get_from_question_id_turn_2(model, question_id: int):
+    new = DATAFRAME.loc[question_id]
+    new = new[new["turn"] == 1]
+    new = new[new["model"] == model]
+
+    prompt_lighteval = new["prompt"].values[0]
+    response_lighteval = new["response"].values[0]
+    judgement_prompt_lighteval = new["judgement_prompt"].values[0]
+    judgement_lighteval = new["judgment"].values[0]
+    score_lighteval = new["score"].values[0]
+
+    return prompt_lighteval, response_lighteval, judgement_prompt_lighteval[1]["content"], judgement_lighteval, score_lighteval
+
+
+def get_from_question_id_turn_1(model, question_id: int):
+    new = DATAFRAME.loc[question_id]
+    new = new[new["turn"] == 0]
+    new = new[new["model"] == model]
+
+    prompt_lighteval = new["prompt"].values[0]
+    response_lighteval = new["response"].values[0]
+    judgement_prompt_lighteval = new["judgement_prompt"].values[0]
+    judgement_lighteval = new["judgment"].values[0]
+    score_lighteval = new["score"].values[0]
+
+    return prompt_lighteval, response_lighteval, judgement_prompt_lighteval[1]["content"], judgement_lighteval, score_lighteval
+
+
+with gr.Blocks() as demo:
+    with gr.Row():
+        model = gr.Dropdown([model.split("__")[1] for model in MODELS], label="Model")
+        index = gr.Dropdown(set(DATAFRAME.index.values.tolist()), label="Index", value=DATAFRAME.index.values.tolist()[0])
+
+    with gr.Row():
+        gr.DataFrame(get_scores(DATAFRAME).reset_index(), interactive=False, )
+
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown("## Turn 1")
+            score_lighteval = gr.Number(label="Score", interactive=False)
+            prompt_lighteval = gr.Textbox(
+                label="Prompt", interactive=False, max_lines=MAX_LINES, lines=MIN_LINES
+            )
+            response_lighteval = gr.Textbox(label="Response", interactive=False, max_lines=MAX_LINES, lines=MIN_LINES)
+            judgement_prompt_lighteval = gr.Textbox(label="Judgement Prompt", interactive=False, max_lines=MAX_LINES, lines=MIN_LINES)
+            judgement_lighteval = gr.Textbox(label="Judgement", interactive=False, max_lines=MAX_LINES, lines=MIN_LINES)
+        with gr.Column():
+            gr.Markdown("## Turn 2")
+            score_lighteval_2 = gr.Number(label="Score", interactive=False)
+            prompt_lighteval_2 = gr.Textbox(
+                label="Prompt", interactive=False, max_lines=MAX_LINES, lines=MIN_LINES
+            )
+            response_lighteval_2 = gr.Textbox(label="Response", interactive=False, max_lines=MAX_LINES, lines=MIN_LINES)
+            judgement_prompt_lighteval_2 = gr.Textbox(label="Judgement Prompt", interactive=False, max_lines=MAX_LINES, lines=MIN_LINES)
+            judgement_lighteval_2 = gr.Textbox(label="Judgement", interactive=False, max_lines=MAX_LINES, lines=MIN_LINES)
+
+
+    index.change(
+        fn=get_from_question_id_turn_1,
+        inputs=[model, index],
+        outputs=[prompt_lighteval, response_lighteval, judgement_prompt_lighteval, judgement_lighteval, score_lighteval],
+    )
+
+    index.change(
+        fn=get_from_question_id_turn_2,
+        inputs=[model, index],
+        outputs=[prompt_lighteval_2, response_lighteval_2, judgement_prompt_lighteval_2, judgement_lighteval_2, score_lighteval_2],
+    )
+    model.change(
+        fn=get_from_question_id_turn_2,
+        inputs=[model, index],
+        outputs=[prompt_lighteval_2, response_lighteval_2, judgement_prompt_lighteval_2, judgement_lighteval_2, score_lighteval_2],
+    )
+    model.change(
+        fn=get_from_question_id_turn_1,
+        inputs=[model, index],
+        outputs=[prompt_lighteval, response_lighteval, judgement_prompt_lighteval, judgement_lighteval, score_lighteval,],
+    )
+
+demo.launch()
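Not part of the commit: a minimal sketch of the lookup the two dropdown callbacks perform, assuming HF_TOKEN grants access to the private details datasets; the question id and model name below are placeholders taken from the defaults in MODELS.

    from utils import construct_dataframe

    df = construct_dataframe()
    qid = df.index[0]                     # a real question_id, as chosen in the Index dropdown
    rows = df.loc[[qid]]                  # every (turn, model) row for that question
    turn1 = rows[(rows["turn"] == 0) & (rows["model"] == "Mistral-7B-Instruct-v0.2")]
    print(turn1["prompt"].values[0])      # prompt shown in the "Turn 1" column
    print(turn1["score"].values[0])       # judge score shown in the Score box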
requirements.txt ADDED
@@ -0,0 +1 @@
+plotly
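Only plotly is pinned here, presumably because utils.py sets pd.options.plotting.backend = "plotly"; the Spaces image is assumed to already bundle gradio, pandas, and datasets. A fuller, hypothetical requirements.txt for running the app outside Spaces might look like:

    gradio
    pandas
    datasets
    plotly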
utils.py ADDED
@@ -0,0 +1,114 @@
+import pandas as pd
+from datasets import load_dataset
+import os
+import json
+from pprint import pprint
+pd.options.plotting.backend = "plotly"
+
+MODELS = [
+    "mistralai__Mistral-7B-Instruct-v0.2",
+    # "HuggingFaceH4__zephyr-7b-beta",
+    # "meta-llama__Llama-2-7b-chat-hf",
+    # "01-ai__Yi-34B-Chat",
+]
+
+HF_TOKEN = os.getenv("HF_TOKEN")
+
+score_turn = {
+    1: "multi_turn",
+    0: "single_turn",
+}
+
+def get_dataframe_lighteval() -> pd.DataFrame:
+    samples = []
+    scores = []
+    for model in MODELS:
+        details_lighteval = load_dataset(
+            f"SaylorTwift/details_{model}_private",
+            "extended_mt_bench_0",
+            split="latest",
+            token=HF_TOKEN,
+        )
+
+        for d in details_lighteval:
+            judement_prompt = d["judement_prompt"]
+            judgement = d["judgement"]
+            predictions = d["predictions"][0]
+            prompts = d["full_prompt"]
+
+            turns = []
+            for turn in range(len(predictions)):
+                if turn == 1:
+                    prompt = prompts[turn].format(model_response=predictions[turn - 1])
+                else:
+                    prompt = prompts[turn]
+
+                turns.append([])
+                turns[turn].append(prompt)
+                turns[turn].append(predictions[turn])
+                turns[turn].append(judement_prompt[turn])
+                turns[turn].append(judgement[turn])
+
+            for i, turn in enumerate(turns):
+                samples.append(
+                    {
+                        "model": model,
+                        "turn": i,
+                        "prompt": turn[0],
+                        "response": turn[1],
+                        "judgement_prompt": turn[2],
+                        "judgment": turn[3],
+                        "score": d["metrics"][score_turn[i]],
+                        "question_id": d["specifics"]["id"],
+                    }
+                )
+
+    dataframe_all_samples = pd.DataFrame(samples)
+
+    return dataframe_all_samples
+
+
+
+
+def construct_dataframe() -> pd.DataFrame:
+    """
+    Construct a dataframe from the data in the data folder
+    """
+    lighteval = get_dataframe_lighteval()
+    lighteval["model"] = lighteval["model"].apply(lambda x: x.split("__")[1])
+    lighteval = lighteval.set_index(["question_id", "turn", "model"])
+    all_samples = lighteval.reset_index()
+    all_samples = all_samples.set_index("question_id")
+
+    return all_samples.dropna()
+
+
+def create_plot(model: str, dataframe: pd.DataFrame):
+    new = dataframe[dataframe["model"] == model].dropna()
+    new = new[new["turn"] == 1]
+    new["score_lighteval"] = new["score_lighteval"].astype(int)
+    new["score_mt_bench"] = new["score_mt_bench"].astype(int)
+    new = new[['score_lighteval', 'score_mt_bench']]
+    new.index = new.index.astype(str)
+
+    fig = new.plot.bar(title="Scores", labels={"index": "Index", "value": "Score"}, barmode="group")
+
+    return fig
+
+
+def get_scores(dataframe):
+    dataframe = dataframe.dropna()
+    dataframe["score"] = dataframe["score"].astype(int)
+    new = dataframe[['score', "turn", "model"]]
+    new = new.groupby(["model", "turn"]).mean()
+    new = new.groupby(["model"]).mean()
+    return new
+
+if __name__ == "__main__":
+    df = construct_dataframe()
+    from pprint import pprint
+    pprint(df)
+    #print(df.iloc[130])
+    # model = "zephyr-7b-beta"
+    # fig = create_plot(model, df)
+    # fig.show()
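Not part of the commit: a short sketch of the aggregation that app.py renders as its score table, assuming HF_TOKEN can read the SaylorTwift/details_* datasets used above.

    from utils import construct_dataframe, get_scores

    df = construct_dataframe()          # one row per (question_id, turn, model)
    scores = get_scores(df)             # mean score per model, averaged over turns
    print(scores)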