gaia-eval-l1-20

Running

gaia-eval-l1-20 / app.py

kengboon

fix

a03972d 6 months ago

4.59 kB

	import os, traceback

	import gradio as gr

	from agent import MyAgent
	from api import get_questions, submit
	from secret import check_secret, USERNAME
	from verifier import Verifier

	def run_app(submission_password: str, model_name: str):
	    """Run the agent over the fetched questions and stream the chat to the UI.

	    This is a generator: after every change it yields the growing list of
	    ``gr.ChatMessage`` objects so the caller can refresh the chatbot.

	    Args:
	        submission_password: Checked with ``check_secret``; the answers are
	            submitted only when it is non-empty and accepted.
	        model_name: Model identifier passed to ``MyAgent``.

	    Yields:
	        The chat history list (the same list object, mutated in place).
	        Any exception raised while running is printed with its traceback
	        and reported as a final assistant message instead of propagating.
	    """
	    chat_history = []

	    try:
	        # Get the list of questions from the API.
	        # A string result is treated as an error message, not as data.
	        questions_data = get_questions()
	        if isinstance(questions_data, str):
	            err_msg = f"An error occurred: {questions_data}"
	            chat_history.append(gr.ChatMessage(err_msg, role="assistant"))
	            yield chat_history
	            return

	        # Initialize the agent
	        agent = MyAgent(model_name=model_name)
	        verifier = Verifier()

	        # Loop through the questions and simulate the chat
	        answers_payload = []
	        for i, item in enumerate(questions_data):
	            task_id = item.get("task_id")
	            question_text = item.get("question")

	            chat_history.append(gr.ChatMessage(f"Question {i+1}: {question_text}", role="user"))
	            yield chat_history

	            print(f"Asking agent question {i+1}...")
	            response = agent.ask(question_text, final_answer_only=False)
	            print(f"Agent responses: {response}")

	            answer = agent.format_output(response)
	            print("Final answer:", answer)

	            # verify() may return None; otherwise it is unpacked as
	            # (is_correct, correct_answer) and only logged to stdout.
	            verifier_response = verifier.verify(task_id, answer)
	            if verifier_response is not None:
	                is_correct, correct_answer = verifier_response
	                if is_correct:
	                    print(f"Correct!\n{answer}")
	                else:
	                    print(f"Incorrect! - Correct answer: {correct_answer}")

	            chat_history.append(gr.ChatMessage(answer, role="assistant"))
	            yield chat_history

	            # Append to the answers payload
	            answers_payload.append({
	                "task_id": task_id,
	                "submitted_answer": answer
	            })

	        # Submit only when there is something to send and a password was given.
	        if answers_payload and submission_password:
	            if check_secret(submission_password):
	                username = USERNAME
	                agent_code = f"https://huggingface.co/spaces/{os.getenv('SPACE_ID')}/tree/main"
	                # Separate name for the request body so the list of answers
	                # is not rebound to a dict of a different shape.
	                submission = {
	                    "username": username,
	                    "answers": answers_payload,
	                    "agent_code": agent_code
	                }
	                final_status = submit(submission)
	                print(final_status.strip())
	            else:
	                print("Wrong submission password")

	        chat_history.append(gr.ChatMessage(verifier.get_output(), role="user"))
	        yield chat_history
	    except Exception as e:
	        # print_exc() prints the exception being handled; unlike the
	        # single-argument print_exception(e) it also works before Python 3.10.
	        traceback.print_exc()
	        err_msg = f"An error occurred: {e}"
	        chat_history.append(gr.ChatMessage(err_msg, role="assistant"))
	        yield chat_history

	def get_ui():
	    """Assemble the Gradio page for the evaluation runner.

	    Creates, in order: a title, an introductory Markdown note, the chatbot,
	    the run button, a hidden model dropdown and the submission password
	    box, then wires the button to ``run_app``.

	    Returns:
	        The ``gr.Blocks`` container holding the whole page.
	    """
	    intro_text = """
	        Evaluate an AI agent on a subset of validation questions from the [General AI Assistants (GAIA) Benchmark](https://arxiv.org/abs/2311.12983).

	        Note: This space run on minimal setup and takes time to answer the questions, the agent will report only the final answer.

	        [API Information](https://huggingface.co/learn/agents-course/unit4/hands-on)
	        """
	    model_choices = [
	        "gemini-2.5-flash-preview-04-17",
	        "gemini-2.0-flash",
	        "gemini-1.5-flash",
	        "gemma-3-12b-it",
	        "gemma-3-27b-it",
	    ]
	    # (user avatar, assistant avatar): only the assistant gets an image.
	    avatars = (None, "images/chatbot.png")

	    with gr.Blocks() as page:
	        gr.Markdown("# AI Agent Evaluation Runner 🕵🏻‍♂️")
	        gr.Markdown(intro_text)

	        chat_panel = gr.Chatbot(
	            label="AI Assistant",
	            type="messages",
	            avatar_images=avatars,
	            resizable=True,
	            scale=1,
	        )
	        start_btn = gr.Button("Run Evaluation")

	        # The dropdown is created hidden, so run_app receives its preset value.
	        model_picker = gr.Dropdown(
	            label="Model",
	            choices=model_choices,
	            value="gemini-2.0-flash",
	            visible=False,
	        )
	        password_box = gr.Textbox(
	            "",
	            type="password",
	            placeholder="Submission Password",
	            label="Submission Password",
	        )

	        # Input order matches run_app(submission_password, model_name).
	        start_btn.click(
	            fn=run_app,
	            inputs=[password_box, model_picker],
	            outputs=[chat_panel],
	        )

	    return page

	if __name__ == "__main__":
	    # Script entry point: import the project's `main` module only when this
	    # file is executed directly, then call its main() function.
	    import main as entry_module
	    entry_module.main()