"""Gradio UI that runs the agent over the evaluation questions."""

import os
import traceback

import gradio as gr

from agent import MyAgent
from api import get_questions, submit
from secret import check_secret, USERNAME
from verifier import Verifier


def _error_message(detail) -> gr.ChatMessage:
    """Build the assistant-side chat bubble used to report a failure."""
    return gr.ChatMessage(
        content=f"An error occurred: *{detail}*",
        role="assistant",
    )


def _submit_answers(answers: list, submission_password: str) -> None:
    """Submit the collected answers when a valid password was supplied.

    Nothing is submitted when there are no answers or no password. The
    outcome is only printed to the console; it is not added to the chat.
    """
    if not (answers and submission_password):
        return
    if not check_secret(submission_password):
        print("Wrong submission password")
        return

    agent_code = f"https://huggingface.co/spaces/{os.getenv('SPACE_ID')}/tree/main"
    # Kept separate from ``answers`` so the list is not rebound to a dict.
    submission = {
        "username": USERNAME,
        "answers": answers,
        "agent_code": agent_code,
    }
    final_status = submit(submission)
    print(final_status.strip())


def run_app(submission_password: str, model_name: str):
    """Ask the agent every question and stream the conversation to the UI.

    This is a generator. It yields the chat history after each question is
    posted, after each answer is produced, and once more at the end with the
    verifier's output, so the chatbot component can refresh incrementally.

    Args:
        submission_password: Password gating submission of the answers; when
            empty, or rejected by ``check_secret``, nothing is submitted.
        model_name: Model identifier passed to ``MyAgent``.

    Yields:
        The ``chat_history`` list of ``gr.ChatMessage`` objects. The same
        list object is yielded each time and grows between yields.
    """
    chat_history = []
    try:
        # Get the list of questions from the API
        questions_data = get_questions()
        # A string result is treated as an error message rather than data.
        if isinstance(questions_data, str):
            chat_history.append(_error_message(questions_data))
            yield chat_history
            return

        # Initialize the agent
        agent = MyAgent(model_name=model_name)
        verifier = Verifier()

        # Loop through the questions and simulate the chat
        answers_payload = []
        for i, item in enumerate(questions_data):
            task_id = item.get("task_id")
            question_text = item.get("question")

            chat_history.append(
                gr.ChatMessage(
                    content=f"**Question {i+1}:** {question_text}",
                    role="user",
                )
            )
            yield chat_history

            print(f"Asking agent question {i+1}...")
            response = agent.ask(question_text, final_answer_only=False)
            print(f"Agent responses: {response}")
            answer = agent.format_output(response)
            print("Final answer:", answer)

            # The verifier may return None; only a result is unpacked.
            verifier_response = verifier.verify(task_id, answer)
            if verifier_response is not None:
                is_correct, correct_answer = verifier_response
                if is_correct:
                    print(f"**Correct!**\n{answer}")
                else:
                    print(f"**Incorrect!** - Correct answer: {correct_answer}")

            chat_history.append(gr.ChatMessage(content=answer, role="assistant"))
            yield chat_history

            # Append to the answers payload
            answers_payload.append({
                "task_id": task_id,
                "submitted_answer": answer,
            })

        # Check required to submit answers
        _submit_answers(answers_payload, submission_password)

        chat_history.append(
            gr.ChatMessage(content=verifier.get_output(), role="user")
        )
        yield chat_history
    except Exception as e:
        # Top-level boundary of the UI callback: log the full traceback and
        # show the error in the chat instead of letting the generator die.
        # print_exc() is used because print_exception(e) needs Python 3.10+.
        traceback.print_exc()
        chat_history.append(_error_message(e))
        yield chat_history
        return


def get_ui() -> gr.Blocks:
    """Build the Gradio Blocks layout and wire the run button to ``run_app``.

    Returns:
        The assembled ``gr.Blocks`` object; launching it is left to the caller.
    """
    with gr.Blocks() as ui_block:
        gr.Markdown("# AI Agent Evaluation Runner 🕵🏻‍♂️")
        gr.Markdown(
            """
            Evaluate an AI agent on a subset of validation questions from the [**General AI Assistants (GAIA) Benchmark**](https://arxiv.org/abs/2311.12983).

            **Note**: This space run on minimal setup and takes time to answer the questions, the agent will report only the final answer.

            [API Information](https://huggingface.co/learn/agents-course/unit4/hands-on)
            """
        )

        chatbot = gr.Chatbot(
            label="AI Assistant",
            type="messages",
            avatar_images=(
                None,
                "images/chatbot.png",
            ),
            resizable=True,
            scale=1
        )

        run_button = gr.Button("Run Evaluation")

        # Hidden from the page; its default value is still passed to run_app.
        model_selection = gr.Dropdown(
            label="Model",
            choices=[
                "gemini-2.5-flash-preview-04-17",
                "gemini-2.0-flash",
                "gemini-1.5-flash",
                "gemma-3-12b-it",
                "gemma-3-27b-it"
            ],
            value="gemini-2.0-flash",
            visible=False
        )

        subm_pw = gr.Textbox(
            "",
            type="password",
            placeholder="Submission Password",
            label="Submission Password"
        )

        run_button.click(
            fn=run_app,
            inputs=[subm_pw, model_selection],
            outputs=[chatbot]
        )

    return ui_block


if __name__ == "__main__":
    from main import main as main_fn
    main_fn()