Spaces:
Running
Running
| import os, traceback | |
| import gradio as gr | |
| from agent import MyAgent | |
| from api import get_questions, submit | |
| from secret import check_secret, USERNAME | |
| from verifier import Verifier | |
def run_app(submission_password: str, model_name: str):
    """Run the agent over every fetched question, streaming the chat history.

    This is a generator: after each message is appended it yields the whole
    (growing) list of ``gr.ChatMessage`` objects, so a Gradio chatbot bound to
    it can refresh incrementally.

    Args:
        submission_password: Password checked with ``check_secret`` before the
            answers are submitted. When empty, nothing is submitted.
        model_name: Forwarded to ``MyAgent`` as ``model_name``.

    Yields:
        The chat history list after each newly appended message.
    """
    chat_history = []
    try:
        # Get the list of questions from the API.
        questions_data = get_questions()
        # A string result is treated as an error message, not a question list.
        if isinstance(questions_data, str):
            err_msg = f"An error occurred: *{questions_data}*"
            chat_history.append(gr.ChatMessage(err_msg, role="assistant"))
            yield chat_history
            return

        # Initialize the agent and the answer verifier.
        agent = MyAgent(model_name=model_name)
        verifier = Verifier()

        # Loop through the questions and simulate the chat.
        answers_payload = []
        for i, item in enumerate(questions_data):
            task_id = item.get("task_id")
            question_text = item.get("question")

            chat_history.append(
                gr.ChatMessage(f"**Question {i+1}:** {question_text}", role="user")
            )
            yield chat_history

            print(f"Asking agent question {i+1}...")
            response = agent.ask(question_text, final_answer_only=False)
            print(f"Agent responses: {response}")
            answer = agent.format_output(response)
            print("Final answer:", answer)

            # The verification result is only logged to stdout; a None result
            # is skipped silently.
            verifier_response = verifier.verify(task_id, answer)
            if verifier_response is not None:
                is_correct, correct_answer = verifier_response
                if is_correct:
                    print(f"**Correct!**\n{answer}")
                else:
                    print(f"**Incorrect!** - Correct answer: {correct_answer}")

            chat_history.append(gr.ChatMessage(answer, role="assistant"))
            yield chat_history

            # Append to the answers payload.
            answers_payload.append({
                "task_id": task_id,
                "submitted_answer": answer,
            })

        # Submit only when there is something to send and a password was given.
        if answers_payload and submission_password:
            if check_secret(submission_password):
                # NOTE: if SPACE_ID is not set in the environment, os.getenv
                # returns None and the URL will contain the text "None".
                agent_code = f"https://huggingface.co/spaces/{os.getenv('SPACE_ID')}/tree/main"
                # Kept in its own name so ``answers_payload`` stays a list.
                submission = {
                    "username": USERNAME,
                    "answers": answers_payload,
                    "agent_code": agent_code,
                }
                final_status = submit(submission)
                print(final_status.strip())
            else:
                print("Wrong submission password")

        # Finish with the verifier's own summary output.
        chat_history.append(gr.ChatMessage(verifier.get_output(), role="user"))
        yield chat_history
    except Exception as e:
        # Top-level boundary for the UI callback: log the traceback and show
        # the error in the chat instead of letting the generator crash.
        traceback.print_exception(e)
        err_msg = f"An error occurred: *{e}*"
        chat_history.append(gr.ChatMessage(err_msg, role="assistant"))
        yield chat_history
def get_ui():
    """Build the Gradio Blocks interface for the evaluation runner.

    The layout contains a heading and description, a chatbot panel, a
    "Run Evaluation" button, a hidden model dropdown and a password textbox.
    Clicking the button calls ``run_app`` with the password and the selected
    model, and sends its output to the chatbot.

    Returns:
        The constructed ``gr.Blocks`` object.
    """
    model_choices = [
        "gemini-2.5-flash-preview-04-17",
        "gemini-2.0-flash",
        "gemini-1.5-flash",
        "gemma-3-12b-it",
        "gemma-3-27b-it",
    ]

    with gr.Blocks() as ui_block:
        gr.Markdown("# AI Agent Evaluation Runner 🕵🏻‍♂️")
        # NOTE(review): the literal below is indented with the code; this
        # presumes gr.Markdown strips the common indentation - confirm.
        gr.Markdown(
            """
            Evaluate an AI agent on a subset of validation questions from the [**General AI Assistants (GAIA) Benchmark**](https://arxiv.org/abs/2311.12983).
            **Note**: This space run on minimal setup and takes time to answer the questions, the agent will report only the final answer.
            [API Information](https://huggingface.co/learn/agents-course/unit4/hands-on)
            """
        )

        chatbot = gr.Chatbot(
            label="AI Assistant",
            type="messages",
            # Tuple of two avatar entries; only the second has an image.
            avatar_images=(
                None,
                "images/chatbot.png",
            ),
            resizable=True,
            scale=1,
        )

        run_button = gr.Button("Run Evaluation")

        # Created with visible=False, so it always supplies the preset value
        # to run_app.
        model_selection = gr.Dropdown(
            label="Model",
            choices=model_choices,
            value="gemini-2.0-flash",
            visible=False,
        )

        subm_pw = gr.Textbox(
            "",
            type="password",
            placeholder="Submission Password",
            label="Submission Password",
        )

        # Input order matches run_app(submission_password, model_name).
        run_button.click(
            fn=run_app,
            inputs=[subm_pw, model_selection],
            outputs=[chatbot],
        )

    return ui_block
if __name__ == "__main__":
    # Imported only when this file is run as a script: the actual start-up
    # is delegated to the ``main`` function of the ``main`` module.
    from main import main as entry_point

    entry_point()