# gaia-eval-l1-20 / app.py
# kengboon
# fix
# a03972d
# raw
# history blame
# 4.59 kB
import os, traceback
import gradio as gr
from agent import MyAgent
from api import get_questions, submit
from secret import check_secret, USERNAME
from verifier import Verifier
def run_app(submission_password: str, model_name: str):
    """Run the agent over every fetched question, streaming the chat as it goes.

    This is a generator: after each new message it yields ``chat_history``
    (the same list object, mutated in place) so the UI can refresh.

    Flow:
      1. Fetch the questions. A ``str`` result from ``get_questions()`` is
         treated as an error message, shown in the chat, and the run stops.
      2. For each question: ask the agent, format its answer, check it with
         the verifier (result is only printed), and append question/answer
         to the chat.
      3. If there are answers, a password was supplied and ``check_secret``
         accepts it, submit the answers; the returned status is only printed.
      4. Append ``verifier.get_output()`` as the final chat message.

    Args:
        submission_password: Password gating submission; an empty value
            skips submission entirely.
        model_name: Passed through to ``MyAgent(model_name=...)``.

    Yields:
        The accumulated list of ``gr.ChatMessage`` objects.

    Any exception is printed with its traceback, reported in the chat as an
    assistant message, and ends the run; nothing is re-raised.
    """
    chat_history = []
    try:
        # Get the list of questions from the API
        questions_data = get_questions()
        if isinstance(questions_data, str):
            # A string here is an error description, not question data.
            err_msg = f"An error occurred: *{questions_data}*"
            chat_history.append(gr.ChatMessage(err_msg, role="assistant"))
            yield chat_history
            return

        # Initialize the agent
        agent = MyAgent(model_name=model_name)
        verifier = Verifier()

        # Loop through the questions and simulate the chat
        answers = []
        for i, item in enumerate(questions_data):
            task_id = item.get("task_id")
            question_text = item.get("question")
            chat_history.append(
                gr.ChatMessage(f"**Question {i+1}:** {question_text}", role="user")
            )
            yield chat_history

            print(f"Asking agent question {i+1}...")
            response = agent.ask(question_text, final_answer_only=False)
            print(f"Agent responses: {response}")
            answer = agent.format_output(response)
            print("Final answer:", answer)

            # verify() yields either None (nothing to report) or an
            # (is_correct, correct_answer) pair; the outcome is log-only.
            verifier_response = verifier.verify(task_id, answer)
            if verifier_response is not None:
                is_correct, correct_answer = verifier_response
                if is_correct:
                    print(f"**Correct!**\n{answer}")
                else:
                    print(f"**Incorrect!** - Correct answer: {correct_answer}")

            chat_history.append(gr.ChatMessage(answer, role="assistant"))
            yield chat_history

            # Append to the answers payload
            answers.append({
                "task_id": task_id,
                "submitted_answer": answer,
            })

        # Check required to submit answers
        final_status = ""
        if answers and submission_password:
            if check_secret(submission_password):
                agent_code = f"https://huggingface.co/spaces/{os.getenv('SPACE_ID')}/tree/main"
                # Separate name for the request body so the list of answers
                # is not rebound to a dict halfway through the function.
                submission = {
                    "username": USERNAME,
                    "answers": answers,
                    "agent_code": agent_code,
                }
                final_status = submit(submission)
                # presumably submit() returns a status string; str() keeps a
                # non-string result from raising after a successful submit.
                print(str(final_status).strip())
            else:
                print("Wrong submission password")

        chat_history.append(gr.ChatMessage(verifier.get_output(), role="user"))
        yield chat_history
    except Exception as e:
        # print_exc() reports the exception being handled on every Python 3
        # version; the single-argument print_exception(e) needs 3.10+.
        traceback.print_exc()
        err_msg = f"An error occurred: *{e}*"
        chat_history.append(gr.ChatMessage(err_msg, role="assistant"))
        yield chat_history
def get_ui():
    """Build and return the Gradio ``Blocks`` UI for the evaluation runner.

    The layout, in order: a title and description, the chatbot panel, a
    "Run Evaluation" button, a hidden model dropdown (``visible=False``,
    default ``"gemini-2.0-flash"``) and a password textbox. Clicking the
    button calls ``run_app`` with the password and the selected model and
    streams its output into the chatbot.

    Returns:
        The constructed ``gr.Blocks`` instance (not launched here).
    """
    with gr.Blocks() as ui_block:
        # Detective emoji (U+1F575, light skin tone, ZWJ, male sign, VS16),
        # written as escapes so it survives encoding mishaps; the previous
        # literal was the same bytes mis-decoded into mojibake.
        gr.Markdown(
            "# AI Agent Evaluation Runner \U0001F575\U0001F3FB\u200D\u2642\uFE0F"
        )
        gr.Markdown(
            """
            Evaluate an AI agent on a subset of validation questions from the [**General AI Assistants (GAIA) Benchmark**](https://arxiv.org/abs/2311.12983).
            **Note**: This space run on minimal setup and takes time to answer the questions, the agent will report only the final answer.
            [API Information](https://huggingface.co/learn/agents-course/unit4/hands-on)
            """
        )
        chatbot = gr.Chatbot(
            label="AI Assistant",
            type="messages",
            avatar_images=(
                None,  # no avatar for the user side
                "images/chatbot.png",
            ),
            resizable=True,
            scale=1,
        )
        run_button = gr.Button("Run Evaluation")
        # Hidden from the page, but still supplies model_name to run_app.
        model_selection = gr.Dropdown(
            label="Model",
            choices=[
                "gemini-2.5-flash-preview-04-17",
                "gemini-2.0-flash",
                "gemini-1.5-flash",
                "gemma-3-12b-it",
                "gemma-3-27b-it",
            ],
            value="gemini-2.0-flash",
            visible=False,
        )
        subm_pw = gr.Textbox(
            "",
            type="password",
            placeholder="Submission Password",
            label="Submission Password",
        )
        # Input order must match run_app(submission_password, model_name).
        run_button.click(
            fn=run_app,
            inputs=[subm_pw, model_selection],
            outputs=[chatbot],
        )
    return ui_block
if __name__ == "__main__":
    # Script entry point: hand off to the project's `main` module. The import
    # lives inside the guard, so importing this file does not trigger it.
    from main import main as launch_app
    launch_app()