Working agent with saving after task
- app.py +18 -17
- e2bqwen.py +127 -49
app.py
CHANGED
@@ -387,8 +387,7 @@ def get_or_create_sandbox(session_hash):
    print(f"Creating new sandbox for session {session_hash}")
    desktop = Sandbox(api_key=E2B_API_KEY, resolution=(WIDTH, HEIGHT), dpi=96, timeout=SANDBOX_TIMEOUT)
    desktop.stream.start(require_auth=True)
-   setup_cmd = """sudo mkdir -p /usr/lib/firefox-esr/distribution && echo '{"policies":{"OverrideFirstRunPage":"","OverridePostUpdatePage":"","DisableProfileImport":true,"DontCheckDefaultBrowser":true}}'
-   sudo tee /usr/lib/firefox-esr/distribution/policies.json > /dev/null"""
+   setup_cmd = """sudo mkdir -p /usr/lib/firefox-esr/distribution && echo '{"policies":{"OverrideFirstRunPage":"","OverridePostUpdatePage":"","DisableProfileImport":true,"DontCheckDefaultBrowser":true}}' | sudo tee /usr/lib/firefox-esr/distribution/policies.json > /dev/null"""
    desktop.commands.run(setup_cmd)

    # Store sandbox with metadata
@@ -486,6 +485,7 @@ class EnrichedGradioUI(GradioUI):
                text_input,
                gr.Button(interactive=False),
            )
+
    def interact_with_agent(self, task_input, stored_messages, session_state, session_hash, request: gr.Request):
        import gradio as gr

@@ -517,24 +517,24 @@ class EnrichedGradioUI(GradioUI):
        We can only execute one action at a time. On each step, answer only a python blob with the action to perform
        """)

-       try:
-
-
-
-           for msg in stream_to_gradio(session_state["agent"], task=full_task, reset_agent_memory=False):
-               stored_messages.append(msg)
-               yield stored_messages
-
-       yield stored_messages
-       save_final_status(data_dir, "completed", details = str(session_state["agent"].memory.get_succinct_steps()))
-       except Exception as e:
-           error_message=f"Error in interaction: {str(e)}"
-           stored_messages.append(gr.ChatMessage(role="assistant", content=error_message))
-           yield stored_messages
-           save_final_status(data_dir, "failed", details = str(error_message))
-
-
+       # try:
+       stored_messages.append(gr.ChatMessage(role="user", content=task_input))
+       yield stored_messages
+
+       for msg in stream_to_gradio(session_state["agent"], task=full_task, reset_agent_memory=False):
+           stored_messages.append(msg)
+           yield stored_messages
+
+       yield stored_messages
+       # save_final_status(data_dir, "completed", details = str(session_state["agent"].memory.get_succinct_steps()))
+       # except Exception as e:
+       #     error_message=f"Error in interaction: {str(e)}"
+       #     stored_messages.append(gr.ChatMessage(role="assistant", content=error_message))
+       #     yield stored_messages
+       #     save_final_status(data_dir, "failed", details = str(error_message))
+
+       # finally:
+       #     upload_to_hf_and_remove(data_dir)

theme = gr.themes.Default(font=["Oxanium", "sans-serif"], primary_hue="amber", secondary_hue="blue")

@@ -568,6 +568,7 @@ with gr.Blocks(theme=theme, css=custom_css, js=custom_js, fill_width=True) as de
                "Write 'Hello World' in a text editor",
                "Search a flight Paris - Berlin for tomorrow",
                "Could you head to Fontainebleau (France) in Google Maps then drag and drop to position the castle of Fontainebleau exactly in the center?",
+               "Download me a picture of a puppy from Google, then head to Hugging Face, find a Space dedicated to background removal, and use it to remove the puppy picture's background"
            ],
            inputs = task_input,
            label= "Example Tasks",
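Note on the third hunk above: with the try/except commented out, a failure inside stream_to_gradio now propagates straight to Gradio, and neither save_final_status nor upload_to_hf_and_remove runs. For reference, a minimal sketch of how that flow could be restored, assuming the helpers stream_to_gradio, save_final_status, and upload_to_hf_and_remove defined elsewhere in app.py keep the signatures used in this diff (illustrative only, not part of the commit):

import gradio as gr

def run_task_with_cleanup(agent, full_task, task_input, stored_messages, data_dir):
    # Hypothetical generator mirroring interact_with_agent, with the commented-out
    # error handling and Hub upload re-enabled; the helper functions come from app.py.
    stored_messages.append(gr.ChatMessage(role="user", content=task_input))
    yield stored_messages
    try:
        for msg in stream_to_gradio(agent, task=full_task, reset_agent_memory=False):
            stored_messages.append(msg)
            yield stored_messages
        save_final_status(data_dir, "completed", details=str(agent.memory.get_succinct_steps()))
    except Exception as e:
        error_message = f"Error in interaction: {e}"
        stored_messages.append(gr.ChatMessage(role="assistant", content=error_message))
        yield stored_messages
        save_final_status(data_dir, "failed", details=error_message)
    finally:
        # Always persist and clean up the session directory, even on failure.
        upload_to_hf_and_remove(data_dir)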
e2bqwen.py
CHANGED
@@ -135,9 +135,11 @@ class E2BVisionAgent(CodeAgent):
        # Add default tools
        self._setup_desktop_tools()
        self.logger.log("Setting up agent tools...")
-       self.step_callbacks.append(self.
+       self.step_callbacks.append(self.take_screenshot_callback)
        self.logger.log("Studying an action plan... that will take a bit.")

+       self.final_answer_checks = [self.store_metadata_to_file]
+
    def _setup_desktop_tools(self):
        """Register all desktop tools"""
        @tool
@@ -296,55 +298,50 @@ class E2BVisionAgent(CodeAgent):
        self.tools["drag_and_drop"] = drag_and_drop


-   def store_metadata_to_file(self,
+   def store_metadata_to_file(self, final_answer, memory) -> None:
        metadata_path = os.path.join(self.data_dir, "metadata.json")
        output = {}
-
+       # THIS ERASES IMAGES FROM MEMORY, USE WITH CAUTION
+       for memory_step in self.memory.steps:
+           if getattr(memory_step, "observations_images", None):
+               memory_step.observations_images = None
        a = open(metadata_path,"w")
-       a.write(json.dumps(
+       a.write(json.dumps(self.write_memory_to_messages()))
        a.close()
+       return True


-   def 
+   def take_screenshot_callback(self, memory_step: ActionStep, agent=None) -> None:
        """Callback that takes a screenshot + memory snapshot after a step completes"""
        self.logger.log(self.log_path, "Analyzing screen content...")

        current_step = memory_step.step_number
        print(f"Taking screenshot for step {current_step}")
-       # Check if desktop is still running
-       if not self.desktop.is_running():
-           print("Desktop is no longer running. Terminating agent.")
-           self.close()
-           # Add a final observation indicating why the agent was terminated
-           memory_step.observations = "Desktop session ended. Agent terminated."
-           # Store final metadata before exiting
-           self.store_metadata_to_file(agent)
-           return # Exit the callback without attempting to take a screenshot
-
-       try:
-           time.sleep(2.0) # Let things happen on the desktop
-           screenshot_bytes = self.desktop.screenshot()
-           image = Image.open(BytesIO(screenshot_bytes))
-
-
-
-           print(f"Saved screenshot to {screenshot_path}")
-
-
-
-
-
-
-
-
-
-
-
+       time.sleep(2.0) # Let things happen on the desktop
+       screenshot_bytes = self.desktop.screenshot()
+       image = Image.open(BytesIO(screenshot_bytes))

+       # Create a filename with step number
+       screenshot_path = os.path.join(self.data_dir, f"step_{current_step:03d}.png")
+       image.save(screenshot_path)
+       print(f"Saved screenshot to {screenshot_path}")

+       for (
+           previous_memory_step
+       ) in agent.memory.steps:  # Remove previous screenshots from logs for lean processing
+           if (
+               isinstance(previous_memory_step, ActionStep)
+               and previous_memory_step.step_number <= current_step - 2
+           ):
+               previous_memory_step.observations_images = None

+       # Add to the current memory step
+       memory_step.observations_images = [image.copy()]  # This takes the original image directly.

+       # memory_step.observations_images = [screenshot_path]  # IF YOU USE THIS INSTEAD OF ABOVE, LAUNCHING A SECOND TASK BREAKS
+
+
    def close(self):
        """Clean up resources"""
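Taken together, the two hunks above change how the agent records its work: a step callback now attaches a fresh screenshot of the E2B desktop to each step's memory (dropping images older than two steps), and a final-answer check serializes the message history to metadata.json before the answer is accepted. A condensed sketch of that wiring, using the names from this diff and assuming a smolagents CodeAgent subclass shaped roughly like E2BVisionAgent (the desktop object is assumed to expose screenshot() like the E2B Sandbox):

import json
import os
import time
from io import BytesIO

from PIL import Image
from smolagents import ActionStep, CodeAgent  # assumed exports, as used in e2bqwen.py

class DesktopAgentSketch(CodeAgent):
    def __init__(self, model, desktop, data_dir, **kwargs):
        super().__init__(tools=[], model=model, **kwargs)
        self.desktop = desktop
        self.data_dir = data_dir
        self.step_callbacks.append(self.take_screenshot_callback)   # runs after every step
        self.final_answer_checks = [self.store_metadata_to_file]    # runs before the final answer is accepted

    def take_screenshot_callback(self, memory_step: ActionStep, agent=None) -> None:
        time.sleep(2.0)  # let the desktop settle before capturing
        image = Image.open(BytesIO(self.desktop.screenshot()))
        image.save(os.path.join(self.data_dir, f"step_{memory_step.step_number:03d}.png"))
        # Keep only the two most recent screenshots in memory so prompts stay lean
        for step in agent.memory.steps:
            if isinstance(step, ActionStep) and step.step_number <= memory_step.step_number - 2:
                step.observations_images = None
        memory_step.observations_images = [image.copy()]

    def store_metadata_to_file(self, final_answer, memory) -> bool:
        # Drop images (not JSON-serializable), then persist the message history
        for step in self.memory.steps:
            if getattr(step, "observations_images", None):
                step.observations_images = None
        with open(os.path.join(self.data_dir, "metadata.json"), "w") as f:
            f.write(json.dumps(self.write_memory_to_messages()))
        return True

Registering store_metadata_to_file through final_answer_checks rather than a step callback means the dump happens when the run is about to finish, not after every intermediate step.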
@@ -356,6 +353,87 @@ class E2BVisionAgent(CodeAgent):
            self.desktop.kill()
            print("E2B sandbox terminated")

+from smolagents import HfApiModel
+
+# class QwenVLAPIModel(Model):
+#     """Model wrapper for Qwen2.5VL API with fallback mechanism"""
+
+#     def __init__(
+#         self,
+#         model_path: str = "Qwen/Qwen2.5-VL-72B-Instruct",
+#         provider: str = "hyperbolic",
+#         hf_token: str = None,
+#         hf_base_url: str = "https://n5wr7lfx6wp94tvl.us-east-1.aws.endpoints.huggingface.cloud"
+#     ):
+#         super().__init__()
+#         self.model_id = model_path
+#         self.hf_base_url = hf_base_url
+#         self.dedicated_endpoint_model = HfApiModel(
+#             hf_base_url,
+#             token=hf_token
+#         )
+#         self.fallback_model = HfApiModel(
+#             model_path,
+#             provider=provider,
+#             token=hf_token,
+#         )
+
+#     def __call__(
+#         self,
+#         messages: List[Dict[str, Any]],
+#         stop_sequences: Optional[List[str]] = None,
+#         **kwargs
+#     ) -> ChatMessage:
+
+#         try:
+#             return self.dedicated_endpoint_model(messages, stop_sequences, **kwargs)
+#         except Exception as e:
+#             print(f"HF endpoint failed with error: {e}. Falling back to hyperbolic.")
+
+#         # Continue to fallback
+#         try:
+#             return self.fallback_model(messages, stop_sequences, **kwargs)
+#         except Exception as e:
+#             raise Exception(f"Both endpoints failed. Last error: {e}")
+
+#     def _format_messages(self, messages: List[Dict[str, Any]]):
+#         """Format messages for API requests - works for both endpoints"""
+
+#         formatted_messages = []
+
+#         for msg in messages:
+#             role = msg["role"]
+#             content = []
+
+#             if isinstance(msg["content"], list):
+#                 for item in msg["content"]:
+#                     if item["type"] == "text":
+#                         content.append({"type": "text", "text": item["text"]})
+#                     elif item["type"] == "image":
+#                         # Handle image path or direct image object
+#                         if isinstance(item["image"], str):
+#                             # Image is a path
+#                             with open(item["image"], "rb") as image_file:
+#                                 base64_image = base64.b64encode(image_file.read()).decode("utf-8")
+#                         else:
+#                             # Image is a PIL image or similar object
+#                             img_byte_arr = BytesIO()
+#                             item["image"].save(img_byte_arr, format="PNG")
+#                             base64_image = base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
+
+#                         content.append({
+#                             "type": "image_url",
+#                             "image_url": {
+#                                 "url": f"data:image/png;base64,{base64_image}"
+#                             }
+#                         })
+#             else:
+#                 # Plain text message
+#                 content = [{"type": "text", "text": msg["content"]}]
+
+#             formatted_messages.append({"role": role, "content": content})
+
+#         return formatted_messages

class QwenVLAPIModel(Model):
    """Model wrapper for Qwen2.5VL API with fallback mechanism"""
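The commented-out class above points at a simpler design than the hand-rolled client code kept below: build both the dedicated endpoint and the hosted fallback on smolagents' HfApiModel and chain the two calls. A minimal sketch of that idea, reusing the endpoint URL, model path, and provider name from the commented code (taken on trust from this diff, not verified against a live endpoint):

from smolagents import HfApiModel

class FallbackQwenModel:
    """Try a dedicated HF inference endpoint first, then fall back to a hosted provider."""

    def __init__(
        self,
        model_path: str = "Qwen/Qwen2.5-VL-72B-Instruct",
        provider: str = "hyperbolic",
        hf_token: str = None,
        hf_base_url: str = "https://n5wr7lfx6wp94tvl.us-east-1.aws.endpoints.huggingface.cloud",
    ):
        # As in the commented code, the endpoint URL is passed where a model id would go.
        self.dedicated_endpoint_model = HfApiModel(hf_base_url, token=hf_token)
        self.fallback_model = HfApiModel(model_path, provider=provider, token=hf_token)

    def __call__(self, messages, stop_sequences=None, **kwargs):
        try:
            return self.dedicated_endpoint_model(messages, stop_sequences, **kwargs)
        except Exception as e:
            print(f"Dedicated endpoint failed with error: {e}. Falling back to the provider.")
        return self.fallback_model(messages, stop_sequences, **kwargs)

The point of the sketch is that once both backends share the HfApiModel interface, the fallback logic shrinks to a single try/except.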
@@ -401,18 +479,18 @@ class QwenVLAPIModel(Model):
        # Format messages once for both APIs
        formatted_messages = self._format_messages(messages)

-       # First try the HF endpoint if available
-       if self.hf_client:
-
-
-
-
-
-
-
-
-
-
+       # First try the HF endpoint if available - THIS ALWAYS FAILS SO SKIPPING
+       # if self.hf_client:
+       #     try:
+       #         completion = self._call_hf_endpoint(
+       #             formatted_messages,
+       #             stop_sequences,
+       #             **kwargs
+       #         )
+       #         return ChatMessage(role=MessageRole.ASSISTANT, content=completion)
+       #     except Exception as e:
+       #         print(f"HF endpoint failed with error: {e}. Falling back to hyperbolic.")
+       #         # Continue to fallback

        # Fallback to hyperbolic
        try:
@@ -442,7 +520,6 @@ class QwenVLAPIModel(Model):
            else:
                # Image is a PIL image or similar object
                img_byte_arr = BytesIO()
-               item["image"].save(img_byte_arr, format="PNG")
                base64_image = base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")

            content.append({
@@ -463,7 +540,7 @@ class QwenVLAPIModel(Model):
        """Call the Hugging Face OpenAI-compatible endpoint"""

        # Extract parameters with defaults
-       max_tokens = kwargs.get("max_new_tokens",
+       max_tokens = kwargs.get("max_new_tokens", 1024)
        temperature = kwargs.get("temperature", 0.7)
        top_p = kwargs.get("top_p", 0.9)
        stream = kwargs.get("stream", False)
@@ -494,9 +571,10 @@ class QwenVLAPIModel(Model):
        completion = self.hyperbolic_client.chat.completions.create(
            model=self.model_path,
            messages=formatted_messages,
-           max_tokens=kwargs.get("max_new_tokens",
+           max_tokens=kwargs.get("max_new_tokens", 1024),
            temperature=kwargs.get("temperature", 0.7),
            top_p=kwargs.get("top_p", 0.9),
+           stop=stop_sequences
        )

        # Extract the response text
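One last reference point for the _format_messages hunk: a PIL image only reaches the base64 payload after it has been written into the in-memory buffer. A small self-contained sketch of that encoding step (plain PIL and base64 usage, independent of this repo):

import base64
from io import BytesIO

from PIL import Image

def pil_to_data_url(image: Image.Image) -> str:
    # Serialize the PIL image into an in-memory PNG, then base64-encode it;
    # without the save() call the buffer would stay empty.
    buf = BytesIO()
    image.save(buf, format="PNG")
    b64 = base64.b64encode(buf.getvalue()).decode("utf-8")
    return f"data:image/png;base64,{b64}"

# Example: pil_to_data_url(Image.new("RGB", (8, 8))) returns "data:image/png;base64,..."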