Gary Simmons committed
Commit 5557a0e · Parent(s): 13127ce

enhance RateLimitedModel: move retry logic into the model and make its parameters configurable
app.py CHANGED

@@ -61,10 +61,19 @@ class TokenBucketRateLimiter:
 
 
 class RateLimitedModel:
-    """Wraps a model-like callable and enforces a TokenBucketRateLimiter before each call."""
-
-    def __init__(self, model_obj, rpm: int = 8, burst: int | None = None):
+    """Wraps a model-like callable and enforces a TokenBucketRateLimiter before each call with retry logic."""
+
+    def __init__(
+        self,
+        model_obj,
+        rpm: int = 8,
+        burst: int | None = None,
+        max_retries: int = 10,
+        base_delay: float = 30.0,
+    ):
         self._model = model_obj
+        self.max_retries = max_retries
+        self.base_delay = base_delay
         # rpm -> tokens per minute
         capacity = burst if burst is not None else max(1, rpm)
         refill_rate = float(rpm) / 60.0
@@ -72,24 +81,65 @@
             capacity=capacity, refill_rate=refill_rate
         )
 
+    def _call_with_retry(self, func, *args, **kwargs):
+        """Call a function with retry logic for rate limit errors."""
+        last_exception = None
+        for attempt in range(1, self.max_retries + 1):
+            try:
+                # Apply rate limiting before each attempt
+                wait = self._limiter.acquire(1.0)
+                if wait > 0:
+                    jitter = random.uniform(0.0, 0.5)
+                    total_wait = wait + jitter
+                    print(
+                        f"RateLimitedModel sleeping {total_wait:.2f}s to respect RPM limit"
+                    )
+                    time.sleep(total_wait)
+
+                print(f"Model call attempt {attempt} of {self.max_retries}")
+                result = func(*args, **kwargs)
+                print(f"Model call attempt {attempt} succeeded")
+                return result
+
+            except Exception as e:
+                last_exception = e
+                error_str = str(e).lower()
+
+                # Check if this is a rate limit error (various ways it might be reported)
+                is_rate_limit = (
+                    isinstance(e, RateLimitError)
+                    or "rate limit" in error_str
+                    or "quota" in error_str
+                    or "429" in error_str
+                    or "resource_exhausted" in error_str
+                    or "too many requests" in error_str
+                )
+
+                if is_rate_limit:
+                    print(f"Rate limit error on attempt {attempt}: {e}")
+                    if attempt < self.max_retries:
+                        # Use a longer delay for rate limit errors
+                        delay = self.base_delay + random.uniform(0, 5)
+                        print(f"Waiting {delay:.1f}s before retry {attempt + 1}...")
+                        time.sleep(delay)
+                        continue
+                else:
+                    # Non-rate-limit error, don't retry
+                    print(f"Non-rate-limit error on attempt {attempt}: {e}")
+                    raise e
+
+        # All retries exhausted
+        print(f"All {self.max_retries} attempts failed. Raising last exception.")
+        raise last_exception
+
     def __call__(self, *args, **kwargs):
-
-        # inside smolagents may still produce multiple requests; this
-        # aims to protect against too many top-level calls per-minute.
-        wait = self._limiter.acquire(1.0)
-        if wait > 0:
-            # Small jitter to avoid stampede
-            jitter = random.uniform(0.0, 0.5)
-            total_wait = wait + jitter
-            print(f"RateLimitedModel sleeping {total_wait:.2f}s to respect RPM limit")
-            time.sleep(total_wait)
-        return self._model(*args, **kwargs)
+        return self._call_with_retry(self._model, *args, **kwargs)
 
     def __getattr__(self, name: str):
         """Proxy attribute access to the underlying model.
 
         For callable attributes (like `generate`) we wrap the call so the
-        token-bucket rate limiter
+        token-bucket rate limiter and retry logic are applied consistently.
         """
         # Avoid recursion
         if name.startswith("_"):
@@ -100,15 +150,7 @@ class RateLimitedModel:
         if callable(attr):
 
             def wrapped(*args, **kwargs):
-                wait = self._limiter.acquire(1.0)
-                if wait > 0:
-                    jitter = random.uniform(0.0, 0.5)
-                    total_wait = wait + jitter
-                    print(
-                        f"RateLimitedModel sleeping {total_wait:.2f}s to respect RPM limit"
-                    )
-                    time.sleep(total_wait)
-                return attr(*args, **kwargs)
+                return self._call_with_retry(attr, *args, **kwargs)
 
             # Preserve original metadata where possible
             try:
@@ -120,14 +162,19 @@ class RateLimitedModel:
         return attr
 
 
-# Wrap the model with a rate-limiter. Default RPM is reduced to 8
+# Wrap the model with a rate-limiter and retry logic. Default RPM is reduced to 8
 # but can be configured via the MODEL_RPM environment variable.
 _configured_rpm = int(os.getenv("MODEL_RPM", "8"))
 _configured_burst = None
+_configured_max_retries = int(os.getenv("MODEL_MAX_RETRIES", "10"))
+_configured_base_delay = float(os.getenv("MODEL_BASE_DELAY", "30.0"))
+
 model = RateLimitedModel(
     LiteLLMModel(model_id="gemini/gemini-2.5-flash", temperature=0.2),
     rpm=_configured_rpm,
     burst=_configured_burst,
+    max_retries=_configured_max_retries,
+    base_delay=_configured_base_delay,
 )
 
 
@@ -183,56 +230,25 @@ class BasicAgent:
 
     def __call__(self, question: str) -> str:
        print(f"Agent received question (first 50 chars): {question[:50]}...")
+        print(f"Starting agent execution with model retry logic enabled...")
+
+        start_time = time.time()
         try:
-            #
-            #
-            response = call_model_with_retry(self.code_agent, question)
+            # The retry logic is now handled at the model level within RateLimitedModel
+            # so we can call the agent directly
+            response = self.code_agent(question)
+
+            duration = time.time() - start_time
+            print(f"Agent completed successfully in {duration:.1f}s")
             print(f"Agent returning response: {response}")
             return response
         except Exception as e:
+            duration = time.time() - start_time
+            print(f"Error in code agent after {duration:.1f}s: {e}")
             return f"AGENT ERROR: {e}"
 
 
-# Note: The backoff_multiplier was changed from 3 to 4, which increases the delay between retries exponentially.
-# This means that after each failed attempt, the wait time before the next retry will grow more rapidly,
-# potentially resulting in significantly longer total retry durations.
-
 
-def call_model_with_retry(
-    callable_fn,
-    *args,
-    max_retries=15,
-    initial_delay=10.0,
-    rate_limit_delay=60.0,
-    **kwargs,
-):
-    """
-    Calls a function with retry logic and a configurable wait on rate-limit errors.
-    Retries up to `max_retries` times, waiting `rate_limit_delay` seconds between attempts after a RateLimitError.
-    """
-    delay = initial_delay
-    # Initial delay before the first attempt in case a very recent call occurred
-    time.sleep(delay)
-    for attempt in range(1, max_retries + 1):
-        try:
-            print(f"Attempt {attempt} of {max_retries}...")
-            return callable_fn(*args, **kwargs)
-        except RateLimitError as e:
-            # If we've exhausted retries, re-raise
-            print(f"RateLimitError on attempt {attempt}: {e}")
-            print(f"max_retries={max_retries}, attempt={attempt}")
-            if attempt == max_retries:
-                raise
-            # Wait a configurable delay on rate-limit errors instead of recalculating
-            # using exponential backoff. This avoids growing wait times and keeps
-            # retry behavior predictable.
-            print(
-                f"RateLimitError encountered (attempt {attempt}/{max_retries}). Waiting {rate_limit_delay}s before retrying..."
-            )
-            time.sleep(rate_limit_delay)
 
 
 def run_and_submit_all(profile: gr.OAuthProfile | None):
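For reference, the call sites in this diff pin down the limiter interface RateLimitedModel depends on: TokenBucketRateLimiter is constructed with capacity and refill_rate keywords, and its acquire(tokens) method returns how many seconds the caller should sleep. The commit does not touch the limiter itself; the following is a minimal sketch consistent with that interface, not the actual implementation in app.py.

import threading
import time


class TokenBucketRateLimiter:
    """Sketch: a bucket holds up to `capacity` tokens, refilled at `refill_rate` tokens/second."""

    def __init__(self, capacity: float, refill_rate: float):
        self.capacity = float(capacity)
        self.refill_rate = float(refill_rate)  # e.g. rpm / 60.0
        self._tokens = float(capacity)
        self._last = time.monotonic()
        self._lock = threading.Lock()

    def acquire(self, tokens: float = 1.0) -> float:
        """Deduct `tokens` and return the seconds the caller should sleep (0.0 if none)."""
        with self._lock:
            now = time.monotonic()
            # Refill proportionally to elapsed time, capped at capacity.
            self._tokens = min(
                self.capacity, self._tokens + (now - self._last) * self.refill_rate
            )
            self._last = now
            self._tokens -= tokens
            if self._tokens >= 0.0:
                return 0.0
            # Bucket overdrawn: wait until the deficit refills.
            return -self._tokens / self.refill_rate

With the default rpm=8 this gives refill_rate = 8 / 60 ≈ 0.133 tokens/second, so once the burst capacity is spent, top-level calls are spaced roughly 7.5 s apart (plus the small jitter added in _call_with_retry).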
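To exercise the new retry path end to end without real API credentials, one can wrap a stand-in model the same way app.py wraps LiteLLMModel. In this sketch, FlakyModel and the small demo base_delay are illustration-only assumptions; the environment variable names (MODEL_RPM, MODEL_MAX_RETRIES) are the ones the commit reads, and the snippet assumes RateLimitedModel from app.py is in scope.

import os

# Illustration-only stand-in: raises a 429-style error twice, then succeeds.
class FlakyModel:
    def __init__(self):
        self.calls = 0

    def __call__(self, prompt):
        self.calls += 1
        if self.calls < 3:
            # Matched by the "429" / "too many requests" substring checks in _call_with_retry
            raise RuntimeError("429 Too Many Requests")
        return f"answer to: {prompt}"


model = RateLimitedModel(
    FlakyModel(),
    rpm=int(os.getenv("MODEL_RPM", "8")),
    max_retries=int(os.getenv("MODEL_MAX_RETRIES", "10")),
    base_delay=1.0,  # tiny delay for the demo; app.py defaults to 30.0 s
)
print(model("What is 2 + 2?"))  # retries through the two simulated rate limits, then answers

Because the two simulated failures contain "429", _call_with_retry classifies them as rate-limit errors and sleeps base_delay plus jitter before each retry; any other exception is re-raised immediately on the first attempt.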