Gary Simmons
committed on
Commit
·
4e00399
1
Parent(s):
561fce6
reduce default RPM in RateLimitedModel to 8 and update backoff_multiplier to 4 for enhanced retry logic
Browse files
app.py
CHANGED
|
@@ -61,7 +61,7 @@ class TokenBucketRateLimiter:
|
|
| 61 |
class RateLimitedModel:
|
| 62 |
"""Wraps a model-like callable and enforces a TokenBucketRateLimiter before each call."""
|
| 63 |
|
| 64 |
-
def __init__(self, model_obj, rpm: int =
|
| 65 |
self._model = model_obj
|
| 66 |
# rpm -> tokens per minute
|
| 67 |
capacity = burst if burst is not None else max(1, rpm)
|
|
@@ -118,9 +118,9 @@ class RateLimitedModel:
|
|
| 118 |
return attr
|
| 119 |
|
| 120 |
|
| 121 |
-
# Wrap the model with a rate-limiter. Default RPM is
|
| 122 |
# but can be configured via the MODEL_RPM environment variable.
|
| 123 |
-
_configured_rpm = int(os.getenv("MODEL_RPM", "
|
| 124 |
_configured_burst = None
|
| 125 |
model = RateLimitedModel(
|
| 126 |
LiteLLMModel(model_id="gemini/gemini-2.5-flash", temperature=0.2),
|
|
@@ -162,12 +162,16 @@ class BasicAgent:
|
|
| 162 |
return f"AGENT ERROR: {e}"
|
| 163 |
|
| 164 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
def call_model_with_retry(
|
| 166 |
-
callable_fn, *args, max_retries=15, initial_delay=10.0, backoff_multiplier=3, **kwargs
|
| 167 |
):
|
| 168 |
"""
|
| 169 |
Calls a function with retry logic and exponential backoff.
|
| 170 |
-
The backoff multiplier is configurable (default=3).
|
| 171 |
"""
|
| 172 |
delay = initial_delay
|
| 173 |
for attempt in range(1, max_retries + 1):
|
|
|
|
| 61 |
class RateLimitedModel:
|
| 62 |
"""Wraps a model-like callable and enforces a TokenBucketRateLimiter before each call."""
|
| 63 |
|
| 64 |
+
def __init__(self, model_obj, rpm: int = 8, burst: int | None = None):
|
| 65 |
self._model = model_obj
|
| 66 |
# rpm -> tokens per minute
|
| 67 |
capacity = burst if burst is not None else max(1, rpm)
|
|
|
|
| 118 |
return attr
|
| 119 |
|
| 120 |
|
| 121 |
+
# Wrap the model with a rate-limiter. Default RPM is reduced to 8
|
| 122 |
# but can be configured via the MODEL_RPM environment variable.
|
| 123 |
+
_configured_rpm = int(os.getenv("MODEL_RPM", "8"))
|
| 124 |
_configured_burst = None
|
| 125 |
model = RateLimitedModel(
|
| 126 |
LiteLLMModel(model_id="gemini/gemini-2.5-flash", temperature=0.2),
|
|
|
|
| 162 |
return f"AGENT ERROR: {e}"
|
| 163 |
|
| 164 |
|
| 165 |
+
# Note: The backoff_multiplier was changed from 3 to 4, which increases the delay between retries exponentially.
|
| 166 |
+
# This means that after each failed attempt, the wait time before the next retry will grow more rapidly,
|
| 167 |
+
# potentially resulting in significantly longer total retry durations.
|
| 168 |
+
|
| 169 |
def call_model_with_retry(
|
| 170 |
+
callable_fn, *args, max_retries=15, initial_delay=10.0, backoff_multiplier=4, **kwargs
|
| 171 |
):
|
| 172 |
"""
|
| 173 |
Calls a function with retry logic and exponential backoff.
|
| 174 |
+
The backoff multiplier is configurable (default=4 for more aggressive backoff).
|
| 175 |
"""
|
| 176 |
delay = initial_delay
|
| 177 |
for attempt in range(1, max_retries + 1):
|