Spaces:

mgbam
/

builder

Running

App Files Files Community

mgbam committed on Jul 25

Commit

639177c

verified ·

1 Parent(s): 4610542

Update inference.py

Browse files

Files changed (1) hide show

inference.py +62 -28

inference.py CHANGED Viewed

@@ -1,37 +1,67 @@
 # inference.py
-from typing import List, Dict, Optional
 from hf_client import get_inference_client
 from models import find_model
 def chat_completion(
     model_id: str,
     messages: List[Dict[str, str]],
     provider: Optional[str] = None,
-    max_tokens: int = 4096
 ) -> str:
     """
-    Send a chat completion request to the appropriate inference provider.
-    Args:
-        model_id: The model identifier to use.
-        messages: A list of OpenAI-style {'role','content'} messages.
-        provider: Optional override for provider; uses model default if None.
-        max_tokens: Maximum tokens to generate.
-    Returns:
-        The assistant's response content.
     """
-    # resolve default provider from registry if needed
-    if provider is None:
-        meta = find_model(model_id)
-        provider = meta.default_provider if meta else "auto"
-    client = get_inference_client(model_id, provider)
     resp = client.chat.completions.create(
         model=model_id,
         messages=messages,
-        max_tokens=max_tokens
     )
     return resp.choices[0].message.content
@@ -40,24 +70,28 @@ def stream_chat_completion(
     model_id: str,
     messages: List[Dict[str, str]],
     provider: Optional[str] = None,
-    max_tokens: int = 4096
-):
     """
-    Generator for streaming chat completions.
-    Yields partial message chunks as strings.
-    """
-    if provider is None:
-        meta = find_model(model_id)
-        provider = meta.default_provider if meta else "auto"
-    client = get_inference_client(model_id, provider)
     stream = client.chat.completions.create(
         model=model_id,
         messages=messages,
         max_tokens=max_tokens,
-        stream=True
     )
     for chunk in stream:
-        delta = getattr(chunk.choices[0].delta, "content", None)
         if delta:
             yield delta

 # inference.py
+# -------------------------------------------------------------
+# Unified wrapper around hf_client.get_inference_client
+# with automatic provider‑routing based on model registry
+# (see models.py) and graceful fall‑back to Groq.
+# -------------------------------------------------------------
+from __future__ import annotations
+from typing import Dict, Generator, List, Optional
 from hf_client import get_inference_client
 from models import find_model
+# ------------------------------------------------------------------
+# Helpers
+# ------------------------------------------------------------------
+def _resolve_provider(model_id: str, override: str | None) -> str:
+    """
+    Decide which provider to use.
+    Priority:
+    1. Explicit *override* arg supplied by caller.
+    2. Model registry default_provider (see models.py).
+    3. "auto" – lets HF route to the first available provider.
+    """
+    if override:
+        return override
+    meta = find_model(model_id)
+    return getattr(meta, "default_provider", "auto") if meta else "auto"
+# ------------------------------------------------------------------
+# Public API
+# ------------------------------------------------------------------
 def chat_completion(
     model_id: str,
     messages: List[Dict[str, str]],
     provider: Optional[str] = None,
+    max_tokens: int = 4096,
+    **kwargs,
 ) -> str:
     """
+    Blocking convenience wrapper – returns the full assistant reply.
+    Parameters
+    ----------
+    model_id      : HF or provider‑qualified model path (e.g. "openai/gpt-4").
+    messages      : OpenAI‑style [{'role': ..., 'content': ...}, …].
+    provider      : Optional provider override; otherwise auto‑resolved.
+    max_tokens    : Token budget for generation.
+    kwargs        : Forward‑compatible extra arguments (temperature, etc.).
+    Returns
+    -------
+    str – assistant message content.
     """
+    client = get_inference_client(model_id, _resolve_provider(model_id, provider))
     resp = client.chat.completions.create(
         model=model_id,
         messages=messages,
+        max_tokens=max_tokens,
+        **kwargs,
     )
     return resp.choices[0].message.content
     model_id: str,
     messages: List[Dict[str, str]],
     provider: Optional[str] = None,
+    max_tokens: int = 4096,
+    **kwargs,
+) -> Generator[str, None, None]:
     """
+    Yield the assistant response *incrementally*.
+    Example
+    -------
+    >>> for chunk in stream_chat_completion(model, msgs):
+    ...     print(chunk, end='', flush=True)
+    """
+    client = get_inference_client(model_id, _resolve_provider(model_id, provider))
     stream = client.chat.completions.create(
         model=model_id,
         messages=messages,
         max_tokens=max_tokens,
+        stream=True,
+        **kwargs,
     )
+    # HF Inference returns chunks with .choices[0].delta.content
     for chunk in stream:
+        delta: str | None = getattr(chunk.choices[0].delta, "content", None)
         if delta:
             yield delta