Maximofn committed on
Commit e55b892 · Parent: 543aa6c

Switch from local SmolLM2 model to Qwen2.5-72B-Instruct via HuggingFace Inference API


- Replace local model loading with HuggingFace InferenceClient
- Update call_model function to use new API-based chat completion method
- Simplify model initialization and remove local model loading code
- Switch from SmolLM2-135M-Instruct to Qwen2.5-72B-Instruct model
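
For reference, the API-based call pattern this commit adopts can be exercised on its own. The snippet below is an illustrative sketch, not part of the commit; it assumes a valid HUGGINGFACE_TOKEN is exported in the environment and a huggingface_hub version that accepts the api_key alias (older releases take token= instead).

import os
from huggingface_hub import InferenceClient

# Standalone check of the Inference API path (illustrative, not from the commit)
client = InferenceClient(
    model="Qwen/Qwen2.5-72B-Instruct",
    api_key=os.environ["HUGGINGFACE_TOKEN"],  # assumes the token is set
)

response = client.chat_completion(
    messages=[{"role": "user", "content": "Hello!"}],
    max_tokens=64,     # same generation settings the commit uses
    temperature=0.5,
    top_p=0.7,
)
print(response.choices[0].message.content)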

Files changed (1): app.py (+20 -48)
app.py CHANGED
@@ -1,6 +1,6 @@
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
-from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+from huggingface_hub import InferenceClient

 from langchain_core.messages import HumanMessage, AIMessage
 from langgraph.checkpoint.memory import MemorySaver
@@ -10,47 +10,19 @@ import os
 from dotenv import load_dotenv
 load_dotenv()

-# Configure cache directory
-os.environ["TRANSFORMERS_CACHE"] = "/tmp/transformers_cache"
-os.environ["HF_HOME"] = "/tmp/hf_home"
-
-# HuggingFace token
 HUGGINGFACE_TOKEN = os.environ.get("HUGGINGFACE_TOKEN", os.getenv("HUGGINGFACE_TOKEN"))
-print(f"Token HuggingFace: {HUGGINGFACE_TOKEN}")
-
-# Model to use
-MODEL_NAME = "HuggingFaceTB/SmolLM2-135M-Instruct"
+print(HUGGINGFACE_TOKEN)

-# Initialize the model and tokenizer locally with authentication
-print(f"Loading model {MODEL_NAME} locally...")
-tokenizer = AutoTokenizer.from_pretrained(
-    MODEL_NAME,
-    token=HUGGINGFACE_TOKEN,  # Add token for authentication
-    cache_dir="/tmp/transformers_cache"  # Specify cache directory
-)
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_NAME,
-    device_map="auto",
-    token=HUGGINGFACE_TOKEN,  # Add token for authentication
-    cache_dir="/tmp/transformers_cache"  # Specify cache directory
-)
-
-# Create a pipeline to facilitate generation
-generator = pipeline(
-    "text-generation",
-    model=model,
-    tokenizer=tokenizer,
-    max_new_tokens=64,
-    do_sample=True,
-    temperature=0.5,
-    top_p=0.7,
-    pad_token_id=tokenizer.eos_token_id
+# Initialize the HuggingFace model
+model = InferenceClient(
+    model="Qwen/Qwen2.5-72B-Instruct",
+    api_key=os.getenv("HUGGINGFACE_TOKEN")
 )

 # Define the function that calls the model
 def call_model(state: MessagesState):
     """
-    Call the local model with the given messages
+    Call the model with the given messages

     Args:
         state: MessagesState
@@ -58,24 +30,24 @@ def call_model(state: MessagesState):
     Returns:
         dict: A dictionary containing the generated text and the thread ID
     """
-    # System prompt to guide the model's behavior
-    system_prompt = "You are a friendly Chatbot. Always reply in the language in which the user is writing to you."
-
-    # Convert LangChain messages to a format that the local model can understand
-    prompt = f"System: {system_prompt}\n\n"
+    # Convert LangChain messages to HuggingFace format
+    hf_messages = []
     for msg in state["messages"]:
         if isinstance(msg, HumanMessage):
-            prompt += f"User: {msg.content}\n"
+            hf_messages.append({"role": "user", "content": msg.content})
         elif isinstance(msg, AIMessage):
-            prompt += f"Assistant: {msg.content}\n"
-
-    prompt += "Assistant: "
+            hf_messages.append({"role": "assistant", "content": msg.content})

-    # Generate response with the local model
-    response = generator(prompt, return_full_text=False)[0]['generated_text']
+    # Call the API
+    response = model.chat_completion(
+        messages=hf_messages,
+        temperature=0.5,
+        max_tokens=64,
+        top_p=0.7
+    )

-    # Convert the response to the LangChain format
-    ai_message = AIMessage(content=response.strip())
+    # Convert the response to LangChain format
+    ai_message = AIMessage(content=response.choices[0].message.content)
     return {"messages": state["messages"] + [ai_message]}

 # Define the graph
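
The message-conversion loop is the core of the new call_model. As a hedged sketch of the same mapping in isolation (the to_hf_messages helper below is hypothetical, not in the commit):

from langchain_core.messages import HumanMessage, AIMessage

def to_hf_messages(messages):
    # Hypothetical helper (not in the commit): replicates the loop in
    # call_model that maps LangChain messages to chat_completion dicts.
    hf_messages = []
    for msg in messages:
        if isinstance(msg, HumanMessage):
            hf_messages.append({"role": "user", "content": msg.content})
        elif isinstance(msg, AIMessage):
            hf_messages.append({"role": "assistant", "content": msg.content})
    return hf_messages

# A two-turn history becomes two role/content dicts:
history = [HumanMessage(content="Hi"), AIMessage(content="Hello!")]
assert to_hf_messages(history) == [
    {"role": "user", "content": "Hi"},
    {"role": "assistant", "content": "Hello!"},
]

The design change: instead of concatenating the history into a single prompt string for a local text-generation pipeline, the new code sends structured role/content messages, which is the format chat_completion expects.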