Maximofn committed on
Commit 34aabf7 · 1 Parent(s): 167addd

Switch from HuggingFace InferenceClient to local model loading


- Replace InferenceClient with local model loading using transformers library
- Use AutoModelForCausalLM and AutoTokenizer for direct model initialization
- Create a text generation pipeline with custom generation parameters (see the smoke-test sketch after this list)
- Modify call_model function to work with local model generation
- Improve token and model loading logging
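For a quick sanity check of the new generation path, the snippet below is a minimal standalone sketch, not part of the commit: it loads the same model and pipeline parameters as the diff, builds the same "User:/Assistant:" prompt shape that call_model now uses, and prints one reply. The example question is made up for illustration, and the token arguments are omitted here since they are only needed for gated or private repositories.

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

MODEL_NAME = "HuggingFaceTB/SmolLM2-1.7B-Instruct"

# Load the tokenizer and model locally, as the commit does in app.py
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto")

# Same generation parameters as the new pipeline in the diff below
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=64,
    do_sample=True,
    temperature=0.5,
    top_p=0.7,
    pad_token_id=tokenizer.eos_token_id,
)

# call_model builds a plain "User:/Assistant:" prompt and keeps only the new text
prompt = "User: What is the capital of France?\nAssistant: "
reply = generator(prompt, return_full_text=False)[0]["generated_text"]
print(reply.strip())

Note that the commit formats the conversation as a plain "User:/Assistant:" string rather than applying the tokenizer's chat template (tokenizer.apply_chat_template), which is an alternative worth considering for an instruct-tuned checkpoint.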

Files changed (1)
  1. app.py +39 -20
app.py CHANGED
@@ -1,6 +1,6 @@
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
-from huggingface_hub import InferenceClient
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 
 from langchain_core.messages import HumanMessage, AIMessage
 from langgraph.checkpoint.memory import MemorySaver
@@ -10,19 +10,41 @@ import os
 from dotenv import load_dotenv
 load_dotenv()
 
+# HuggingFace token
 HUGGINGFACE_TOKEN = os.environ.get("HUGGINGFACE_TOKEN", os.getenv("HUGGINGFACE_TOKEN"))
-print(HUGGINGFACE_TOKEN)
+print(f"Token HuggingFace: {HUGGINGFACE_TOKEN}")
 
-# Initialize the HuggingFace model
-model = InferenceClient(
-    model="HuggingFaceTB/SmolLM2-1.7B-Instruct",
-    api_key=os.getenv("HUGGINGFACE_TOKEN")
+# Model to use
+MODEL_NAME = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
+
+# Initialize the model and tokenizer locally with authentication
+print(f"Loading model {MODEL_NAME} locally...")
+tokenizer = AutoTokenizer.from_pretrained(
+    MODEL_NAME,
+    token=HUGGINGFACE_TOKEN  # Add token for authentication
+)
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_NAME,
+    device_map="auto",
+    token=HUGGINGFACE_TOKEN  # Add token for authentication
+)
+
+# Create a pipeline to facilitate generation
+generator = pipeline(
+    "text-generation",
+    model=model,
+    tokenizer=tokenizer,
+    max_new_tokens=64,
+    do_sample=True,
+    temperature=0.5,
+    top_p=0.7,
+    pad_token_id=tokenizer.eos_token_id
 )
 
 # Define the function that calls the model
 def call_model(state: MessagesState):
     """
-    Call the model with the given messages
+    Call the local model with the given messages
 
     Args:
         state: MessagesState
@@ -30,24 +52,21 @@ def call_model(state: MessagesState):
     Returns:
         dict: A dictionary containing the generated text and the thread ID
     """
-    # Convert LangChain messages to HuggingFace format
-    hf_messages = []
+    # Convert LangChain messages to a format that the local model can understand
+    prompt = ""
     for msg in state["messages"]:
         if isinstance(msg, HumanMessage):
-            hf_messages.append({"role": "user", "content": msg.content})
+            prompt += f"User: {msg.content}\n"
         elif isinstance(msg, AIMessage):
-            hf_messages.append({"role": "assistant", "content": msg.content})
+            prompt += f"Assistant: {msg.content}\n"
+
+    prompt += "Assistant: "
 
-    # Call the API
-    response = model.chat_completion(
-        messages=hf_messages,
-        temperature=0.5,
-        max_tokens=64,
-        top_p=0.7
-    )
+    # Generate response with the local model
+    response = generator(prompt, return_full_text=False)[0]['generated_text']
 
-    # Convert the response to LangChain format
-    ai_message = AIMessage(content=response.choices[0].message.content)
+    # Convert the response to the LangChain format
+    ai_message = AIMessage(content=response.strip())
     return {"messages": state["messages"] + [ai_message]}
 
 # Define the graph
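The hunk stops just before the graph definition, which this commit leaves untouched and which is not shown in the diff. For context, below is a minimal sketch of how call_model is typically wired into a LangGraph StateGraph with the MemorySaver checkpointer imported above; the node name, thread ID, and the echo stand-in for call_model are illustrative assumptions, not code from this repository.

from langchain_core.messages import AIMessage, HumanMessage
from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import START, MessagesState, StateGraph

def call_model(state: MessagesState):
    # Stand-in for the call_model defined in app.py above; just echoes the last message
    last = state["messages"][-1].content
    return {"messages": state["messages"] + [AIMessage(content=f"echo: {last}")]}

# Build the graph: a single node that runs the model on the accumulated messages
workflow = StateGraph(state_schema=MessagesState)
workflow.add_node("model", call_model)
workflow.add_edge(START, "model")

# Compile with an in-memory checkpointer so each thread keeps its own history
memory = MemorySaver()
graph = workflow.compile(checkpointer=memory)

config = {"configurable": {"thread_id": "demo-thread"}}
result = graph.invoke({"messages": [HumanMessage(content="Hello!")]}, config)
print(result["messages"][-1].content)

The thread_id in the config is what lets MemorySaver keep a separate conversation history per chat thread, which matches the "thread ID" mentioned in call_model's docstring.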