Maximofn committed on
Commit eae8d77 · 1 Parent(s): 8e2c98e

Replace HuggingFace Inference API with local Transformers model loading


- Switch from HuggingFace Inference Client to local model loading
- Use SmolLM2-1.7B-Instruct model instead of Qwen/Qwen2.5-72B-Instruct
- Add device detection and model loading with torch.bfloat16 (see the sketch after this list)
- Update model calling logic to use local model generation
- Improve token generation parameters
- Add print statements for model loading confirmation
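The device detection and bfloat16 loading described above follow the standard Transformers idiom. The sketch below is illustrative only; the float32 fallback for CPU-only machines is an assumption added here and is not part of this commit.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Assumption: fall back to float32 on CPU, since bfloat16 is mainly a GPU optimization.
dtype = torch.bfloat16 if device == "cuda" else torch.float32

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=dtype,
    device_map="auto",  # lets Accelerate place the weights on the available device(s)
)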

Files changed (1)
  1. app.py +37 -19
app.py CHANGED
@@ -1,6 +1,7 @@
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
-from huggingface_hub import InferenceClient
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
 
 from langchain_core.messages import HumanMessage, AIMessage
 from langgraph.checkpoint.memory import MemorySaver
@@ -10,15 +11,21 @@ import os
 from dotenv import load_dotenv
 load_dotenv()
 
-# HuggingFace token
-HUGGINGFACE_TOKEN = os.environ.get("HUGGINGFACE_TOKEN", os.getenv("HUGGINGFACE_TOKEN"))
-
-# Initialize the HuggingFace model
-model = InferenceClient(
-    model="Qwen/Qwen2.5-72B-Instruct",
-    api_key=os.getenv("HUGGINGFACE_TOKEN")
+# Initialize the model and tokenizer
+print("Cargando modelo y tokenizer...")
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model_name = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
+
+# Load the model in BF16 format for better performance and lower memory usage
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    torch_dtype=torch.bfloat16,
+    device_map="auto"  # This will automatically distribute the model across available GPUs
 )
 
+print(f"Modelo cargado en dispositivo: {device}")
+
 # Define the function that calls the model
 def call_model(state: MessagesState):
     """
@@ -30,24 +37,35 @@ def call_model(state: MessagesState):
     Returns:
         dict: A dictionary containing the generated text and the thread ID
     """
-    # Convert LangChain messages to HuggingFace format
-    hf_messages = []
+    # Convert LangChain messages to chat format
+    messages = []
     for msg in state["messages"]:
         if isinstance(msg, HumanMessage):
-            hf_messages.append({"role": "user", "content": msg.content})
+            messages.append({"role": "user", "content": msg.content})
         elif isinstance(msg, AIMessage):
-            hf_messages.append({"role": "assistant", "content": msg.content})
+            messages.append({"role": "assistant", "content": msg.content})
 
-    # Call the API
-    response = model.chat_completion(
-        messages=hf_messages,
-        temperature=0.5,
-        max_tokens=64,
-        top_p=0.7
+    # Prepare the input using the chat template
+    input_text = tokenizer.apply_chat_template(messages, tokenize=False)
+    inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
+
+    # Generate response
+    outputs = model.generate(
+        inputs,
+        max_new_tokens=512,  # Increase the number of tokens for longer responses
+        temperature=0.7,
+        top_p=0.9,
+        do_sample=True,
+        pad_token_id=tokenizer.eos_token_id
     )
 
+    # Decode and clean the response
+    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    # Extract only the assistant's response (after the last user message)
+    response = response.split("Assistant:")[-1].strip()
+
     # Convert the response to LangChain format
-    ai_message = AIMessage(content=response.choices[0].message.content)
+    ai_message = AIMessage(content=response)
     return {"messages": state["messages"] + [ai_message]}
 
 # Define the graph
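Note on the new decoding step: splitting the decoded text on "Assistant:" assumes that literal prefix appears in the model's chat template output, which may not hold for SmolLM2's ChatML-style template. A template-agnostic alternative, shown as a sketch under the assumption that `tokenizer`, `model`, and `device` are loaded as in the diff above, is to decode only the tokens generated after the prompt:

prompt_messages = [{"role": "user", "content": "What is the capital of France?"}]

# add_generation_prompt=True appends the assistant turn marker so the model
# starts a fresh reply instead of continuing the user message.
input_text = tokenizer.apply_chat_template(
    prompt_messages, tokenize=False, add_generation_prompt=True
)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)

outputs = model.generate(
    inputs,
    max_new_tokens=512,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
)

# Decode only the newly generated tokens, skipping the prompt, so the result
# does not depend on any literal "Assistant:" marker in the template.
reply = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True).strip()
print(reply)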