Robostar committed on
Commit 7b02262 · verified · 1 Parent(s): 1db6497

Update app.py

Files changed (1)
  1. app.py +11 -7
app.py CHANGED
@@ -5,25 +5,29 @@ import torch
 
 app = FastAPI()
 
+
 # Load the model and the tokenizer
 #model_name = "mistralai/Mistral-7B-Instruct-v0.1"  # Mistral 7B model
 #model_name = "HuggingFaceH4/zephyr-3b"
 #model_name = "serkanarslan/mistral-7b-mini-ft"
-model_name = "microsoft/phi-2"
-tokenizer = AutoTokenizer.from_pretrained(model_name)
+# Choose a smaller model for the free tier
+#model_name = "microsoft/phi-2"
+model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # You can switch to Phi-2, OpenChat, etc.
+
+# Load tokenizer & model
+tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
-    torch_dtype=torch.float16,
-    device_map="auto"  # Use the GPU if available
-)
+    torch_dtype=torch.float32,  # Use float32 for CPU compatibility
+).to("cpu")  # Force CPU use
 
-# Define the request format
+# Request format
 class ChatRequest(BaseModel):
     message: str
 
 @app.post("/chat")
 async def chat(request: ChatRequest):
-    inputs = tokenizer(request.message, return_tensors="pt").to("cuda")
+    inputs = tokenizer(request.message, return_tensors="pt").to("cpu")  # Send input to CPU
     output = model.generate(**inputs, max_length=100)
     response = tokenizer.decode(output[0], skip_special_tokens=True)
     return {"response": response}
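
For reference, a minimal sketch of exercising the /chat endpoint this file defines. It assumes the app is served with something like uvicorn app:app --host 0.0.0.0 --port 8000; the module name, host, and port are assumptions, not taken from this commit.

# Hypothetical client for the /chat endpoint defined in app.py.
# Assumes the FastAPI app is reachable at http://localhost:8000 (an assumption).
import requests

resp = requests.post(
    "http://localhost:8000/chat",
    json={"message": "Hello! What can you do?"},  # matches the ChatRequest schema
)
resp.raise_for_status()
print(resp.json()["response"])  # the endpoint returns {"response": ...}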