ariji1 committed
Commit a5c36c8 · verified · Parent(s): 4507e80

Update app.py

Files changed (1):
  1. app.py (+16 -24)
app.py CHANGED
@@ -1,29 +1,21 @@
 from fastapi import FastAPI
 from pydantic import BaseModel
-from llama_cpp import Llama
-import os
-import subprocess
-
-MODEL_PATH = "./model/mistral-7b-instruct-v0.1.Q4_K_M.gguf"
-MODEL_URL = "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q4_K_M.gguf"
-
-# Download model if not already present
-os.makedirs("model", exist_ok=True)
-if not os.path.exists(MODEL_PATH):
-    print("Downloading model...")
-    subprocess.run(["wget", MODEL_URL, "-O", MODEL_PATH], check=True)
-
-# Load the model
-llm = Llama(
-    model_path=MODEL_PATH,
-    n_ctx=2048,
-    n_threads=2,
-    n_batch=64,
-    use_mlock=True
+from ctransformers import AutoModelForCausalLM
+
+# Model configuration for ctransformers (CPU-friendly)
+MODEL_REPO_ID = "TheBloke/Mistral-7B-Instruct-v0.1-GGUF"
+MODEL_FILE = "mistral-7b-instruct-v0.1.Q4_K_M.gguf"
+
+# Load the model once at startup
+llm = AutoModelForCausalLM.from_pretrained(
+    MODEL_REPO_ID,
+    model_file=MODEL_FILE,
+    model_type="mistral",
+    gpu_layers=0,
+    context_length=2048,
 )
 
-# FastAPI app
-app = FastAPI(title="Mistral GGUF LLM API", version="1.0.0")
+app = FastAPI(title="Mistral GGUF LLM API (ctransformers)", version="1.0.0")
 
 class InferenceRequest(BaseModel):
     prompt: str
@@ -35,8 +27,8 @@ class InferenceResponse(BaseModel):
 @app.post("/infer", response_model=InferenceResponse)
 def infer(req: InferenceRequest):
     try:
-        result = llm(req.prompt, max_tokens=req.max_tokens, stop=["</s>"])
-        return InferenceResponse(output=result["choices"][0]["text"].strip())
+        generated_text = llm(req.prompt, max_new_tokens=req.max_tokens)
+        return InferenceResponse(output=str(generated_text).strip())
     except Exception as e:
         return InferenceResponse(output=f"Error generating response: {str(e)}")
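
For reference, a minimal sketch of calling the updated /infer endpoint. It assumes the app is served locally, e.g. with "uvicorn app:app --port 8000"; the host, port, and example prompt are illustrative and not part of this commit. The max_tokens field comes from the InferenceRequest model, which the handler forwards as max_new_tokens to the ctransformers call.

# client.py -- hypothetical smoke test for the /infer route
import requests

resp = requests.post(
    "http://localhost:8000/infer",
    json={
        # InferenceRequest fields: prompt, plus max_tokens
        # (mapped to max_new_tokens inside app.py)
        "prompt": "[INST] Summarize what GGUF quantization is. [/INST]",
        "max_tokens": 128,
    },
)
resp.raise_for_status()
print(resp.json()["output"])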