moriire committed on
Commit
609ebbf
·
verified ·
1 Parent(s): c4894e1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -10
app.py CHANGED
@@ -10,17 +10,26 @@ from pydantic import BaseModel
10
 
11
  class GenModel(BaseModel):
12
  question: str
13
- system: str = "You are a story writing assistant."
14
- temperature: float = 0.7
15
- seed: int = 42
16
 
17
- llama = llama_cpp.Llama.from_pretrained(
 
 
 
 
 
 
 
 
 
18
  repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
19
  filename="*q4_0.gguf",
20
  tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B"),
21
  verbose=False,
22
  n_ctx=4096,
23
- n_gpu_layers=0,
24
  #chat_format="llama-2"
25
  )
26
  # Logger setup
@@ -67,14 +76,14 @@ def health():
67
  return {"status": "ok"}
68
 
69
  # Chat Completion API
70
- @app.post("/generate/")
71
  async def complete(gen:GenModel):
72
  try:
73
  messages=[
74
  {"role": "system", "content": gen.system},
75
  ]
76
  st = time()
77
- output = llama.create_chat_completion(
78
  messages = messages,
79
  temperature=gen.temperature,
80
  seed=gen.seed,
@@ -104,16 +113,16 @@ async def complete(gen:GenModel):
104
  )
105
 
106
  # Chat Completion API
107
- @app.get("/generate_stream")
108
  async def complete(
109
  question: str,
110
- system: str = "You are a professional medical assistant.",
111
  temperature: float = 0.7,
112
  seed: int = 42,
113
  ) -> dict:
114
  try:
115
  st = time()
116
- output = llama.create_chat_completion(
117
  messages=[
118
  {"role": "system", "content": system},
119
  {"role": "user", "content": question},
 
10
 
11
  class GenModel(BaseModel):
12
  question: str
13
+ system: str = "You are a professional medical assistant."
14
+ temperature: float = 0.8
15
+ seed: int = 101
16
 
17
+ llm_chat = llama_cpp.Llama.from_pretrained(
18
+ repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
19
+ filename="*q4_0.gguf",
20
+ tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B"),
21
+ verbose=False,
22
+ n_ctx=1024,
23
+ n_gpu_layers=0,
24
+ #chat_format="llama-2"
25
+ )
26
+ llm_generate = llama_cpp.Llama.from_pretrained(
27
  repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
28
  filename="*q4_0.gguf",
29
  tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B"),
30
  verbose=False,
31
  n_ctx=4096,
32
+ n_gpu_layers=0,
33
  #chat_format="llama-2"
34
  )
35
  # Logger setup
 
76
  return {"status": "ok"}
77
 
78
  # Chat Completion API
79
+ @app.post("/chat/")
80
  async def complete(gen:GenModel):
81
  try:
82
  messages=[
83
  {"role": "system", "content": gen.system},
84
  ]
85
  st = time()
86
+ output = llm_chat.create_chat_completion(
87
  messages = messages,
88
  temperature=gen.temperature,
89
  seed=gen.seed,
 
113
  )
114
 
115
  # Chat Completion API
116
+ @app.get("/generate")
117
  async def complete(
118
  question: str,
119
+ system: str = "You are an AI assistant.",
120
  temperature: float = 0.7,
121
  seed: int = 42,
122
  ) -> dict:
123
  try:
124
  st = time()
125
+ output = llm_generate.create_chat_completion(
126
  messages=[
127
  {"role": "system", "content": system},
128
  {"role": "user", "content": question},