Spaces:
Runtime error
Runtime error
ncoop57
committed on
Commit
·
bab8078
1
Parent(s):
b399543
Get minimum working openai server
Browse files
- .gitignore +1 -0
- app.py +30 -4
- utils/codegen.py +25 -140
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
__pycache__/
|
app.py
CHANGED
@@ -1,6 +1,9 @@
|
|
1 |
import logging
|
2 |
import os
|
3 |
-
|
|
|
|
|
|
|
4 |
import uvicorn
|
5 |
from fastapi import FastAPI, Request, Response
|
6 |
from fastapi.responses import JSONResponse
|
@@ -8,12 +11,18 @@ from sse_starlette.sse import EventSourceResponse
|
|
8 |
|
9 |
from config.log_config import uvicorn_logger
|
10 |
from models import OpenAIinput
|
11 |
-
from utils.
|
12 |
from utils.errors import FauxPilotException
|
|
|
13 |
|
14 |
logging.config.dictConfig(uvicorn_logger)
|
15 |
|
16 |
-
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
codegen = CodeGenProxy(
|
19 |
host=os.environ.get("TRITON_HOST", "triton"),
|
@@ -42,7 +51,24 @@ async def completions(data: OpenAIinput):
|
|
42 |
data = data.dict()
|
43 |
try:
|
44 |
content = codegen(data=data)
|
45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
raise FauxPilotException(
|
47 |
message=str(E),
|
48 |
type="invalid_request_error",
|
|
|
1 |
import logging
|
2 |
import os
|
3 |
+
import torch
|
4 |
+
import json
|
5 |
+
import torch
|
6 |
+
import time
|
7 |
import uvicorn
|
8 |
from fastapi import FastAPI, Request, Response
|
9 |
from fastapi.responses import JSONResponse
|
|
|
11 |
|
12 |
from config.log_config import uvicorn_logger
|
13 |
from models import OpenAIinput
|
14 |
+
from utils.codegen import CodeGenProxy
|
15 |
from utils.errors import FauxPilotException
|
16 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
|
17 |
|
18 |
logging.config.dictConfig(uvicorn_logger)
|
19 |
|
20 |
+
# token = os.environ.get("HUB_TOKEN", None)
|
21 |
+
# device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
22 |
+
|
23 |
+
# tokenizer = AutoTokenizer.from_pretrained("bigcode/christmas-models", use_auth_token=token)
|
24 |
+
# model = AutoModelForCausalLM.from_pretrained("bigcode/christmas-models", trust_remote_code=True, use_auth_token=token).to(device)
|
25 |
+
# pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=device)
|
26 |
|
27 |
codegen = CodeGenProxy(
|
28 |
host=os.environ.get("TRITON_HOST", "triton"),
|
|
|
51 |
data = data.dict()
|
52 |
try:
|
53 |
content = codegen(data=data)
|
54 |
+
# prompt = data.get("prompt")
|
55 |
+
# choices = [pipe(prompt, do_sample=True, top_p=0.95, max_new_tokens=50)[0]['generated_text']]
|
56 |
+
# completion = {
|
57 |
+
# 'id': None, # fill in
|
58 |
+
# 'model': 'codegen',
|
59 |
+
# 'object': 'text_completion',
|
60 |
+
# 'created': int(time.time()),
|
61 |
+
# 'choices': None, # fill in
|
62 |
+
# 'usage': {
|
63 |
+
# 'completion_tokens': int(sum([len(c.split()) for c in choices])),
|
64 |
+
# 'prompt_tokens': int(len(prompt.split())),
|
65 |
+
# 'total_tokens': int(sum([len(c.split()) for c in choices]) + len(prompt.split())),
|
66 |
+
# }
|
67 |
+
# }
|
68 |
+
# completion['id'] = 10
|
69 |
+
# completion['choices'] = choices
|
70 |
+
# content = json.dumps(completion)
|
71 |
+
except Exception as E:
|
72 |
raise FauxPilotException(
|
73 |
message=str(E),
|
74 |
type="invalid_request_error",
|
utils/codegen.py
CHANGED
@@ -2,19 +2,28 @@ import json
|
|
2 |
import random
|
3 |
import string
|
4 |
import time
|
5 |
-
|
|
|
6 |
import numpy as np
|
7 |
import tritonclient.grpc as client_util
|
8 |
from tokenizers import Tokenizer
|
9 |
from tritonclient.utils import np_to_triton_dtype, InferenceServerException
|
|
|
|
|
10 |
|
11 |
np.finfo(np.dtype("float32"))
|
12 |
np.finfo(np.dtype("float64"))
|
13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
class CodeGenProxy:
|
16 |
def __init__(self, host: str = 'triton', port: int = 8001, verbose: bool = False):
|
17 |
-
self.tokenizer =
|
18 |
self.client = client_util.InferenceServerClient(url=f'{host}:{port}', verbose=verbose)
|
19 |
self.PAD_CHAR = 50256
|
20 |
|
@@ -48,7 +57,7 @@ class CodeGenProxy:
|
|
48 |
item_offsets = []
|
49 |
|
50 |
for word in word_dict_item:
|
51 |
-
ids = tokenizer.encode(word)
|
52 |
|
53 |
if len(ids) == 0:
|
54 |
continue
|
@@ -73,144 +82,20 @@ class CodeGenProxy:
|
|
73 |
return np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2))
|
74 |
|
75 |
def generate(self, data):
|
|
|
76 |
prompt = data['prompt']
|
77 |
n = data.get('n', 1)
|
78 |
model_name = data["model"]
|
79 |
-
|
80 |
-
# i could've done the conversion from uint32 to int32 in the model but that'd be inefficient.
|
81 |
-
np_type = np.int32 if model_name.startswith("py-") else np.uint32
|
82 |
-
|
83 |
-
input_start_ids = np.expand_dims(self.tokenizer.encode(prompt).ids, 0)
|
84 |
-
input_start_ids = np.repeat(input_start_ids, n, axis=0).astype(np_type)
|
85 |
-
prompt_len = input_start_ids.shape[1]
|
86 |
-
input_len = prompt_len * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
|
87 |
-
max_tokens = data.get('max_tokens', 16)
|
88 |
-
prompt_tokens: int = input_len[0][0]
|
89 |
-
requested_tokens = max_tokens + prompt_tokens
|
90 |
-
if requested_tokens > self.MAX_MODEL_LEN:
|
91 |
-
print(1)
|
92 |
-
raise self.TokensExceedsMaximum(
|
93 |
-
f"This model's maximum context length is {self.MAX_MODEL_LEN}, however you requested "
|
94 |
-
f"{requested_tokens} tokens ({prompt_tokens} in your prompt; {max_tokens} for the completion). "
|
95 |
-
f"Please reduce your prompt; or completion length."
|
96 |
-
)
|
97 |
-
output_len = np.ones_like(input_len).astype(np_type) * max_tokens
|
98 |
-
num_logprobs = data.get('logprobs', -1)
|
99 |
-
if num_logprobs is None:
|
100 |
-
num_logprobs = 1
|
101 |
-
want_logprobs = num_logprobs > 0
|
102 |
-
|
103 |
-
temperature = data.get('temperature', 0.2)
|
104 |
-
if temperature == 0.0:
|
105 |
-
temperature = 1.0
|
106 |
-
top_k = 1
|
107 |
-
else:
|
108 |
-
top_k = data.get('top_k', 0)
|
109 |
-
|
110 |
-
top_p = data.get('top_p', 1.0)
|
111 |
-
frequency_penalty = data.get('frequency_penalty', 1.0)
|
112 |
-
runtime_top_k = top_k * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
|
113 |
-
runtime_top_p = top_p * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
|
114 |
-
beam_search_diversity_rate = 0.0 * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
|
115 |
-
random_seed = np.random.randint(0, 2 ** 31 - 1, (input_start_ids.shape[0], 1), dtype=np.int32)
|
116 |
-
temperature = temperature * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
|
117 |
-
len_penalty = 1.0 * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
|
118 |
-
repetition_penalty = frequency_penalty * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
|
119 |
-
is_return_log_probs = want_logprobs * np.ones([input_start_ids.shape[0], 1]).astype(np.bool_)
|
120 |
-
beam_width = (1 * np.ones([input_start_ids.shape[0], 1])).astype(np_type)
|
121 |
-
start_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
|
122 |
-
end_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
|
123 |
-
|
124 |
-
stop_words = data.get('stop', [])
|
125 |
-
if stop_words is None:
|
126 |
-
stop_words = []
|
127 |
-
if stop_words:
|
128 |
-
stop_word_list = np.repeat(self.to_word_list_format([stop_words], self.tokenizer), input_start_ids.shape[0],
|
129 |
-
axis=0)
|
130 |
-
else:
|
131 |
-
stop_word_list = np.concatenate([np.zeros([input_start_ids.shape[0], 1, 1]).astype(
|
132 |
-
np.int32), (-1 * np.ones([input_start_ids.shape[0], 1, 1])).astype(np.int32)], axis=1)
|
133 |
-
|
134 |
-
# Not used
|
135 |
-
bad_words_list = np.concatenate([np.zeros([input_start_ids.shape[0], 1, 1]).astype(
|
136 |
-
np.int32), (-1 * np.ones([input_start_ids.shape[0], 1, 1])).astype(np.int32)], axis=1)
|
137 |
-
|
138 |
-
inputs = [
|
139 |
-
self.prepare_tensor("input_ids", input_start_ids),
|
140 |
-
self.prepare_tensor("input_lengths", input_len),
|
141 |
-
self.prepare_tensor("request_output_len", output_len),
|
142 |
-
self.prepare_tensor("runtime_top_k", runtime_top_k),
|
143 |
-
self.prepare_tensor("runtime_top_p", runtime_top_p),
|
144 |
-
self.prepare_tensor("beam_search_diversity_rate", beam_search_diversity_rate),
|
145 |
-
self.prepare_tensor("random_seed", random_seed),
|
146 |
-
self.prepare_tensor("temperature", temperature),
|
147 |
-
self.prepare_tensor("len_penalty", len_penalty),
|
148 |
-
self.prepare_tensor("repetition_penalty", repetition_penalty),
|
149 |
-
self.prepare_tensor("is_return_log_probs", is_return_log_probs),
|
150 |
-
self.prepare_tensor("beam_width", beam_width),
|
151 |
-
self.prepare_tensor("start_id", start_ids),
|
152 |
-
self.prepare_tensor("end_id", end_ids),
|
153 |
-
self.prepare_tensor("bad_words_list", bad_words_list),
|
154 |
-
self.prepare_tensor("stop_words_list", stop_word_list),
|
155 |
-
]
|
156 |
-
|
157 |
-
result = self.client.infer(model_name, inputs)
|
158 |
-
|
159 |
-
output_data = result.as_numpy("output_ids")
|
160 |
-
if output_data is None:
|
161 |
-
raise RuntimeError("No output data")
|
162 |
-
|
163 |
-
# All of these squeeze(1)s are to remove the beam width dimension.
|
164 |
-
output_data = output_data.squeeze(1)
|
165 |
-
if want_logprobs:
|
166 |
-
lp_data = result.as_numpy("output_log_probs").squeeze(1)
|
167 |
-
# clp_data = result.as_numpy("cum_log_probs").squeeze(1)
|
168 |
-
else:
|
169 |
-
lp_data = [None] * output_data.shape[0]
|
170 |
-
sequence_lengths = result.as_numpy("sequence_length").squeeze(1)
|
171 |
-
gen_len = sequence_lengths - input_len.squeeze(1)
|
172 |
-
|
173 |
-
decoded = self.tokenizer.decode_batch([out[prompt_len:prompt_len + g] for g, out in zip(gen_len, output_data)])
|
174 |
-
trimmed = [self.trim_with_stopwords(d, stop_words) for d in decoded]
|
175 |
-
|
176 |
choices = []
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
for ii, t in enumerate(tokens_str):
|
186 |
-
fakedict = {}
|
187 |
-
top_token_lp = float(lps[ii])
|
188 |
-
fakedict[t] = top_token_lp
|
189 |
-
while len(fakedict) < num_logprobs:
|
190 |
-
random_token = random.randint(0, self.tokenizer.get_vocab_size() - 1)
|
191 |
-
random_token_str = self.tokenizer.decode([random_token])
|
192 |
-
if random_token_str in fakedict:
|
193 |
-
continue
|
194 |
-
random_token_lp = top_token_lp - random.random()
|
195 |
-
fakedict[random_token_str] = random_token_lp
|
196 |
-
top_logprobs.append(fakedict)
|
197 |
-
|
198 |
-
lpdict = {
|
199 |
-
'token_logprobs': lps.tolist(),
|
200 |
-
'top_logprobs': top_logprobs,
|
201 |
-
'tokens': tokens_str,
|
202 |
-
'text_offset': offsets,
|
203 |
-
}
|
204 |
-
else:
|
205 |
-
lpdict = None
|
206 |
-
|
207 |
-
choice = {
|
208 |
-
'text': text,
|
209 |
-
'index': i,
|
210 |
-
'finish_reason': reason,
|
211 |
-
'logprobs': lpdict,
|
212 |
-
}
|
213 |
-
choices.append(choice)
|
214 |
|
215 |
completion = {
|
216 |
'id': None, # fill in
|
@@ -219,9 +104,9 @@ class CodeGenProxy:
|
|
219 |
'created': int(time.time()),
|
220 |
'choices': None, # fill in
|
221 |
'usage': {
|
222 |
-
'completion_tokens': int(
|
223 |
-
'prompt_tokens': int(
|
224 |
-
'total_tokens': int(
|
225 |
}
|
226 |
}
|
227 |
return completion, choices
|
|
|
2 |
import random
|
3 |
import string
|
4 |
import time
|
5 |
+
import os
|
6 |
+
import torch
|
7 |
import numpy as np
|
8 |
import tritonclient.grpc as client_util
|
9 |
from tokenizers import Tokenizer
|
10 |
from tritonclient.utils import np_to_triton_dtype, InferenceServerException
|
11 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
|
12 |
+
|
13 |
|
14 |
np.finfo(np.dtype("float32"))
|
15 |
np.finfo(np.dtype("float64"))
|
16 |
|
17 |
+
token = os.environ.get("HUB_TOKEN", None)
|
18 |
+
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
19 |
+
|
20 |
+
tokenizer = AutoTokenizer.from_pretrained("bigcode/christmas-models", use_auth_token=token)
|
21 |
+
model = AutoModelForCausalLM.from_pretrained("bigcode/christmas-models", trust_remote_code=True, use_auth_token=token).to(device)
|
22 |
+
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=device)
|
23 |
|
24 |
class CodeGenProxy:
|
25 |
def __init__(self, host: str = 'triton', port: int = 8001, verbose: bool = False):
|
26 |
+
self.tokenizer = AutoTokenizer.from_pretrained("bigcode/christmas-models", use_auth_token=token)
|
27 |
self.client = client_util.InferenceServerClient(url=f'{host}:{port}', verbose=verbose)
|
28 |
self.PAD_CHAR = 50256
|
29 |
|
|
|
57 |
item_offsets = []
|
58 |
|
59 |
for word in word_dict_item:
|
60 |
+
ids = tokenizer.encode(word)
|
61 |
|
62 |
if len(ids) == 0:
|
63 |
continue
|
|
|
82 |
return np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2))
|
83 |
|
84 |
def generate(self, data):
|
85 |
+
global pipe
|
86 |
prompt = data['prompt']
|
87 |
n = data.get('n', 1)
|
88 |
model_name = data["model"]
|
89 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
90 |
choices = []
|
91 |
+
text = pipe(prompt, do_sample=True, top_p=0.95, max_new_tokens=50)[0]['generated_text']
|
92 |
+
choice = {
|
93 |
+
'text': text,
|
94 |
+
'index': 0,
|
95 |
+
'finish_reason': "stop",
|
96 |
+
'logprobs': None,
|
97 |
+
}
|
98 |
+
choices.append(choice)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
99 |
|
100 |
completion = {
|
101 |
'id': None, # fill in
|
|
|
104 |
'created': int(time.time()),
|
105 |
'choices': None, # fill in
|
106 |
'usage': {
|
107 |
+
'completion_tokens': int(50),
|
108 |
+
'prompt_tokens': int(50),
|
109 |
+
'total_tokens': int(100),
|
110 |
}
|
111 |
}
|
112 |
return completion, choices
|