oleksandrfluxon
committed
Commit 76a36d6
1 Parent(s): 37b0e2d
Update handler.py
handler.py +14 -3
handler.py
CHANGED
@@ -7,11 +7,22 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 class EndpointHandler:
     def __init__(self, path=""):
         # load model and tokenizer from path
-        self.tokenizer = AutoTokenizer.from_pretrained(path)
+        self.tokenizer = AutoTokenizer.from_pretrained(path)  # AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
+
+        config = transformers.AutoConfig.from_pretrained(name, trust_remote_code=True)
+        config.attn_config['attn_impl'] = 'triton'
+        config.init_device = 'cuda:0'  # For fast initialization directly on GPU!
+        config.max_seq_len = 4096  # (input + output) tokens can now be up to 4096
+
         self.model = AutoModelForCausalLM.from_pretrained(
-            path,
+            path,
+            config,
+            torch_dtype=torch.float16,
+            trust_remote_code=True
         )
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        # self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.device = 'cuda'
+        print('===> device', device)
 
     def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
         # process input