oleksandrfluxon
committed on
Commit
•
b4751ce
1
Parent(s):
4c526ae
Update handler.py
Browse files
- handler.py +2 -3
handler.py
CHANGED
@@ -5,13 +5,12 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
|
|
5 |
|
6 |
|
7 |
class EndpointHandler:
|
8 |
-
def __init__(self, path=""
|
9 |
-
print('===> __init__', path, unused)
|
10 |
# load model and tokenizer from path
|
11 |
self.tokenizer = AutoTokenizer.from_pretrained(path) # AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
|
12 |
|
13 |
config = AutoConfig.from_pretrained(path, trust_remote_code=True)
|
14 |
-
config.attn_config['attn_impl'] = 'triton'
|
15 |
config.init_device = 'cuda:0' # For fast initialization directly on GPU!
|
16 |
config.max_seq_len = 4096 # (input + output) tokens can now be up to 4096
|
17 |
|
|
|
5 |
|
6 |
|
7 |
class EndpointHandler:
|
8 |
+
def __init__(self, path=""):
|
|
|
9 |
# load model and tokenizer from path
|
10 |
self.tokenizer = AutoTokenizer.from_pretrained(path) # AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
|
11 |
|
12 |
config = AutoConfig.from_pretrained(path, trust_remote_code=True)
|
13 |
+
# config.attn_config['attn_impl'] = 'triton'
|
14 |
config.init_device = 'cuda:0' # For fast initialization directly on GPU!
|
15 |
config.max_seq_len = 4096 # (input + output) tokens can now be up to 4096
|
16 |
|