oleksandrfluxon committed
Commit 76a36d6
Parent: 37b0e2d

Update handler.py

Files changed (1)
handler.py  +14 -3
handler.py CHANGED
@@ -7,11 +7,22 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 class EndpointHandler:
     def __init__(self, path=""):
         # load model and tokenizer from path
-        self.tokenizer = AutoTokenizer.from_pretrained(path)
+        self.tokenizer = AutoTokenizer.from_pretrained(path)  # AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
+
+        config = transformers.AutoConfig.from_pretrained(path, trust_remote_code=True)  # requires "import transformers" alongside the existing imports
+        config.attn_config['attn_impl'] = 'triton'  # use the triton attention implementation
+        config.init_device = 'cuda:0'  # For fast initialization directly on GPU!
+        config.max_seq_len = 4096  # (input + output) tokens can now be up to 4096
+
         self.model = AutoModelForCausalLM.from_pretrained(
-            path, device_map="auto", torch_dtype=torch.float16, trust_remote_code=True
+            path,
+            config=config,  # load the model against the customized config
+            torch_dtype=torch.float16,
+            trust_remote_code=True
         )
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        # self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.device = 'cuda'
+        print('===> device', self.device)
 
     def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
         # process input
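
The new __init__ builds an MPT-style config (triton attention, initialization directly on cuda:0, a 4096-token context window) before loading the fp16 weights against it. A minimal local smoke test for the updated handler might look like the sketch below; the {"inputs": ...} payload shape and the checkout path are assumptions based on the usual Inference Endpoints custom-handler convention, not part of this commit.

# Hypothetical local smoke test (not part of the commit).
# Assumes the model repo is checked out in the current directory and a CUDA
# GPU is available, since config.init_device is hard-coded to 'cuda:0'.
from handler import EndpointHandler

handler = EndpointHandler(path=".")

# The {"inputs": ...} payload shape follows the common custom-handler
# convention; the exact keys depend on how __call__ processes `data`.
result = handler({"inputs": "Explain attention kernels in one sentence."})
print(result)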