oleksandrfluxon
committed
Commit 76a36d6
1 Parent(s): 37b0e2d
Update handler.py
handler.py +14 -3
handler.py
CHANGED
@@ -7,11 +7,22 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 class EndpointHandler:
     def __init__(self, path=""):
         # load model and tokenizer from path
-        self.tokenizer = AutoTokenizer.from_pretrained(path)
+        self.tokenizer = AutoTokenizer.from_pretrained(path)  # AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
+
+        config = transformers.AutoConfig.from_pretrained(name, trust_remote_code=True)
+        config.attn_config['attn_impl'] = 'triton'
+        config.init_device = 'cuda:0'  # For fast initialization directly on GPU!
+        config.max_seq_len = 4096  # (input + output) tokens can now be up to 4096
+
         self.model = AutoModelForCausalLM.from_pretrained(
-            path,
+            path,
+            config,
+            torch_dtype=torch.float16,
+            trust_remote_code=True
         )
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        # self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.device = 'cuda'
+        print('===> device', device)
 
     def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
         # process input