oleksandrfluxon committed
Commit 73a8df3 · 1 Parent(s): 2497126

Test max_memory for GPU

Files changed (1): pipeline.py +24 -3
pipeline.py CHANGED
@@ -1,5 +1,7 @@
 import torch
 import transformers
+from accelerate import dispatch_model, infer_auto_device_map
+from accelerate.utils import get_balanced_memory
 from typing import Dict, List, Any
 
 class PreTrainedPipeline():
@@ -21,17 +23,36 @@ class PreTrainedPipeline():
             path,
             config=config,
             # torch_dtype=torch.bfloat16, # Load model weights in bfloat16
+            torch_dtype=torch.float16,
             trust_remote_code=True
             # load_in_4bit=True, # Load model in the lowest 4-bit precision quantization
         )
-        model.to('cuda')
+        # model.to('cuda')
         print("===> model loaded")
 
         # removed device_map="auto"
         tokenizer = transformers.AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b', padding_side="left")
+
+
+        max_memory = get_balanced_memory(
+            model,
+            max_memory=None,
+            no_split_module_classes=["MPTBlock"],
+            dtype='float16',
+            low_zero=False,
+        )
+
+        device_map = infer_auto_device_map(
+            model,
+            max_memory=max_memory,
+            no_split_module_classes=["MPTBlock"],
+            dtype='float16'
+        )
+        model = dispatch_model(model, device_map=device_map)
 
-
-        self.pipeline = transformers.pipeline('text-generation', model=model, tokenizer=tokenizer, device='cuda:0')
+
+        # device='cuda:0'
+        self.pipeline = transformers.pipeline('text-generation', model=model, tokenizer=tokenizer)
         print("===> init finished")
 
     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
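In short, the commit stops force-moving the whole model onto one GPU (model.to('cuda') and device='cuda:0') and instead lets accelerate balance the fp16 weights across all visible GPUs. A minimal standalone sketch of that pattern follows, assuming a multi-GPU host; the MPTBlock class name, the gpt-neox-20b tokenizer, and the float16 dtype come from the diff, while the model id and prompt are illustrative stand-ins for the repo's actual path.

    import torch
    import transformers
    from accelerate import dispatch_model, infer_auto_device_map
    from accelerate.utils import get_balanced_memory

    # Load weights in fp16 on CPU first; "mosaicml/mpt-7b" is an assumed
    # stand-in for the repo's `path`, which the diff does not show.
    model = transformers.AutoModelForCausalLM.from_pretrained(
        "mosaicml/mpt-7b",
        torch_dtype=torch.float16,
        trust_remote_code=True,  # MPT checkpoints ship custom modeling code
    )

    # Split the per-GPU memory budget evenly, never slicing through an
    # MPTBlock, then map each submodule to a device and move the weights.
    max_memory = get_balanced_memory(
        model, no_split_module_classes=["MPTBlock"], dtype="float16"
    )
    device_map = infer_auto_device_map(
        model,
        max_memory=max_memory,
        no_split_module_classes=["MPTBlock"],
        dtype="float16",
    )
    model = dispatch_model(model, device_map=device_map)

    # No `device` argument here: accelerate's hooks already route each
    # layer's inputs to the GPU holding that layer's weights.
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        "EleutherAI/gpt-neox-20b", padding_side="left"
    )
    pipe = transformers.pipeline("text-generation", model=model, tokenizer=tokenizer)
    print(pipe("Hello, my name is", max_new_tokens=20)[0]["generated_text"])

Once dispatch_model has placed the weights, the transformers.pipeline call must not be given a device argument, since the model is already spread over several devices; that is presumably why the commit comments out device='cuda:0'. Passing low_zero=True to get_balanced_memory would instead reserve headroom on GPU 0, which can help when generation buffers and activations land there.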