michaelfeil
committed on
Commit a35a608
Parent(s): 8fe89bd
Upload mosaicml/mpt-7b-instruct ctranslate fp16 weights

Browse files:
- README.md +61 -34
- config.json +55 -4
- model.bin +2 -2
- requirements.txt +2 -0
- vocabulary.json +0 -0
README.md
CHANGED

@@ -16,38 +16,40 @@ Speedup inference while reducing memory by 2x-4x using int8 inference in C++ on
 
 quantized version of [mosaicml/mpt-7b-instruct](https://huggingface.co/mosaicml/mpt-7b-instruct)
 ```bash
-pip install hf-hub-ctranslate2>=2.0
-```
-Converted on 2023-05-31 using
-```
-ct2-transformers-converter --model mosaicml/mpt-7b-instruct --output_dir /home/michael/tmp-ct2fast-mpt-7b-instruct --force --copy_files tokenizer.json README.md tokenizer_config.json generation_config.json special_tokens_map.json .gitattributes --quantization float16 --trust_remote_code
+pip install hf-hub-ctranslate2>=2.12.0 ctranslate2>=3.16.0
 ```
 
-Checkpoint compatible to [ctranslate2>=3.14.0](https://github.com/OpenNMT/CTranslate2) and [hf-hub-ctranslate2>=2.0.8](https://github.com/michaelfeil/hf-hub-ctranslate2)
-- `compute_type=int8_float16` for `device="cuda"`
-- `compute_type=int8` for `device="cpu"`
-
 ```python
-from hf_hub_ctranslate2 import GeneratorCT2fromHfHub
-from transformers import AutoTokenizer
-
+# from transformers import AutoTokenizer
 model_name = "michaelfeil/ct2fast-mpt-7b-instruct"
-
+
+
+from hf_hub_ctranslate2 import GeneratorCT2fromHfHub
 model = GeneratorCT2fromHfHub(
     # load in int8 on CUDA
     model_name_or_path=model_name,
     device="cuda",
     compute_type="int8_float16",
-    # tokenizer=AutoTokenizer.from_pretrained("
+    # tokenizer=AutoTokenizer.from_pretrained("{ORG}/{NAME}")
 )
 outputs = model.generate(
-    text=["
+    text=["def fibonnaci(", "User: How are you doing? Bot:"],
     max_length=64,
     include_prompt_in_result=False
 )
 print(outputs)
 ```
+
+Checkpoint compatible to [ctranslate2>=3.16.0](https://github.com/OpenNMT/CTranslate2)
+and [hf-hub-ctranslate2>=2.12.0](https://github.com/michaelfeil/hf-hub-ctranslate2)
+- `compute_type=int8_float16` for `device="cuda"`
+- `compute_type=int8` for `device="cpu"`
+
+Converted on 2023-06-27 using
+```
+ct2-transformers-converter --model mosaicml/mpt-7b-instruct --output_dir ~/tmp-ct2fast-mpt-7b-instruct --force --copy_files tokenizer.json README.md tokenizer_config.json generation_config.json special_tokens_map.json requirements.txt .gitattributes --quantization int8_float16 --trust_remote_code
+```
+
 # Licence and other remarks:
 This is just a quantized version. Licence conditions are intended to be identical to the original huggingface repo.
 
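The `compute_type` bullets in the hunk above also cover running the checkpoint purely on CPU. A minimal sketch, reusing the `GeneratorCT2fromHfHub` API shown in this diff; passing the EleutherAI/gpt-neox-20b tokenizer explicitly is an assumption based on `tokenizer_name` in config.json:

```python
# Sketch only: same checkpoint, loaded on CPU with int8 weights.
from hf_hub_ctranslate2 import GeneratorCT2fromHfHub
from transformers import AutoTokenizer

model = GeneratorCT2fromHfHub(
    model_name_or_path="michaelfeil/ct2fast-mpt-7b-instruct",
    device="cpu",
    compute_type="int8",  # per the bullet above for device="cpu"
    # assumption: the GPT-NeoX-20B tokenizer named in config.json
    tokenizer=AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b"),
)
outputs = model.generate(
    text=["User: How are you doing? Bot:"],
    max_length=64,
    include_prompt_in_result=False,
)
print(outputs)
```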
@@ -57,7 +59,7 @@ This is just a quantized version. Licence conditions are intended to be identical
 # MPT-7B-Instruct
 
 MPT-7B-Instruct is a model for short-form instruction following.
-It is built by finetuning [MPT-7B](https://huggingface.co/
+It is built by finetuning [MPT-7B](https://huggingface.co/mosaicml/mpt-7b) on a [dataset](https://huggingface.co/datasets/sam-mosaic/dolly_hhrlhf) derived from the [Databricks Dolly-15k](https://huggingface.co/datasets/databricks/databricks-dolly-15k) and the [Anthropic Helpful and Harmless (HH-RLHF)](https://huggingface.co/datasets/Anthropic/hh-rlhf) datasets.
 * License: _CC-By-SA-3.0_
 * [Demo on Hugging Face Spaces](https://huggingface.co/spaces/mosaicml/mpt-7b-instruct)
 

@@ -100,37 +102,41 @@ model = transformers.AutoModelForCausalLM.from_pretrained(
   trust_remote_code=True
 )
 ```
 Note: This model requires that `trust_remote_code=True` be passed to the `from_pretrained` method.
 This is because we use a custom `MPT` model architecture that is not yet part of the Hugging Face `transformers` package.
 `MPT` includes options for many training efficiency features such as [FlashAttention](https://arxiv.org/pdf/2205.14135.pdf), [ALiBi](https://arxiv.org/abs/2108.12409), [QK LayerNorm](https://arxiv.org/abs/2010.04245), and more.
 
-To use the optimized [triton implementation](https://github.com/openai/triton) of FlashAttention, you can load the model with `attn_impl='triton'` and
+To use the optimized [triton implementation](https://github.com/openai/triton) of FlashAttention, you can load the model on GPU (`cuda:0`) with `attn_impl='triton'` and with `bfloat16` precision:
 ```python
-
-
-
-
+import torch
+import transformers
+
+name = 'mosaicml/mpt-7b-instruct'
+
+config = transformers.AutoConfig.from_pretrained(name, trust_remote_code=True)
 config.attn_config['attn_impl'] = 'triton'
+config.init_device = 'cuda:0' # For fast initialization directly on GPU!
 
 model = transformers.AutoModelForCausalLM.from_pretrained(
-
+  name,
   config=config,
-  torch_dtype=torch.bfloat16,
+  torch_dtype=torch.bfloat16, # Load model weights in bfloat16
   trust_remote_code=True
 )
-model.to(device='cuda:0')
 ```
 
 Although the model was trained with a sequence length of 2048, ALiBi enables users to increase the maximum sequence length during finetuning and/or inference. For example:
 
 ```python
-
-
-
-
-config.
+import transformers
+
+name = 'mosaicml/mpt-7b-instruct'
+
+config = transformers.AutoConfig.from_pretrained(name, trust_remote_code=True)
+config.max_seq_len = 4096 # (input + output) tokens can now be up to 4096
+
 model = transformers.AutoModelForCausalLM.from_pretrained(
-
+  name,
   config=config,
   trust_remote_code=True
 )

@@ -143,6 +149,22 @@ from transformers import AutoTokenizer
 tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
 ```
 
+The model can then be used, for example, within a text-generation pipeline.
+Note: when running Torch modules in lower precision, it is best practice to use the [torch.autocast context manager](https://pytorch.org/docs/stable/amp.html).
+
+```python
+from transformers import pipeline
+
+pipe = pipeline('text-generation', model=model, tokenizer=tokenizer, device='cuda:0')
+
+with torch.autocast('cuda', dtype=torch.bfloat16):
+    print(
+        pipe('Here is a recipe for vegan banana bread:\n',
+             max_new_tokens=100,
+             do_sample=True,
+             use_cache=True))
+```
+
 ### Formatting
 
 This model was trained on data formatted in the dolly-15k format:
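The dolly-15k template referenced at the end of the hunk above sits in an unchanged part of the README and is therefore not shown in this diff. As an illustration only, a typical dolly/Alpaca-style prompt wrapper looks roughly like the following; the exact strings are an assumption, not taken from the file:

```python
# Illustration only: approximate dolly-15k-style prompt formatting.
# The exact template used by MPT-7B-Instruct is in the unchanged README section.
INTRO = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request."
)
INSTRUCTION_KEY = "### Instruction:"
RESPONSE_KEY = "### Response:"

def format_prompt(instruction: str) -> str:
    """Wrap a raw instruction in the dolly-style template."""
    return f"{INTRO}\n{INSTRUCTION_KEY}\n{instruction}\n{RESPONSE_KEY}\n"

print(format_prompt("Explain what ALiBi does in one sentence."))
```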
@@ -193,6 +215,11 @@ For more details on the pretraining process, see [MPT-7B](https://huggingface.co
 
 The data was tokenized using the [EleutherAI/gpt-neox-20b](https://huggingface.co/EleutherAI/gpt-neox-20b) tokenizer.
 
+### Training Configuration
+
+This model was trained on 8 A100-40GBs for about 2.3 hours using the [MosaicML Platform](https://www.mosaicml.com/platform).
+The model was trained with sharded data parallelism using [FSDP](https://pytorch.org/docs/stable/fsdp.html) and used the AdamW optimizer.
+
 ## Limitations and Biases
 
 _The following language is modified from [EleutherAI's GPT-NeoX-20B](https://huggingface.co/EleutherAI/gpt-neox-20b)_
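The training-configuration lines added in the hunk above name FSDP sharded data parallelism and the AdamW optimizer. A generic PyTorch sketch of that combination, not MosaicML's actual training code; the toy model, learning rate, and `torchrun` launch are assumptions:

```python
# Generic FSDP + AdamW sketch (illustrative; not the MosaicML training setup).
# Launch with: torchrun --nproc_per_node=<num_gpus> train_sketch.py
import torch
import torch.distributed as dist
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

def main():
    dist.init_process_group("nccl")  # torchrun supplies rank and world size
    device = torch.device("cuda", dist.get_rank() % torch.cuda.device_count())
    torch.cuda.set_device(device)

    model = torch.nn.Linear(1024, 1024).to(device)  # toy stand-in for the LLM
    model = FSDP(model)  # shards parameters, gradients, and optimizer state
    optim = torch.optim.AdamW(model.parameters(), lr=1e-5)

    for _ in range(10):  # toy training loop on synthetic data
        x = torch.randn(8, 1024, device=device)
        loss = model(x).pow(2).mean()
        loss.backward()
        optim.step()
        optim.zero_grad()

    dist.destroy_process_group()

if __name__ == "__main__":
    main()
```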
@@ -227,4 +254,4 @@ Please cite this model using the following format:
   note = {Accessed: 2023-03-28}, % change this date
   urldate = {2023-03-28} % change this date
 }
-```
+```
config.json
CHANGED

@@ -1,5 +1,56 @@
 {
-
-
-
-
+  "architectures": [
+    "MPTForCausalLM"
+  ],
+  "attn_config": {
+    "alibi": true,
+    "alibi_bias_max": 8,
+    "attn_impl": "torch",
+    "attn_pdrop": 0,
+    "attn_type": "multihead_attention",
+    "attn_uses_sequence_id": false,
+    "clip_qkv": null,
+    "prefix_lm": false,
+    "qk_ln": false,
+    "softmax_scale": null
+  },
+  "auto_map": {
+    "AutoConfig": "configuration_mpt.MPTConfig",
+    "AutoModelForCausalLM": "modeling_mpt.MPTForCausalLM"
+  },
+  "d_model": 4096,
+  "emb_pdrop": 0,
+  "embedding_fraction": 1.0,
+  "expansion_ratio": 4,
+  "init_config": {
+    "emb_init_std": null,
+    "emb_init_uniform_lim": null,
+    "fan_mode": "fan_in",
+    "init_div_is_residual": true,
+    "init_gain": 0,
+    "init_nonlinearity": "relu",
+    "init_std": 0.02,
+    "name": "kaiming_normal_",
+    "verbose": 0
+  },
+  "init_device": "cpu",
+  "learned_pos_emb": true,
+  "logit_scale": null,
+  "max_seq_len": 2048,
+  "model_type": "mpt",
+  "n_heads": 32,
+  "n_layers": 32,
+  "no_bias": true,
+  "norm_type": "low_precision_layernorm",
+  "resid_pdrop": 0,
+  "tokenizer_name": "EleutherAI/gpt-neox-20b",
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.28.1",
+  "use_cache": false,
+  "verbose": 0,
+  "vocab_size": 50432,
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
+  "layer_norm_epsilon": null,
+  "unk_token": "<|endoftext|>"
+}
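For reference, the config added above can be pulled and inspected directly. The sketch below uses the standard `huggingface_hub` download API; the repo id and printed values come from this commit:

```python
# Sketch: download and inspect the config.json added in this commit.
import json
from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="michaelfeil/ct2fast-mpt-7b-instruct",
    filename="config.json",
)
with open(path) as f:
    cfg = json.load(f)

print(cfg["model_type"], cfg["d_model"], cfg["n_heads"], cfg["n_layers"])  # mpt 4096 32 32
print(cfg["max_seq_len"], cfg["vocab_size"])                               # 2048 50432
print(cfg["bos_token"], cfg["eos_token"])                                  # <|endoftext|> <|endoftext|>
```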
model.bin
CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:1adb227bbf42f844b27c853a902aa384a770b246c764ce45b4ac836f9cdc9884
+size 6654505904
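model.bin is stored via Git LFS, so the lines above are a pointer recording the blob's sha256 and byte size rather than the weights themselves. A small sketch for verifying a downloaded copy; the local file path is an assumption:

```python
# Sketch: check a downloaded model.bin against the LFS pointer in this commit.
import hashlib

EXPECTED_SHA256 = "1adb227bbf42f844b27c853a902aa384a770b246c764ce45b4ac836f9cdc9884"
EXPECTED_SIZE = 6654505904  # bytes, from the pointer above

sha = hashlib.sha256()
size = 0
with open("model.bin", "rb") as f:  # assumed local path
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha.update(chunk)
        size += len(chunk)

assert size == EXPECTED_SIZE, f"unexpected size: {size}"
assert sha.hexdigest() == EXPECTED_SHA256, "sha256 mismatch"
print("model.bin matches the LFS pointer")
```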
requirements.txt
ADDED

@@ -0,0 +1,2 @@
+einops==0.5.0
+triton-pre-mlir@git+https://github.com/vchiley/triton.git@triton_pre_mlir_sm90#subdirectory=python
vocabulary.json
ADDED
The diff for this file is too large to render.
See raw diff