import torch
from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer
from hqq.core.quantize import *
from hqq.utils.patching import *
from hqq.utils.generation_hf import HFGenerator
# Hub repo with the 2-bit HQQ-quantized OPT-125M model and its LoRA adapter
model_id = 'pigas/opt-125m-gptqg-2-bit-interpol-HQQ'
model = HQQModelForCausalLM.from_quantized(model_id, cache_dir='.', compute_dtype=torch.float16, adapter='adapter_v0.1.lora')
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Attach the quantization settings to every linear layer
patch_linearlayers(model, patch_add_quant_config,
                   BaseQuantizeConfig(nbits=2, group_size=64, quant_scale=False, quant_zero=False, axis=1))

model.eval();
cleanup()

# Move the model to the GPU if one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set the baseline backend, then patch in the BitBlas inference kernels
HQQLinear.set_backend(HQQBackend.PYTORCH)
prepare_for_inference(model, backend="bitblas", allow_merge=False)

# Generate with a partially compiled decoding loop
gen = HFGenerator(model, tokenizer, max_new_tokens=5, do_sample=True, compile="partial")
gen.warmup()
gen.generate("Write an essay about large language models", print_tokens=True)
Warning: failed to import the Marlin backend. Check if marlin is correctly installed if you want to use the Marlin backend (https://github.com/IST-DASLab/marlin).
Warning: failed to import the BitBlas backend. Check if BitBlas is correctly installed if you want to use the bitblas backend (https://github.com/microsoft/BitBLAS).
/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning:
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  warnings.warn(
Fetching 4 files: 100% 4/4 [00:09<00:00, 3.34s/it]
.gitattributes: 100% 1.52k/1.52k [00:00<00:00, 79.1kB/s]
config.json: 100% 1.07k/1.07k [00:00<00:00, 47.5kB/s]
quantize_config.json: 100% 268/268 [00:00<00:00, 18.7kB/s]
model.safetensors: 100% 180M/180M [00:08<00:00, 22.0MB/s]
---------------------------------------------------------------------------
Exception                                 Traceback (most recent call last)
in <cell line: 0>()
      6
      7 model_id = 'pigas/opt-125m-gptqg-2-bit-interpol-HQQ'
----> 8 model = HQQModelForCausalLM.from_quantized(model_id, cache_dir='.', compute_dtype=torch.float16, adapter='adapter_v0.1.lora')
      9 tokenizer = AutoTokenizer.from_pretrained(model_id)
     10 patch_linearlayers(model, patch_add_quant_config,

1 frames
/usr/local/lib/python3.11/dist-packages/hqq/models/base.py in try_snapshot_download(cls, save_dir_or_hub, cache_dir)
    443         # Check
    444         if not os.path.exists(cls.get_weight_file(save_dir)):
--> 445             raise Exception("Weight file missing. Check your cache directory.")
    446         if not os.path.exists(cls.get_config_file(save_dir)):
    447             raise Exception("Config file missing. Check your cache directory.")

Exception: Weight file missing. Check your cache directory.
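
Note that the download itself succeeded: the repo ships .gitattributes, config.json, quantize_config.json, and model.safetensors. The exception comes from try_snapshot_download checking for cls.get_weight_file(save_dir), i.e. HQQ's own serialized weight file (typically qmodel.pt, written by save_quantized()), which this repo does not contain. A minimal diagnostic sketch, assuming huggingface_hub is installed and that the HQQ weight file carries a .pt extension; adjust the check if your HQQ version names it differently:

```python
# Diagnostic sketch: list what the Hub repo actually ships.
# Assumption: HQQ's from_quantized() expects its own serialized weight
# file (typically qmodel.pt from save_quantized()), not safetensors.
from huggingface_hub import list_repo_files

repo_id = 'pigas/opt-125m-gptqg-2-bit-interpol-HQQ'
files = list_repo_files(repo_id)
print(files)
# Per the download log above, this repo holds:
# ['.gitattributes', 'config.json', 'model.safetensors', 'quantize_config.json']

if not any(f.endswith('.pt') for f in files):
    print(f"{repo_id} has no HQQ weight file; from_quantized() cannot load it.")
```

If that is the case, the fix is on the publishing side: the model would need to be quantized with HQQ and exported via model.save_quantized(save_dir) (and that directory pushed to the Hub), since a transformers-style save_pretrained() export only produces the safetensors layout seen here.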