import torch
from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer
from hqq.core.quantize import *
from hqq.utils.patching import *
from hqq.utils.generation_hf import HFGenerator
# Hub repo with the 2-bit HQQ-quantized OPT-125M model and its LoRA adapter
model_id = 'pigas/opt-125m-gptqg-2-bit-interpol-HQQ'
model = HQQModelForCausalLM.from_quantized(model_id, cache_dir='.', compute_dtype=torch.float16, adapter='adapter_v0.1.lora')
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Attach the quantization settings to every linear layer
patch_linearlayers(model, patch_add_quant_config,
                   BaseQuantizeConfig(nbits=2, group_size=64, quant_scale=False, quant_zero=False, axis=1))

model.eval();
cleanup()

# Move the model to the GPU if one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set the baseline backend, then patch in the BitBlas inference kernels
HQQLinear.set_backend(HQQBackend.PYTORCH)
prepare_for_inference(model, backend="bitblas", allow_merge=False)

# Generate with a partially compiled decoding loop
gen = HFGenerator(model, tokenizer, max_new_tokens=5, do_sample=True, compile="partial")
gen.warmup()
gen.generate("Write an essay about large language models", print_tokens=True)
Warning: failed to import the Marlin backend. Check if marlin is correctly installed if you want to use the Marlin backend (https://github.com/IST-DASLab/marlin).
Warning: failed to import the BitBlas backend. Check if BitBlas is correctly installed if you want to use the bitblas backend (https://github.com/microsoft/BitBLAS).
/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning:
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  warnings.warn(
Fetching 4 files: 100% 4/4 [00:09<00:00, 3.34s/it]
.gitattributes: 100% 1.52k/1.52k [00:00<00:00, 79.1kB/s]
config.json: 100% 1.07k/1.07k [00:00<00:00, 47.5kB/s]
quantize_config.json: 100% 268/268 [00:00<00:00, 18.7kB/s]
model.safetensors: 100% 180M/180M [00:08<00:00, 22.0MB/s]
---------------------------------------------------------------------------
Exception                                 Traceback (most recent call last)
in <cell line: 0>()
      6
      7 model_id = 'pigas/opt-125m-gptqg-2-bit-interpol-HQQ'
----> 8 model = HQQModelForCausalLM.from_quantized(model_id, cache_dir='.', compute_dtype=torch.float16, adapter='adapter_v0.1.lora')
      9 tokenizer = AutoTokenizer.from_pretrained(model_id)
     10 patch_linearlayers(model, patch_add_quant_config,

1 frames
/usr/local/lib/python3.11/dist-packages/hqq/models/base.py in try_snapshot_download(cls, save_dir_or_hub, cache_dir)
    443         # Check
    444         if not os.path.exists(cls.get_weight_file(save_dir)):
--> 445             raise Exception("Weight file missing. Check your cache directory.")
    446         if not os.path.exists(cls.get_config_file(save_dir)):
    447             raise Exception("Config file missing. Check your cache directory.")

Exception: Weight file missing. Check your cache directory.
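
Note that the download itself succeeded: the repo ships .gitattributes, config.json, quantize_config.json, and model.safetensors. The exception comes from try_snapshot_download checking for cls.get_weight_file(save_dir), i.e. HQQ's own serialized weight file (typically qmodel.pt, written by save_quantized()), which this repo does not contain. A minimal diagnostic sketch, assuming huggingface_hub is installed and that the HQQ weight file carries a .pt extension; adjust the check if your HQQ version names it differently:

```python
# Diagnostic sketch: list what the Hub repo actually ships.
# Assumption: HQQ's from_quantized() expects its own serialized weight
# file (typically qmodel.pt from save_quantized()), not safetensors.
from huggingface_hub import list_repo_files

repo_id = 'pigas/opt-125m-gptqg-2-bit-interpol-HQQ'
files = list_repo_files(repo_id)
print(files)
# Per the download log above, this repo holds:
# ['.gitattributes', 'config.json', 'model.safetensors', 'quantize_config.json']

if not any(f.endswith('.pt') for f in files):
    print(f"{repo_id} has no HQQ weight file; from_quantized() cannot load it.")
```

If that is the case, the fix is on the publishing side: the model would need to be quantized with HQQ and exported via model.save_quantized(save_dir) (and that directory pushed to the Hub), since a transformers-style save_pretrained() export only produces the safetensors layout seen here.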