# coding=utf-8
# Copyright 2024 The ggml.ai team and The HuggingFace Inc. team. and pygguf author (github.com/99991)
# https://github.com/99991/pygguf
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Integration with GGML / The file is copied and adapted from https://github.com/99991/pygguf
with extra methods beings exposed
"""
from array import array
import numpy as np
from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers
from tokenizers.models import BPE
from .. import AddedToken
from ..convert_slow_tokenizer import LlamaConverter, Qwen2Converter
from ..utils import logging
from ..utils.logging import tqdm
logger = logging.get_logger(__name__)
# Listed here: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md
GGML_TYPES = {
"F32": 0,
"F16": 1,
"Q4_0": 2,
"Q8_0": 8,
"Q2_K": 10,
"Q3_K": 11,
"Q4_K": 12,
"Q5_K": 13,
"Q6_K": 14,
}
# The block sizes are reported in bytes
# Check out: https://github.com/ggerganov/llama.cpp/blob/8a56075b07a8b571bf95a912ffdce4c928c2b414/gguf-py/gguf/constants.py#L801
GGML_BLOCK_SIZES = {
"Q8_0": 2 + 32, # Q8_0 uses a blocksize of 32 (int8 tensors) + 2 bytes allocated for the scales
"Q4_K": 144,
    # Q4_0 uses a block size of 32, with two 4-bit quants packed per byte (16 bytes) + 2 bytes for the float16 scale
    "Q4_0": 2 + 16,
"Q6_K": 210,
# See: https://github.com/99991/pygguf/commit/a417edbfc029a1bc270f984a694f9128c5afa8b9
"Q2_K": 256 // 16 + 256 // 4 + 2 + 2,
"Q3_K": 256 // 8 + 256 // 4 + 12 + 2,
"Q5_K": 2 + 2 + 12 + 256 // 8 + 256 // 2,
}
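# As a sanity check, the 144 bytes of a Q4_K block decompose as
# 2 (d, float16) + 2 (dmin, float16) + 12 (packed 6-bit scales) + 256 // 2 (4-bit quants),
# and the 210 bytes of a Q6_K block as
# 256 // 2 (low 4 bits) + 256 // 4 (high 2 bits) + 256 // 16 (int8 scales) + 2 (d, float16).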
# Listed here: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md
DATA_TYPES = {
"uint32": 4,
"int32": 5,
"float32": 6,
"bool": 7,
"string": 8,
"array": 9,
"uint64": 10,
}
GGUF_TENSOR_MAPPING = {
"llama": {
"token_embd": "model.embed_tokens",
"blk": "model.layers",
"ffn_up": "mlp.up_proj",
"ffn_down": "mlp.down_proj",
"ffn_gate": "mlp.gate_proj",
"ffn_norm": "post_attention_layernorm",
"attn_norm": "input_layernorm",
"attn_q": "self_attn.q_proj",
"attn_v": "self_attn.v_proj",
"attn_k": "self_attn.k_proj",
"attn_output": "self_attn.o_proj",
"output.weight": "lm_head.weight",
"output_norm": "model.norm",
},
"mistral": {
"token_embd": "model.embed_tokens",
"blk": "model.layers",
"ffn_up": "mlp.up_proj",
"ffn_down": "mlp.down_proj",
"ffn_gate": "mlp.gate_proj",
"ffn_norm": "post_attention_layernorm",
"attn_norm": "input_layernorm",
"attn_q": "self_attn.q_proj",
"attn_v": "self_attn.v_proj",
"attn_k": "self_attn.k_proj",
"attn_output": "self_attn.o_proj",
"output.weight": "lm_head.weight",
"output_norm": "model.norm",
},
"qwen2": {
"token_embd": "model.embed_tokens",
"blk": "model.layers",
"ffn_up": "mlp.up_proj",
"ffn_down": "mlp.down_proj",
"ffn_gate": "mlp.gate_proj",
"ffn_norm": "post_attention_layernorm",
"attn_norm": "input_layernorm",
"attn_q": "self_attn.q_proj",
"attn_v": "self_attn.v_proj",
"attn_k": "self_attn.k_proj",
"attn_output": "self_attn.o_proj",
"output.weight": "lm_head.weight",
"output_norm": "model.norm",
},
}
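# For example, under the "llama" mapping the GGUF tensor name "blk.0.attn_q.weight"
# is renamed to "model.layers.0.self_attn.q_proj.weight".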
GGUF_CONFIG_MAPPING = {
"general": {
"architecture": "model_type",
"name": "_model_name_or_path",
},
"llama": {
"context_length": "max_position_embeddings",
"block_count": "num_hidden_layers",
"feed_forward_length": "intermediate_size",
"embedding_length": "hidden_size",
"rope.dimension_count": None,
"rope.freq_base": "rope_theta",
"attention.head_count": "num_attention_heads",
"attention.head_count_kv": "num_key_value_heads",
"attention.layer_norm_rms_epsilon": "rms_norm_eps",
"vocab_size": "vocab_size",
},
"mistral": {
"context_length": "max_position_embeddings",
"block_count": "num_hidden_layers",
"feed_forward_length": "intermediate_size",
"embedding_length": "hidden_size",
"rope.dimension_count": None,
"rope.freq_base": "rope_theta",
"attention.head_count": "num_attention_heads",
"attention.head_count_kv": "num_key_value_heads",
"attention.layer_norm_rms_epsilon": "rms_norm_eps",
"vocab_size": "vocab_size",
},
"qwen2": {
"context_length": "max_position_embeddings",
"block_count": "num_hidden_layers",
"feed_forward_length": "intermediate_size",
"embedding_length": "hidden_size",
"rope.dimension_count": None,
"rope.freq_base": "rope_theta",
"attention.head_count": "num_attention_heads",
"attention.head_count_kv": "num_key_value_heads",
"attention.layer_norm_rms_epsilon": "rms_norm_eps",
"vocab_size": "vocab_size",
},
"tokenizer": {
"ggml.bos_token_id": "bos_token_id",
"ggml.eos_token_id": "eos_token_id",
"ggml.unknown_token_id": "unk_token_id",
"ggml.padding_token_id": "pad_token_id",
},
}
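# For example, the GGUF field "llama.context_length" populates
# `config.max_position_embeddings`; a `None` target (such as "rope.dimension_count")
# marks a field without a transformers config equivalent.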
GGUF_TOKENIZER_MAPPING = {
"tokenizer": {
"ggml.model": "tokenizer_type",
"ggml.tokens": "tokens",
"ggml.scores": "scores",
"ggml.token_type": "token_type",
"ggml.merges": "merges",
"ggml.bos_token_id": "bos_token_id",
"ggml.eos_token_id": "eos_token_id",
"ggml.unknown_token_id": "unk_token_id",
"ggml.padding_token_id": "pad_token_id",
"ggml.add_space_prefix": "add_prefix_space",
},
"tokenizer_config": {
"chat_template": "chat_template",
"ggml.model": "model_type",
"ggml.bos_token_id": "bos_token_id",
"ggml.eos_token_id": "eos_token_id",
"ggml.unknown_token_id": "unk_token_id",
"ggml.padding_token_id": "pad_token_id",
},
}
def _gguf_parse_value(_value, data_type):
if not isinstance(data_type, list):
data_type = [data_type]
if len(data_type) == 1:
data_type = data_type[0]
array_data_type = None
else:
if data_type[0] != 9:
raise ValueError("Received multiple types, therefore expected the first type to indicate an array.")
data_type, array_data_type = data_type
if data_type in [0, 1, 2, 3, 4, 5, 10, 11]:
_value = int(_value[0])
elif data_type in [6, 12]:
_value = float(_value[0])
elif data_type in [7]:
_value = bool(_value[0])
elif data_type in [8]:
_value = array("B", list(_value)).tobytes().decode()
elif data_type in [9]:
_value = _gguf_parse_value(_value, array_data_type)
return _value
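# For illustration, with the DATA_TYPES codes above:
#   _gguf_parse_value([3], 4) -> 3            (uint32)
#   _gguf_parse_value([1.0], 6) -> 1.0        (float32)
#   _gguf_parse_value([104, 105], 8) -> "hi"  (strings arrive as byte values)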
def dequantize_q4_k(data, n_bytes: int):
# C implementation
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L1929
# C struct definition
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L116
block_size = GGML_BLOCK_SIZES["Q4_K"]
num_blocks = n_bytes // block_size
data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, block_size // 2)
data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size)
# Casting to float32 because float16 is very slow on CPU
scale_factors = data_f16[:, 0].reshape(num_blocks, 1, 1).astype(np.float32)
scale_offsets = data_f16[:, 1].reshape(num_blocks, 1, 1).astype(np.float32)
qs1 = data_u8[:, 4:16].reshape(num_blocks, 12, 1)
qs2 = data_u8[:, 16:].reshape(num_blocks, 4, 32)
# Dequantize scales and offsets (6 bits and 4 + 2 bits)
factors = scale_factors * np.concatenate(
[qs1[:, 0:4] & 0b111111, (qs1[:, 8:] & 15) | ((qs1[:, 0:4] >> 6) << 4)], axis=1
)
offsets = scale_offsets * np.concatenate(
[qs1[:, 4:8] & 0b111111, (qs1[:, 8:] >> 4) | ((qs1[:, 4:8] >> 6) << 4)], axis=1
)
# Interleave low and high quantized bits
qs2 = np.stack([qs2 & 0xF, qs2 >> 4], axis=2).reshape(num_blocks, 8, 32)
# Dequantize final weights using scales and offsets
return factors * qs2 - offsets
def dequantize_q4_0(data, n_bytes: int):
# C implementation
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L1086
# C struct definition
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L11
block_size = GGML_BLOCK_SIZES["Q4_0"]
num_blocks = n_bytes // block_size
data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, block_size // 2)
data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size)
    # The float16 scale is stored in the first 2 bytes of each block
scales = data_f16[:, 0].reshape(num_blocks, 1).astype(np.float32)
# scales = np.nan_to_num(scales)
    # The remaining bytes correspond to the quants - we discard the first two bytes
quants = data_u8[:, 2:]
ql = (quants[:, :] & 0xF).astype(np.int8) - 8
qr = (quants[:, :] >> 4).astype(np.int8) - 8
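    # For example, a quant byte 0x21 in a block with scale 1.0 yields the
    # low-nibble value 0x1 - 8 = -7 and the high-nibble value 0x2 - 8 = -6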
    # Concatenate halves: the 16 low-nibble values come first, then the 16 high-nibble values
quants = np.hstack([ql, qr])
return (scales * quants).astype(np.float32)
def dequantize_q6_k(data, n_bytes: int):
# C implementation
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L2275
# C struct definition
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L152
block_size = GGML_BLOCK_SIZES["Q6_K"]
num_blocks = n_bytes // block_size
data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, block_size // 2)
data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size)
data_i8 = np.frombuffer(data, dtype=np.int8).reshape(num_blocks, block_size)
scales = data_f16[:, -1].reshape(num_blocks, 1).astype(np.float32)
# TODO use uint8 and cast later?
ql = data_u8[:, :128].astype(np.int16)
qh = data_u8[:, 128:192].astype(np.int16)
sc = data_i8[:, 192:208, np.newaxis].astype(np.float32)
    # Unpack bits; subtraction requires a signed dtype. Note: operator precedence
    # makes `- 32` bind to the shifted high-nibble term before the `|`, which still
    # equals `(low | (high << 4)) - 32` since the low four bits of
    # `((high << 4) - 32)` are always zero.
q1 = (ql[:, :32] & 0xF) | (((qh[:, :32] >> 0) & 3) << 4) - 32
q2 = (ql[:, 32:64] & 0xF) | (((qh[:, :32] >> 2) & 3) << 4) - 32
q3 = (ql[:, :32] >> 4) | (((qh[:, :32] >> 4) & 3) << 4) - 32
q4 = (ql[:, 32:64] >> 4) | (((qh[:, :32] >> 6) & 3) << 4) - 32
q5 = (ql[:, 64:96] & 0xF) | (((qh[:, 32:] >> 0) & 3) << 4) - 32
q6 = (ql[:, 96:128] & 0xF) | (((qh[:, 32:] >> 2) & 3) << 4) - 32
q7 = (ql[:, 64:96] >> 4) | (((qh[:, 32:] >> 4) & 3) << 4) - 32
q8 = (ql[:, 96:128] >> 4) | (((qh[:, 32:] >> 6) & 3) << 4) - 32
# Dequantize
return scales * np.concatenate(
[
sc[:, 0] * q1[:, :16],
sc[:, 1] * q1[:, 16:],
sc[:, 2] * q2[:, :16],
sc[:, 3] * q2[:, 16:],
sc[:, 4] * q3[:, :16],
sc[:, 5] * q3[:, 16:],
sc[:, 6] * q4[:, :16],
sc[:, 7] * q4[:, 16:],
sc[:, 8] * q5[:, :16],
sc[:, 9] * q5[:, 16:],
sc[:, 10] * q6[:, :16],
sc[:, 11] * q6[:, 16:],
sc[:, 12] * q7[:, :16],
sc[:, 13] * q7[:, 16:],
sc[:, 14] * q8[:, :16],
sc[:, 15] * q8[:, 16:],
],
axis=1,
)
def dequantize_q8_0(data, n_bytes: int):
# C struct definition
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L43
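    # A minimal round-trip sketch (illustrative only): one block is a float16
    # scale followed by 32 int8 quants, e.g.
    #   block = np.float16(0.5).tobytes() + np.arange(-16, 16, dtype=np.int8).tobytes()
    #   dequantize_q8_0(block, len(block))  # == 0.5 * quants, shape (1, 32)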
block_size = GGML_BLOCK_SIZES["Q8_0"]
num_blocks = n_bytes // block_size
scales = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, 1 + 16)[:, :1].astype(np.float32)
qs = np.frombuffer(data, dtype=np.int8).reshape(num_blocks, 2 + 32)[:, 2:]
return scales * qs
def dequantize_q2_k(data, n_bytes: int):
# C implementation
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L1547
# C struct definition
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L74
num_blocks = n_bytes // GGML_BLOCK_SIZES["Q2_K"]
data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, GGML_BLOCK_SIZES["Q2_K"] // 2)
data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, GGML_BLOCK_SIZES["Q2_K"])
dmin = data_f16[:, -1].reshape(num_blocks, 1, 1).astype(np.float32)
d = data_f16[:, -2].reshape(num_blocks, 1, 1).astype(np.float32)
scales = data_u8[:, :16].reshape(num_blocks, 16, 1)
qs = data_u8[:, 16:80].reshape(num_blocks, 64)
tmp = np.stack(
[
qs[:, 00:16] >> 0,
qs[:, 16:32] >> 0,
qs[:, 00:16] >> 2,
qs[:, 16:32] >> 2,
qs[:, 00:16] >> 4,
qs[:, 16:32] >> 4,
qs[:, 00:16] >> 6,
qs[:, 16:32] >> 6,
qs[:, 32:48] >> 0,
qs[:, 48:64] >> 0,
qs[:, 32:48] >> 2,
qs[:, 48:64] >> 2,
qs[:, 32:48] >> 4,
qs[:, 48:64] >> 4,
qs[:, 32:48] >> 6,
qs[:, 48:64] >> 6,
],
axis=1,
)
return d * (scales & 15) * (tmp & 3) - dmin * (scales >> 4)
def dequantize_q3_k(data, n_bytes: int):
# C implementation
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L1723C32-L1723C42
# C struct definition
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L95
num_blocks = n_bytes // GGML_BLOCK_SIZES["Q3_K"]
data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, GGML_BLOCK_SIZES["Q3_K"] // 2)
data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, GGML_BLOCK_SIZES["Q3_K"])
d = data_f16[:, -1].reshape(num_blocks, 1, 1).astype(np.float32)
bits = np.unpackbits(data_u8[:, :32].reshape(num_blocks, 32, 1), axis=-1, bitorder="little")
bits = 4 ^ (bits << 2)
qs = data_u8[:, 32 : 32 + 64].astype(np.int16)
a, b, c = data_u8[:, 96 : 96 + 12].reshape(num_blocks, 3, 4).transpose(1, 0, 2)
scales = np.zeros((num_blocks, 4, 4), dtype=np.uint8)
scales[:, 0] = (a & 15) | ((c & 3) << 4)
scales[:, 1] = (b & 15) | (((c >> 2) & 3) << 4)
scales[:, 2] = (a >> 4) | (((c >> 4) & 3) << 4)
scales[:, 3] = (b >> 4) | ((c >> 6) << 4)
scales = scales.reshape(num_blocks, 16, 1).astype(np.int16)
return (
d
* (scales - 32)
* np.stack(
[
(((qs[:, 00:16] >> 0) & 3) - bits[:, :16, 0]),
(((qs[:, 16:32] >> 0) & 3) - bits[:, 16:, 0]),
(((qs[:, 00:16] >> 2) & 3) - bits[:, :16, 1]),
(((qs[:, 16:32] >> 2) & 3) - bits[:, 16:, 1]),
(((qs[:, 00:16] >> 4) & 3) - bits[:, :16, 2]),
(((qs[:, 16:32] >> 4) & 3) - bits[:, 16:, 2]),
(((qs[:, 00:16] >> 6) & 3) - bits[:, :16, 3]),
(((qs[:, 16:32] >> 6) & 3) - bits[:, 16:, 3]),
(((qs[:, 32:48] >> 0) & 3) - bits[:, :16, 4]),
(((qs[:, 48:64] >> 0) & 3) - bits[:, 16:, 4]),
(((qs[:, 32:48] >> 2) & 3) - bits[:, :16, 5]),
(((qs[:, 48:64] >> 2) & 3) - bits[:, 16:, 5]),
(((qs[:, 32:48] >> 4) & 3) - bits[:, :16, 6]),
(((qs[:, 48:64] >> 4) & 3) - bits[:, 16:, 6]),
(((qs[:, 32:48] >> 6) & 3) - bits[:, :16, 7]),
(((qs[:, 48:64] >> 6) & 3) - bits[:, 16:, 7]),
],
axis=1,
)
)
def dequantize_q5_k(data, n_bytes: int):
# C implementation
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L2129
# C struct definition
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L138
num_blocks = n_bytes // GGML_BLOCK_SIZES["Q5_K"]
data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, GGML_BLOCK_SIZES["Q5_K"] // 2)
data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, GGML_BLOCK_SIZES["Q5_K"])
d = data_f16[:, 0].reshape(num_blocks, 1).astype(np.float32)
dmin = data_f16[:, 1].reshape(num_blocks, 1).astype(np.float32)
scales = data_u8[:, 4:16].reshape(num_blocks, 12, 1)
qh = data_u8[:, 16 : 16 + 32].reshape(num_blocks, 32, 1)
qs = data_u8[:, 48 : 48 + 128].reshape(num_blocks, 4, 32)
bits = np.unpackbits(qh, axis=-1, bitorder="little")
qs_hi_4 = qs >> 4
qs_lo_4 = qs & 15
scales_lo_6 = scales[:, :8] & 63
scales_hi_6 = scales[:, :8] >> 6
scales_lo_4 = scales[:, 8:] & 15
scales_hi_4 = scales[:, 8:] >> 4
m1 = dmin * scales_lo_6[:, 4]
m2 = dmin * scales_lo_6[:, 5]
m3 = dmin * scales_lo_6[:, 6]
m4 = dmin * scales_lo_6[:, 7]
m5 = dmin * (scales_hi_4[:, 0] | (scales_hi_6[:, 4] << 4))
m6 = dmin * (scales_hi_4[:, 1] | (scales_hi_6[:, 5] << 4))
m7 = dmin * (scales_hi_4[:, 2] | (scales_hi_6[:, 6] << 4))
m8 = dmin * (scales_hi_4[:, 3] | (scales_hi_6[:, 7] << 4))
d1 = d * scales_lo_6[:, 0]
d2 = d * scales_lo_6[:, 1]
d3 = d * scales_lo_6[:, 2]
d4 = d * scales_lo_6[:, 3]
d5 = d * (scales_lo_4[:, 0] | (scales_hi_6[:, 0] << 4))
d6 = d * (scales_lo_4[:, 1] | (scales_hi_6[:, 1] << 4))
d7 = d * (scales_lo_4[:, 2] | (scales_hi_6[:, 2] << 4))
d8 = d * (scales_lo_4[:, 3] | (scales_hi_6[:, 3] << 4))
return np.concatenate(
[
d1 * (qs_lo_4[:, 0] + (bits[:, :, 0] << 4)) - m1,
d2 * (qs_hi_4[:, 0] + (bits[:, :, 1] << 4)) - m2,
d3 * (qs_lo_4[:, 1] + (bits[:, :, 2] << 4)) - m3,
d4 * (qs_hi_4[:, 1] + (bits[:, :, 3] << 4)) - m4,
d5 * (qs_lo_4[:, 2] + (bits[:, :, 4] << 4)) - m5,
d6 * (qs_hi_4[:, 2] + (bits[:, :, 5] << 4)) - m6,
d7 * (qs_lo_4[:, 3] + (bits[:, :, 6] << 4)) - m7,
d8 * (qs_hi_4[:, 3] + (bits[:, :, 7] << 4)) - m8,
],
axis=1,
)
def load_dequant_gguf_tensor(shape, ggml_type, data, n_bytes):
if ggml_type == GGML_TYPES["F32"]:
values = data
elif ggml_type == GGML_TYPES["F16"]:
values = data
elif ggml_type == GGML_TYPES["Q8_0"]:
values = dequantize_q8_0(data, n_bytes)
elif ggml_type == GGML_TYPES["Q4_0"]:
values = dequantize_q4_0(data, n_bytes)
elif ggml_type == GGML_TYPES["Q4_K"]:
values = dequantize_q4_k(data, n_bytes)
elif ggml_type == GGML_TYPES["Q6_K"]:
values = dequantize_q6_k(data, n_bytes)
elif ggml_type == GGML_TYPES["Q2_K"]:
values = dequantize_q2_k(data, n_bytes)
elif ggml_type == GGML_TYPES["Q3_K"]:
values = dequantize_q3_k(data, n_bytes)
elif ggml_type == GGML_TYPES["Q5_K"]:
values = dequantize_q5_k(data, n_bytes)
else:
raise NotImplementedError(
f"ggml_type {ggml_type} not implemented - please raise an issue on huggingface transformers: https://github.com/huggingface/transformers/issues/new/choose"
)
return values.reshape(shape[::-1])
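# Note that GGUF records tensor dimensions fastest-first, hence the `shape[::-1]`
# above: e.g. a tensor stored with ggml shape (4096, 32000) comes back as a
# numpy array of shape (32000, 4096).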
class GGUFTokenizerSkeleton:
def __init__(self, dict_):
for k, v in dict_.items():
setattr(self, k, v)
if not hasattr(self, "merges"):
if not hasattr(self, "tokens") or not hasattr(self, "scores"):
raise ValueError(
"tokens and scores need to be passed for a LLaMa tokenizer without merges to be instantiated."
)
tokens = self.tokens
scores = self.scores
vocab = {t: scores[i] for i, t in enumerate(tokens)}
logger.warning("Merges were not in checkpoint, building merges on the fly.")
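            # e.g. for a token "ab" whose pieces "a" and "b" are both in the vocab,
            # the candidate merge ("a", "b") is recorded with the merged token's score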
merges = []
for merge, piece_score in tqdm(vocab.items()):
local = []
for index in range(1, len(merge)):
piece_l, piece_r = merge[:index], merge[index:]
if piece_l in tokens and piece_r in tokens:
local.append((piece_l, piece_r, piece_score))
local = sorted(local, key=lambda x: (vocab[x[0]], vocab[x[1]]), reverse=True)
merges.extend(local)
merges = sorted(merges, key=lambda val: val[2], reverse=True)
merges = [(val[0], val[1]) for val in merges]
self.merges = merges
else:
self.merges = [tuple(merge.split(" ")) for merge in self.merges]
if not hasattr(self, "scores"):
self.scores = [None for _ in range(len(self.tokens))]
if not hasattr(self, "added_tokens"):
self.added_tokens = []
if not hasattr(self, "unk_token_id"):
self.unk_token_id = None
# Llama2 uses the field `unknown_token_id`
if hasattr(self, "unknown_token_id") and self.unk_token_id is None:
self.unk_token_id = self.unknown_token_id
class GGUFLlamaConverter(LlamaConverter):
def __init__(self, tokenizer_dict):
self.proto = GGUFTokenizerSkeleton(tokenizer_dict)
self.original_tokenizer = self.proto
self.additional_kwargs = {}
self.is_llama_3_tokenizer = getattr(self.proto, "tokenizer_type", "llama") != "llama"
def vocab(self, proto):
return list(zip(proto.tokens, proto.scores))
def merges(self, proto):
return proto.merges
def tokenizer(self, proto):
vocab_scores = self.vocab(self.proto)
merges = self.merges(self.proto)
bpe_vocab = {word: i for i, (word, _score) in enumerate(vocab_scores)}
unk_token = proto.tokens[proto.unk_token_id] if proto.unk_token_id is not None else None
bos_token = proto.tokens[proto.bos_token_id] if getattr(proto, "bos_token_id", None) is not None else None
        eos_token = proto.tokens[proto.eos_token_id] if getattr(proto, "eos_token_id", None) is not None else None
tokenizer = Tokenizer(BPE(bpe_vocab, merges, unk_token=unk_token, fuse_unk=True, byte_fallback=True))
special_tokens = []
if not hasattr(self.proto, "token_type"):
if unk_token is not None:
special_tokens.append(AddedToken(unk_token, normalized=False, special=True))
if bos_token is not None:
special_tokens.append(AddedToken(bos_token, normalized=False, special=True))
if eos_token is not None:
special_tokens.append(AddedToken(eos_token, normalized=False, special=True))
else:
            # token_type == 3 marks special (control) tokens
special_tokens_idx = np.where(np.array(self.proto.token_type) == 3)[0]
for idx in special_tokens_idx:
special_tokens.append(AddedToken(self.proto.tokens[idx], normalized=False, special=True))
if len(special_tokens) != 0:
tokenizer.add_special_tokens(special_tokens)
if len(self.proto.added_tokens) != 0:
tokenizer.add_tokens(
[AddedToken(added_token, normalized=False, special=False) for added_token in self.proto.added_tokens]
)
self.additional_kwargs["unk_token"] = unk_token
self.additional_kwargs["eos_token"] = bos_token
self.additional_kwargs["bos_token"] = eos_token
if self.is_llama_3_tokenizer:
self.additional_kwargs["add_prefix_space"] = None
self.additional_kwargs["clean_up_tokenization_spaces"] = True
self.additional_kwargs["legacy"] = False
self.original_tokenizer.legacy = False
return tokenizer
def decoder(self, replacement, add_prefix_space):
sequence = [
decoders.ByteFallback(),
decoders.Fuse(),
decoders.Replace("▁", " "),
]
if self.is_llama_3_tokenizer:
sequence += [decoders.ByteLevel(add_prefix_space=False, trim_offsets=False, use_regex=True)]
if add_prefix_space:
sequence += [decoders.Strip(content=" ", left=1)]
return decoders.Sequence(sequence)
def converted(self):
# Copied partly from converted method in SpmConverter class
tokenizer = self.tokenizer(self.proto)
# Tokenizer assemble
normalizer = self.normalizer(self.proto)
if normalizer is not None:
tokenizer.normalizer = normalizer
replacement = "▁"
add_prefix_space = True
if hasattr(self.original_tokenizer, "add_prefix_space"):
add_prefix_space = self.original_tokenizer.add_prefix_space
pre_tokenizer = self.pre_tokenizer(replacement, add_prefix_space)
if pre_tokenizer is not None:
tokenizer.pre_tokenizer = pre_tokenizer
tokenizer.decoder = self.decoder(replacement, add_prefix_space)
post_processor = self.post_processor()
if post_processor:
tokenizer.post_processor = post_processor
        # HACK: patch the llama-3 tokenizer to use the corresponding pre-tokenizer
        # and normalizer
if self.is_llama_3_tokenizer:
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
add_prefix_space=False, trim_offsets=False, use_regex=True
)
# This is tricky as the additional kwargs are passed after legacy is force-set in LlamaTokenizer's
# init.
tokenizer.normalizer = normalizers.Sequence([])
return tokenizer
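# Sketch of how GGUFLlamaConverter is driven (see convert_gguf_tokenizer below):
#   converter = GGUFLlamaConverter(tokenizer_dict)
#   tokenizer = converter.converted()
#   extra_kwargs = converter.additional_kwargs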
class GGUFQwen2Converter(Qwen2Converter):
def __init__(self, tokenizer_dict):
self.original_tokenizer = GGUFTokenizerSkeleton(tokenizer_dict)
self.additional_kwargs = {}
def converted(self) -> Tokenizer:
vocab = {word: i for i, word in enumerate(self.original_tokenizer.tokens)}
merges = self.original_tokenizer.merges
tokenizer = super().converted(vocab, merges)
tokenizer.add_special_tokens(
[
AddedToken("<|endoftext|>", normalized=False, special=True),
AddedToken("<|im_start|>", normalized=False, special=True),
AddedToken("<|im_end|>", normalized=False, special=True),
]
)
return tokenizer
GGUF_TO_FAST_CONVERTERS = {
"llama": GGUFLlamaConverter,
"qwen2": GGUFQwen2Converter,
}
def convert_gguf_tokenizer(architecture, tokenizer_dict):
    """
    Utilities to convert a GGUF tokenizer into a fast tokenizer instance.

    Args:
        architecture (`str`): The model architecture derived from the GGUF file.
        tokenizer_dict (`dict`):
            The tokenizer metadata extracted from the GGUF file, used to build the backend tokenizer for
            [`~tokenization_utils_base.PreTrainedTokenizerFast`].

    Return:
        A tuple of an instance of [`~tokenizers.Tokenizer`] to be used as the backend tokenizer of a
        [`~tokenization_utils_base.PreTrainedTokenizerFast`], and a dict of additional keyword arguments.
    """
tokenizer_class_name = architecture
converter = GGUF_TO_FAST_CONVERTERS[tokenizer_class_name](tokenizer_dict)
fast_tokenizer = converter.converted()
return fast_tokenizer, converter.additional_kwargs
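# Example usage, assuming `tokenizer_dict` holds the tokenizer metadata parsed from
# a GGUF checkpoint:
#   fast_tokenizer, additional_kwargs = convert_gguf_tokenizer("llama", tokenizer_dict)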