|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" |
|
Integration with GGML / The file is copied and adapted from https://github.com/99991/pygguf |
|
with extra methods beings exposed |
|
""" |
|
|
|
from array import array |
|
|
|
import numpy as np |
|
from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers |
|
from tokenizers.models import BPE |
|
|
|
from .. import AddedToken |
|
from ..convert_slow_tokenizer import LlamaConverter, Qwen2Converter |
|
from ..utils import logging |
|
from ..utils.logging import tqdm |
|
|
|
|
|
logger = logging.get_logger(__name__) |
|
|
|
|
|
|
|
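# Tensor data types supported by this loader; the values match ggml's `ggml_type` enum.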
GGML_TYPES = { |
|
"F32": 0, |
|
"F16": 1, |
|
"Q4_0": 2, |
|
"Q8_0": 8, |
|
"Q2_K": 10, |
|
"Q3_K": 11, |
|
"Q4_K": 12, |
|
"Q5_K": 13, |
|
"Q6_K": 14, |
|
} |
|
|
|
|
|
|
|
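# Size in bytes of one quantization block for each quantized type. The "_0" formats pack 32 values
# per block; the k-quant ("_K") formats pack 256 values per super-block.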
GGML_BLOCK_SIZES = { |
|
"Q8_0": 2 + 32, |
|
"Q4_K": 144, |
|
|
|
"Q4_0": 2 + 16, |
|
"Q6_K": 210, |
|
|
|
"Q2_K": 256 // 16 + 256 // 4 + 2 + 2, |
|
"Q3_K": 256 // 8 + 256 // 4 + 12 + 2, |
|
"Q5_K": 2 + 2 + 12 + 256 // 8 + 256 // 2, |
|
} |
|
|
|
|
|
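# GGUF metadata value types (a subset of the `gguf_metadata_value_type` enum from the GGUF spec).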
DATA_TYPES = { |
|
"uint32": 4, |
|
"int32": 5, |
|
"float32": 6, |
|
"bool": 7, |
|
"string": 8, |
|
"array": 9, |
|
"uint64": 10, |
|
} |
|
|
|
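# Maps GGUF tensor name fragments to the corresponding `transformers` parameter names, per architecture.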
GGUF_TENSOR_MAPPING = { |
|
"llama": { |
|
"token_embd": "model.embed_tokens", |
|
"blk": "model.layers", |
|
"ffn_up": "mlp.up_proj", |
|
"ffn_down": "mlp.down_proj", |
|
"ffn_gate": "mlp.gate_proj", |
|
"ffn_norm": "post_attention_layernorm", |
|
"attn_norm": "input_layernorm", |
|
"attn_q": "self_attn.q_proj", |
|
"attn_v": "self_attn.v_proj", |
|
"attn_k": "self_attn.k_proj", |
|
"attn_output": "self_attn.o_proj", |
|
"output.weight": "lm_head.weight", |
|
"output_norm": "model.norm", |
|
}, |
|
"mistral": { |
|
"token_embd": "model.embed_tokens", |
|
"blk": "model.layers", |
|
"ffn_up": "mlp.up_proj", |
|
"ffn_down": "mlp.down_proj", |
|
"ffn_gate": "mlp.gate_proj", |
|
"ffn_norm": "post_attention_layernorm", |
|
"attn_norm": "input_layernorm", |
|
"attn_q": "self_attn.q_proj", |
|
"attn_v": "self_attn.v_proj", |
|
"attn_k": "self_attn.k_proj", |
|
"attn_output": "self_attn.o_proj", |
|
"output.weight": "lm_head.weight", |
|
"output_norm": "model.norm", |
|
}, |
|
"qwen2": { |
|
"token_embd": "model.embed_tokens", |
|
"blk": "model.layers", |
|
"ffn_up": "mlp.up_proj", |
|
"ffn_down": "mlp.down_proj", |
|
"ffn_gate": "mlp.gate_proj", |
|
"ffn_norm": "post_attention_layernorm", |
|
"attn_norm": "input_layernorm", |
|
"attn_q": "self_attn.q_proj", |
|
"attn_v": "self_attn.v_proj", |
|
"attn_k": "self_attn.k_proj", |
|
"attn_output": "self_attn.o_proj", |
|
"output.weight": "lm_head.weight", |
|
"output_norm": "model.norm", |
|
}, |
|
} |
|
|
|
|
|
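# Maps GGUF metadata keys to `transformers` config attributes, per architecture. Keys mapped to `None`
# are recognized but have no direct config equivalent and are not forwarded.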
GGUF_CONFIG_MAPPING = { |
|
"general": { |
|
"architecture": "model_type", |
|
"name": "_model_name_or_path", |
|
}, |
|
"llama": { |
|
"context_length": "max_position_embeddings", |
|
"block_count": "num_hidden_layers", |
|
"feed_forward_length": "intermediate_size", |
|
"embedding_length": "hidden_size", |
|
"rope.dimension_count": None, |
|
"rope.freq_base": "rope_theta", |
|
"attention.head_count": "num_attention_heads", |
|
"attention.head_count_kv": "num_key_value_heads", |
|
"attention.layer_norm_rms_epsilon": "rms_norm_eps", |
|
"vocab_size": "vocab_size", |
|
}, |
|
"mistral": { |
|
"context_length": "max_position_embeddings", |
|
"block_count": "num_hidden_layers", |
|
"feed_forward_length": "intermediate_size", |
|
"embedding_length": "hidden_size", |
|
"rope.dimension_count": None, |
|
"rope.freq_base": "rope_theta", |
|
"attention.head_count": "num_attention_heads", |
|
"attention.head_count_kv": "num_key_value_heads", |
|
"attention.layer_norm_rms_epsilon": "rms_norm_eps", |
|
"vocab_size": "vocab_size", |
|
}, |
|
"qwen2": { |
|
"context_length": "max_position_embeddings", |
|
"block_count": "num_hidden_layers", |
|
"feed_forward_length": "intermediate_size", |
|
"embedding_length": "hidden_size", |
|
"rope.dimension_count": None, |
|
"rope.freq_base": "rope_theta", |
|
"attention.head_count": "num_attention_heads", |
|
"attention.head_count_kv": "num_key_value_heads", |
|
"attention.layer_norm_rms_epsilon": "rms_norm_eps", |
|
"vocab_size": "vocab_size", |
|
}, |
|
"tokenizer": { |
|
"ggml.bos_token_id": "bos_token_id", |
|
"ggml.eos_token_id": "eos_token_id", |
|
"ggml.unknown_token_id": "unk_token_id", |
|
"ggml.padding_token_id": "pad_token_id", |
|
}, |
|
} |
|
|
|
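# Maps GGUF tokenizer metadata keys to the fields used to rebuild the tokenizer and its tokenizer_config.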
GGUF_TOKENIZER_MAPPING = { |
|
"tokenizer": { |
|
"ggml.model": "tokenizer_type", |
|
"ggml.tokens": "tokens", |
|
"ggml.scores": "scores", |
|
"ggml.token_type": "token_type", |
|
"ggml.merges": "merges", |
|
"ggml.bos_token_id": "bos_token_id", |
|
"ggml.eos_token_id": "eos_token_id", |
|
"ggml.unknown_token_id": "unk_token_id", |
|
"ggml.padding_token_id": "pad_token_id", |
|
"ggml.add_space_prefix": "add_prefix_space", |
|
}, |
|
"tokenizer_config": { |
|
"chat_template": "chat_template", |
|
"ggml.model": "model_type", |
|
"ggml.bos_token_id": "bos_token_id", |
|
"ggml.eos_token_id": "eos_token_id", |
|
"ggml.unknown_token_id": "unk_token_id", |
|
"ggml.padding_token_id": "pad_token_id", |
|
}, |
|
} |
|
|
|
|
|
def _gguf_parse_value(_value, data_type): |
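    # GGUF metadata values are read as raw sequences; decode them according to the declared type id
    # (see DATA_TYPES above). Arrays (type 9) carry a second type id describing their elements.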
|
if not isinstance(data_type, list): |
|
data_type = [data_type] |
|
if len(data_type) == 1: |
|
data_type = data_type[0] |
|
array_data_type = None |
|
else: |
|
if data_type[0] != 9: |
|
raise ValueError("Received multiple types, therefore expected the first type to indicate an array.") |
|
data_type, array_data_type = data_type |
|
|
|
if data_type in [0, 1, 2, 3, 4, 5, 10, 11]: |
|
_value = int(_value[0]) |
|
elif data_type in [6, 12]: |
|
_value = float(_value[0]) |
|
elif data_type in [7]: |
|
_value = bool(_value[0]) |
|
elif data_type in [8]: |
|
_value = array("B", list(_value)).tobytes().decode() |
|
elif data_type in [9]: |
|
_value = _gguf_parse_value(_value, array_data_type) |
|
return _value |
|
|
|
|
|
def dequantize_q4_k(data, n_bytes: int): |
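    # Q4_K: 4-bit k-quant. Each 144-byte super-block stores 256 values: two float16 super-scales
    # (d and dmin), 12 bytes of packed 6-bit sub-block scales/offsets and 128 bytes of 4-bit quants.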
|
|
|
|
|
|
|
|
|
block_size = GGML_BLOCK_SIZES["Q4_K"] |
|
num_blocks = n_bytes // block_size |
|
|
|
data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, block_size // 2) |
|
data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size) |
|
|
|
|
|
scale_factors = data_f16[:, 0].reshape(num_blocks, 1, 1).astype(np.float32) |
|
scale_offsets = data_f16[:, 1].reshape(num_blocks, 1, 1).astype(np.float32) |
|
qs1 = data_u8[:, 4:16].reshape(num_blocks, 12, 1) |
|
qs2 = data_u8[:, 16:].reshape(num_blocks, 4, 32) |
|
|
|
|
|
factors = scale_factors * np.concatenate( |
|
[qs1[:, 0:4] & 0b111111, (qs1[:, 8:] & 15) | ((qs1[:, 0:4] >> 6) << 4)], axis=1 |
|
) |
|
offsets = scale_offsets * np.concatenate( |
|
[qs1[:, 4:8] & 0b111111, (qs1[:, 8:] >> 4) | ((qs1[:, 4:8] >> 6) << 4)], axis=1 |
|
) |
|
|
|
|
|
qs2 = np.stack([qs2 & 0xF, qs2 >> 4], axis=2).reshape(num_blocks, 8, 32) |
|
|
|
return factors * qs2 - offsets |
|
|
|
|
|
def dequantize_q4_0(data, n_bytes: int): |
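    # Q4_0: each 18-byte block stores a float16 scale followed by 32 values packed as 4-bit unsigned
    # quants (two per byte); values are dequantized as scale * (quant - 8).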
|
|
|
|
|
|
|
|
|
block_size = GGML_BLOCK_SIZES["Q4_0"] |
|
num_blocks = n_bytes // block_size |
|
|
|
data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, block_size // 2) |
|
data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size) |
|
|
|
|
|
scales = data_f16[:, 0].reshape(num_blocks, 1).astype(np.float32) |
|
|
|
|
|
quants = data_u8[:, 2:] |
|
|
|
ql = (quants[:, :] & 0xF).astype(np.int8) - 8 |
|
qr = (quants[:, :] >> 4).astype(np.int8) - 8 |
|
|
|
|
|
quants = np.hstack([ql, qr]) |
|
|
|
return (scales * quants).astype(np.float32) |
|
|
|
|
|
def dequantize_q6_k(data, n_bytes: int): |
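    # Q6_K: 6-bit k-quant. Each 210-byte super-block stores 256 values: 128 bytes of low 4 bits,
    # 64 bytes of upper 2 bits, 16 signed 8-bit sub-block scales and a float16 super-scale.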
|
|
|
|
|
|
|
|
|
block_size = GGML_BLOCK_SIZES["Q6_K"] |
|
num_blocks = n_bytes // block_size |
|
|
|
data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, block_size // 2) |
|
data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size) |
|
data_i8 = np.frombuffer(data, dtype=np.int8).reshape(num_blocks, block_size) |
|
|
|
scales = data_f16[:, -1].reshape(num_blocks, 1).astype(np.float32) |
|
|
|
|
|
ql = data_u8[:, :128].astype(np.int16) |
|
qh = data_u8[:, 128:192].astype(np.int16) |
|
sc = data_i8[:, 192:208, np.newaxis].astype(np.float32) |
|
|
|
|
|
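    # Note: operator precedence makes `- 32` bind to the shifted high bits before the `|`, but since
    # the low 4 bits of `(high << 4) - 32` are always zero in int16, the result equals the intended
    # `((low | (high << 4)) - 32)`.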
q1 = (ql[:, :32] & 0xF) | (((qh[:, :32] >> 0) & 3) << 4) - 32 |
|
q2 = (ql[:, 32:64] & 0xF) | (((qh[:, :32] >> 2) & 3) << 4) - 32 |
|
q3 = (ql[:, :32] >> 4) | (((qh[:, :32] >> 4) & 3) << 4) - 32 |
|
q4 = (ql[:, 32:64] >> 4) | (((qh[:, :32] >> 6) & 3) << 4) - 32 |
|
q5 = (ql[:, 64:96] & 0xF) | (((qh[:, 32:] >> 0) & 3) << 4) - 32 |
|
q6 = (ql[:, 96:128] & 0xF) | (((qh[:, 32:] >> 2) & 3) << 4) - 32 |
|
q7 = (ql[:, 64:96] >> 4) | (((qh[:, 32:] >> 4) & 3) << 4) - 32 |
|
q8 = (ql[:, 96:128] >> 4) | (((qh[:, 32:] >> 6) & 3) << 4) - 32 |
|
|
|
|
|
return scales * np.concatenate( |
|
[ |
|
sc[:, 0] * q1[:, :16], |
|
sc[:, 1] * q1[:, 16:], |
|
sc[:, 2] * q2[:, :16], |
|
sc[:, 3] * q2[:, 16:], |
|
sc[:, 4] * q3[:, :16], |
|
sc[:, 5] * q3[:, 16:], |
|
sc[:, 6] * q4[:, :16], |
|
sc[:, 7] * q4[:, 16:], |
|
sc[:, 8] * q5[:, :16], |
|
sc[:, 9] * q5[:, 16:], |
|
sc[:, 10] * q6[:, :16], |
|
sc[:, 11] * q6[:, 16:], |
|
sc[:, 12] * q7[:, :16], |
|
sc[:, 13] * q7[:, 16:], |
|
sc[:, 14] * q8[:, :16], |
|
sc[:, 15] * q8[:, 16:], |
|
], |
|
axis=1, |
|
) |
|
|
|
|
|
def dequantize_q8_0(data, n_bytes: int): |
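    # Q8_0: each 34-byte block stores a float16 scale followed by 32 signed 8-bit quants.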
|
|
|
|
|
block_size = GGML_BLOCK_SIZES["Q8_0"] |
|
num_blocks = n_bytes // block_size |
|
|
|
scales = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, 1 + 16)[:, :1].astype(np.float32) |
|
qs = np.frombuffer(data, dtype=np.int8).reshape(num_blocks, 2 + 32)[:, 2:] |
|
|
|
return scales * qs |
|
|
|
|
|
def dequantize_q2_k(data, n_bytes: int): |
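    # Q2_K: 2-bit k-quant. Each 84-byte super-block stores 256 values: 16 bytes of packed 4-bit
    # sub-block scales/mins, 64 bytes of 2-bit quants and two float16 super-scales (d and dmin).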
|
|
|
|
|
|
|
|
|
num_blocks = n_bytes // GGML_BLOCK_SIZES["Q2_K"] |
|
|
|
data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, GGML_BLOCK_SIZES["Q2_K"] // 2) |
|
data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, GGML_BLOCK_SIZES["Q2_K"]) |
|
|
|
dmin = data_f16[:, -1].reshape(num_blocks, 1, 1).astype(np.float32) |
|
d = data_f16[:, -2].reshape(num_blocks, 1, 1).astype(np.float32) |
|
scales = data_u8[:, :16].reshape(num_blocks, 16, 1) |
|
qs = data_u8[:, 16:80].reshape(num_blocks, 64) |
|
|
|
tmp = np.stack( |
|
[ |
|
qs[:, 00:16] >> 0, |
|
qs[:, 16:32] >> 0, |
|
qs[:, 00:16] >> 2, |
|
qs[:, 16:32] >> 2, |
|
qs[:, 00:16] >> 4, |
|
qs[:, 16:32] >> 4, |
|
qs[:, 00:16] >> 6, |
|
qs[:, 16:32] >> 6, |
|
qs[:, 32:48] >> 0, |
|
qs[:, 48:64] >> 0, |
|
qs[:, 32:48] >> 2, |
|
qs[:, 48:64] >> 2, |
|
qs[:, 32:48] >> 4, |
|
qs[:, 48:64] >> 4, |
|
qs[:, 32:48] >> 6, |
|
qs[:, 48:64] >> 6, |
|
], |
|
axis=1, |
|
) |
|
|
|
return d * (scales & 15) * (tmp & 3) - dmin * (scales >> 4) |
|
|
|
|
|
def dequantize_q3_k(data, n_bytes: int): |
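    # Q3_K: 3-bit k-quant. Each 110-byte super-block stores 256 values: a 32-byte high-bit mask,
    # 64 bytes of 2-bit quants, 12 bytes of packed 6-bit sub-block scales and a float16 super-scale.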
|
|
|
|
|
|
|
|
|
num_blocks = n_bytes // GGML_BLOCK_SIZES["Q3_K"] |
|
|
|
data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, GGML_BLOCK_SIZES["Q3_K"] // 2) |
|
data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, GGML_BLOCK_SIZES["Q3_K"]) |
|
|
|
d = data_f16[:, -1].reshape(num_blocks, 1, 1).astype(np.float32) |
|
bits = np.unpackbits(data_u8[:, :32].reshape(num_blocks, 32, 1), axis=-1, bitorder="little") |
|
bits = 4 ^ (bits << 2) |
|
qs = data_u8[:, 32 : 32 + 64].astype(np.int16) |
|
a, b, c = data_u8[:, 96 : 96 + 12].reshape(num_blocks, 3, 4).transpose(1, 0, 2) |
|
scales = np.zeros((num_blocks, 4, 4), dtype=np.uint8) |
|
scales[:, 0] = (a & 15) | ((c & 3) << 4) |
|
scales[:, 1] = (b & 15) | (((c >> 2) & 3) << 4) |
|
scales[:, 2] = (a >> 4) | (((c >> 4) & 3) << 4) |
|
scales[:, 3] = (b >> 4) | ((c >> 6) << 4) |
|
scales = scales.reshape(num_blocks, 16, 1).astype(np.int16) |
|
|
|
return ( |
|
d |
|
* (scales - 32) |
|
* np.stack( |
|
[ |
|
(((qs[:, 00:16] >> 0) & 3) - bits[:, :16, 0]), |
|
(((qs[:, 16:32] >> 0) & 3) - bits[:, 16:, 0]), |
|
(((qs[:, 00:16] >> 2) & 3) - bits[:, :16, 1]), |
|
(((qs[:, 16:32] >> 2) & 3) - bits[:, 16:, 1]), |
|
(((qs[:, 00:16] >> 4) & 3) - bits[:, :16, 2]), |
|
(((qs[:, 16:32] >> 4) & 3) - bits[:, 16:, 2]), |
|
(((qs[:, 00:16] >> 6) & 3) - bits[:, :16, 3]), |
|
(((qs[:, 16:32] >> 6) & 3) - bits[:, 16:, 3]), |
|
(((qs[:, 32:48] >> 0) & 3) - bits[:, :16, 4]), |
|
(((qs[:, 48:64] >> 0) & 3) - bits[:, 16:, 4]), |
|
(((qs[:, 32:48] >> 2) & 3) - bits[:, :16, 5]), |
|
(((qs[:, 48:64] >> 2) & 3) - bits[:, 16:, 5]), |
|
(((qs[:, 32:48] >> 4) & 3) - bits[:, :16, 6]), |
|
(((qs[:, 48:64] >> 4) & 3) - bits[:, 16:, 6]), |
|
(((qs[:, 32:48] >> 6) & 3) - bits[:, :16, 7]), |
|
(((qs[:, 48:64] >> 6) & 3) - bits[:, 16:, 7]), |
|
], |
|
axis=1, |
|
) |
|
) |
|
|
|
|
|
def dequantize_q5_k(data, n_bytes: int): |
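    # Q5_K: 5-bit k-quant. Each 176-byte super-block stores 256 values: two float16 super-scales
    # (d and dmin), 12 bytes of packed 6-bit sub-block scales/mins, a 32-byte high-bit plane and
    # 128 bytes of 4-bit low quants.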
|
|
|
|
|
|
|
|
|
num_blocks = n_bytes // GGML_BLOCK_SIZES["Q5_K"] |
|
|
|
data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, GGML_BLOCK_SIZES["Q5_K"] // 2) |
|
data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, GGML_BLOCK_SIZES["Q5_K"]) |
|
|
|
d = data_f16[:, 0].reshape(num_blocks, 1).astype(np.float32) |
|
dmin = data_f16[:, 1].reshape(num_blocks, 1).astype(np.float32) |
|
scales = data_u8[:, 4:16].reshape(num_blocks, 12, 1) |
|
qh = data_u8[:, 16 : 16 + 32].reshape(num_blocks, 32, 1) |
|
qs = data_u8[:, 48 : 48 + 128].reshape(num_blocks, 4, 32) |
|
|
|
bits = np.unpackbits(qh, axis=-1, bitorder="little") |
|
|
|
qs_hi_4 = qs >> 4 |
|
qs_lo_4 = qs & 15 |
|
|
|
scales_lo_6 = scales[:, :8] & 63 |
|
scales_hi_6 = scales[:, :8] >> 6 |
|
scales_lo_4 = scales[:, 8:] & 15 |
|
scales_hi_4 = scales[:, 8:] >> 4 |
|
|
|
m1 = dmin * scales_lo_6[:, 4] |
|
m2 = dmin * scales_lo_6[:, 5] |
|
m3 = dmin * scales_lo_6[:, 6] |
|
m4 = dmin * scales_lo_6[:, 7] |
|
m5 = dmin * (scales_hi_4[:, 0] | (scales_hi_6[:, 4] << 4)) |
|
m6 = dmin * (scales_hi_4[:, 1] | (scales_hi_6[:, 5] << 4)) |
|
m7 = dmin * (scales_hi_4[:, 2] | (scales_hi_6[:, 6] << 4)) |
|
m8 = dmin * (scales_hi_4[:, 3] | (scales_hi_6[:, 7] << 4)) |
|
|
|
d1 = d * scales_lo_6[:, 0] |
|
d2 = d * scales_lo_6[:, 1] |
|
d3 = d * scales_lo_6[:, 2] |
|
d4 = d * scales_lo_6[:, 3] |
|
d5 = d * (scales_lo_4[:, 0] | (scales_hi_6[:, 0] << 4)) |
|
d6 = d * (scales_lo_4[:, 1] | (scales_hi_6[:, 1] << 4)) |
|
d7 = d * (scales_lo_4[:, 2] | (scales_hi_6[:, 2] << 4)) |
|
d8 = d * (scales_lo_4[:, 3] | (scales_hi_6[:, 3] << 4)) |
|
|
|
return np.concatenate( |
|
[ |
|
d1 * (qs_lo_4[:, 0] + (bits[:, :, 0] << 4)) - m1, |
|
d2 * (qs_hi_4[:, 0] + (bits[:, :, 1] << 4)) - m2, |
|
d3 * (qs_lo_4[:, 1] + (bits[:, :, 2] << 4)) - m3, |
|
d4 * (qs_hi_4[:, 1] + (bits[:, :, 3] << 4)) - m4, |
|
d5 * (qs_lo_4[:, 2] + (bits[:, :, 4] << 4)) - m5, |
|
d6 * (qs_hi_4[:, 2] + (bits[:, :, 5] << 4)) - m6, |
|
d7 * (qs_lo_4[:, 3] + (bits[:, :, 6] << 4)) - m7, |
|
d8 * (qs_hi_4[:, 3] + (bits[:, :, 7] << 4)) - m8, |
|
], |
|
axis=1, |
|
) |
|
|
|
|
|
def load_dequant_gguf_tensor(shape, ggml_type, data, n_bytes): |
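    # Dispatch on the ggml tensor type and dequantize to float32 numpy arrays; F32/F16 tensors are
    # already usable as-is. GGUF stores dimensions in reverse order, hence the reshape(shape[::-1]).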
|
if ggml_type == GGML_TYPES["F32"]: |
|
values = data |
|
elif ggml_type == GGML_TYPES["F16"]: |
|
values = data |
|
elif ggml_type == GGML_TYPES["Q8_0"]: |
|
values = dequantize_q8_0(data, n_bytes) |
|
elif ggml_type == GGML_TYPES["Q4_0"]: |
|
values = dequantize_q4_0(data, n_bytes) |
|
elif ggml_type == GGML_TYPES["Q4_K"]: |
|
values = dequantize_q4_k(data, n_bytes) |
|
elif ggml_type == GGML_TYPES["Q6_K"]: |
|
values = dequantize_q6_k(data, n_bytes) |
|
elif ggml_type == GGML_TYPES["Q2_K"]: |
|
values = dequantize_q2_k(data, n_bytes) |
|
elif ggml_type == GGML_TYPES["Q3_K"]: |
|
values = dequantize_q3_k(data, n_bytes) |
|
elif ggml_type == GGML_TYPES["Q5_K"]: |
|
values = dequantize_q5_k(data, n_bytes) |
|
else: |
|
raise NotImplementedError( |
|
f"ggml_type {ggml_type} not implemented - please raise an issue on huggingface transformers: https://github.com/huggingface/transformers/issues/new/choose" |
|
) |
|
|
|
return values.reshape(shape[::-1]) |
|
|
|
|
|
class GGUFTokenizerSkeleton: |
|
def __init__(self, dict_): |
|
for k, v in dict_.items(): |
|
setattr(self, k, v) |
|
|
|
if not hasattr(self, "merges"): |
|
if not hasattr(self, "tokens") or not hasattr(self, "scores"): |
|
raise ValueError( |
|
"tokens and scores need to be passed for a LLaMa tokenizer without merges to be instantiated." |
|
) |
|
tokens = self.tokens |
|
scores = self.scores |
|
vocab = {t: scores[i] for i, t in enumerate(tokens)} |
|
|
|
logger.warning("Merges were not in checkpoint, building merges on the fly.") |
|
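            # Rebuild BPE merges from the SentencePiece-style vocab: for every token, consider each
            # split into two pieces that are themselves tokens, and rank the candidate merges by score.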
merges = [] |
|
for merge, piece_score in tqdm(vocab.items()): |
|
local = [] |
|
for index in range(1, len(merge)): |
|
piece_l, piece_r = merge[:index], merge[index:] |
|
if piece_l in tokens and piece_r in tokens: |
|
local.append((piece_l, piece_r, piece_score)) |
|
local = sorted(local, key=lambda x: (vocab[x[0]], vocab[x[1]]), reverse=True) |
|
merges.extend(local) |
|
merges = sorted(merges, key=lambda val: val[2], reverse=True) |
|
merges = [(val[0], val[1]) for val in merges] |
|
self.merges = merges |
|
else: |
|
self.merges = [tuple(merge.split(" ")) for merge in self.merges] |
|
if not hasattr(self, "scores"): |
|
self.scores = [None for _ in range(len(self.tokens))] |
|
|
|
if not hasattr(self, "added_tokens"): |
|
self.added_tokens = [] |
|
|
|
if not hasattr(self, "unk_token_id"): |
|
self.unk_token_id = None |
|
|
|
|
|
if hasattr(self, "unknown_token_id") and self.unk_token_id is None: |
|
self.unk_token_id = self.unknown_token_id |
|
|
|
|
|
class GGUFLlamaConverter(LlamaConverter): |
|
def __init__(self, tokenizer_dict): |
|
self.proto = GGUFTokenizerSkeleton(tokenizer_dict) |
|
self.original_tokenizer = self.proto |
|
self.additional_kwargs = {} |
|
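        # SentencePiece-style GGUF checkpoints report tokenizer_type "llama"; anything else (e.g. the
        # byte-level BPE "gpt2" tokenizer shipped with Llama 3 style checkpoints) is handled with the
        # ByteLevel pre-tokenizer / decoder overrides below.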
self.is_llama_3_tokenizer = getattr(self.proto, "tokenizer_type", "llama") != "llama" |
|
|
|
def vocab(self, proto): |
|
return list(zip(proto.tokens, proto.scores)) |
|
|
|
def merges(self, proto): |
|
return proto.merges |
|
|
|
def tokenizer(self, proto): |
|
vocab_scores = self.vocab(self.proto) |
|
merges = self.merges(self.proto) |
|
bpe_vocab = {word: i for i, (word, _score) in enumerate(vocab_scores)} |
|
|
|
unk_token = proto.tokens[proto.unk_token_id] if proto.unk_token_id is not None else None |
|
bos_token = proto.tokens[proto.bos_token_id] if getattr(proto, "bos_token_id", None) is not None else None |
|
        eos_token = proto.tokens[proto.eos_token_id] if getattr(proto, "eos_token_id", None) is not None else None
|
|
|
tokenizer = Tokenizer(BPE(bpe_vocab, merges, unk_token=unk_token, fuse_unk=True, byte_fallback=True)) |
|
|
|
special_tokens = [] |
|
|
|
if not hasattr(self.proto, "token_type"): |
|
if unk_token is not None: |
|
special_tokens.append(AddedToken(unk_token, normalized=False, special=True)) |
|
|
|
if bos_token is not None: |
|
special_tokens.append(AddedToken(bos_token, normalized=False, special=True)) |
|
|
|
if eos_token is not None: |
|
special_tokens.append(AddedToken(eos_token, normalized=False, special=True)) |
|
else: |
|
|
|
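            # In GGUF vocabularies, token_type == 3 marks CONTROL (special) tokens.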
special_tokens_idx = np.where(np.array(self.proto.token_type) == 3)[0] |
|
|
|
for idx in special_tokens_idx: |
|
special_tokens.append(AddedToken(self.proto.tokens[idx], normalized=False, special=True)) |
|
|
|
if len(special_tokens) != 0: |
|
tokenizer.add_special_tokens(special_tokens) |
|
|
|
if len(self.proto.added_tokens) != 0: |
|
tokenizer.add_tokens( |
|
[AddedToken(added_token, normalized=False, special=False) for added_token in self.proto.added_tokens] |
|
) |
|
|
|
self.additional_kwargs["unk_token"] = unk_token |
|
self.additional_kwargs["eos_token"] = bos_token |
|
self.additional_kwargs["bos_token"] = eos_token |
|
|
|
if self.is_llama_3_tokenizer: |
|
self.additional_kwargs["add_prefix_space"] = None |
|
self.additional_kwargs["clean_up_tokenization_spaces"] = True |
|
|
|
self.additional_kwargs["legacy"] = False |
|
self.original_tokenizer.legacy = False |
|
|
|
return tokenizer |
|
|
|
def decoder(self, replacement, add_prefix_space): |
|
sequence = [ |
|
decoders.ByteFallback(), |
|
decoders.Fuse(), |
|
decoders.Replace("▁", " "), |
|
] |
|
|
|
if self.is_llama_3_tokenizer: |
|
sequence += [decoders.ByteLevel(add_prefix_space=False, trim_offsets=False, use_regex=True)] |
|
|
|
if add_prefix_space: |
|
sequence += [decoders.Strip(content=" ", left=1)] |
|
return decoders.Sequence(sequence) |
|
|
|
def converted(self): |
|
|
|
tokenizer = self.tokenizer(self.proto) |
|
|
|
|
|
normalizer = self.normalizer(self.proto) |
|
if normalizer is not None: |
|
tokenizer.normalizer = normalizer |
|
|
|
replacement = "▁" |
|
add_prefix_space = True |
|
if hasattr(self.original_tokenizer, "add_prefix_space"): |
|
add_prefix_space = self.original_tokenizer.add_prefix_space |
|
|
|
pre_tokenizer = self.pre_tokenizer(replacement, add_prefix_space) |
|
if pre_tokenizer is not None: |
|
tokenizer.pre_tokenizer = pre_tokenizer |
|
|
|
tokenizer.decoder = self.decoder(replacement, add_prefix_space) |
|
post_processor = self.post_processor() |
|
if post_processor: |
|
tokenizer.post_processor = post_processor |
|
|
|
|
|
|
|
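        # Llama 3 style byte-level BPE tokenizers: override the SentencePiece-oriented pre-tokenizer
        # and normalizer set above with ByteLevel pre-tokenization and an empty normalizer sequence.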
if self.is_llama_3_tokenizer: |
|
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel( |
|
add_prefix_space=False, trim_offsets=False, use_regex=True |
|
) |
|
|
|
|
|
tokenizer.normalizer = normalizers.Sequence([]) |
|
|
|
return tokenizer |
|
|
|
|
|
class GGUFQwen2Converter(Qwen2Converter): |
|
def __init__(self, tokenizer_dict): |
|
self.original_tokenizer = GGUFTokenizerSkeleton(tokenizer_dict) |
|
self.additional_kwargs = {} |
|
|
|
def converted(self) -> Tokenizer: |
|
vocab = {word: i for i, word in enumerate(self.original_tokenizer.tokens)} |
|
merges = self.original_tokenizer.merges |
|
tokenizer = super().converted(vocab, merges) |
|
|
|
tokenizer.add_special_tokens( |
|
[ |
|
AddedToken("<|endoftext|>", normalized=False, special=True), |
|
AddedToken("<|im_start|>", normalized=False, special=True), |
|
AddedToken("<|im_end|>", normalized=False, special=True), |
|
] |
|
) |
|
return tokenizer |
|
|
|
|
|
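# Tokenizer converters available per GGUF architecture.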
GGUF_TO_FAST_CONVERTERS = { |
|
"llama": GGUFLlamaConverter, |
|
"qwen2": GGUFQwen2Converter, |
|
} |
|
|
|
|
|
def convert_gguf_tokenizer(architecture, tokenizer_dict) -> Tokenizer: |
|
""" |
|
Utilities to convert a slow tokenizer instance in a fast tokenizer instance. |
|
|
|
Args: |
|
architecture (`str`): The model architecture derived from gguf file. |
|
transformer_tokenizer ([`~tokenization_utils_base.PreTrainedTokenizer`]): |
|
Instance of a slow tokenizer to convert in the backend tokenizer for |
|
[`~tokenization_utils_base.PreTrainedTokenizerFast`]. |
|
|
|
Return: |
|
A instance of [`~tokenizers.Tokenizer`] to be used as the backend tokenizer of a |
|
[`~tokenization_utils_base.PreTrainedTokenizerFast`] |
|
""" |
|
tokenizer_class_name = architecture |
|
converter = GGUF_TO_FAST_CONVERTERS[tokenizer_class_name](tokenizer_dict) |
|
fast_tokenizer = converter.converted() |
|
return fast_tokenizer, converter.additional_kwargs |
|
|