import gguf
import torch

# Map each GGML quantization type to the helper class that knows how to
# dequantize tensors stored in that format.
quants_mapping = {
    gguf.GGMLQuantizationType.Q2_K: gguf.Q2_K,
    gguf.GGMLQuantizationType.Q3_K: gguf.Q3_K,
    gguf.GGMLQuantizationType.Q4_0: gguf.Q4_0,
    gguf.GGMLQuantizationType.Q4_1: gguf.Q4_1,
    gguf.GGMLQuantizationType.Q4_K: gguf.Q4_K,
    gguf.GGMLQuantizationType.Q5_0: gguf.Q5_0,
    gguf.GGMLQuantizationType.Q5_1: gguf.Q5_1,
    gguf.GGMLQuantizationType.Q5_K: gguf.Q5_K,
    gguf.GGMLQuantizationType.Q6_K: gguf.Q6_K,
    gguf.GGMLQuantizationType.Q8_0: gguf.Q8_0,
}


class ParameterGGUF(torch.nn.Parameter):
    """A torch Parameter holding raw GGUF-quantized bytes plus the metadata
    needed to dequantize them on demand."""

    def __new__(cls, tensor=None, requires_grad=False, no_init=False):
        return super().__new__(cls, torch.tensor(tensor.data), requires_grad=requires_grad)

    def __init__(self, tensor=None, requires_grad=False, no_init=False):
        super().__init__()

        if no_init:
            # Metadata will be filled in by the caller (see copy_with_data).
            return

        self.gguf_cls = quants_mapping.get(tensor.tensor_type, None)
        # GGML stores shapes in reverse order relative to PyTorch.
        self.real_shape = torch.Size(reversed(list(tensor.shape)))
        self.computation_dtype = torch.float16
        self.baked = False

    @property
    def shape(self):
        # Report the logical (dequantized) shape, not the shape of the
        # raw quantized byte buffer.
        return self.real_shape

    def dequantize_as_pytorch_parameter(self):
        if self.gguf_cls is not None:
            self.gguf_cls.bake(self)
        return torch.nn.Parameter(dequantize_tensor(self), requires_grad=False)

    def copy_with_data(self, data):
        # Rebuild a ParameterGGUF around new storage while carrying the GGUF
        # metadata over, so .to() and pin_memory() do not lose it.
        new = ParameterGGUF(data, no_init=True)
        new.gguf_cls = self.gguf_cls
        new.real_shape = self.real_shape
        new.computation_dtype = self.computation_dtype
        new.baked = self.baked
        return new

    def to(self, *args, **kwargs):
        return self.copy_with_data(self.data.to(*args, **kwargs))

    def pin_memory(self, device=None):
        return self.copy_with_data(torch.Tensor.pin_memory(self, device=device))


def dequantize_tensor(tensor):
    if tensor is None:
        return None

    if not hasattr(tensor, 'gguf_cls'):
        # A plain tensor that was never GGUF-quantized; pass it through.
        return tensor

    gguf_cls = tensor.gguf_cls

    if gguf_cls is None:
        # Unrecognized quantization type; the data is stored as-is.
        return tensor

    return gguf_cls.dequantize_pytorch(tensor)
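

if __name__ == "__main__":
    # Usage sketch (illustrative, not part of the original module). It assumes
    # the gguf module exposes GGUFReader as the upstream gguf package does, and
    # "model.gguf" is a hypothetical path; adjust both for your environment.
    reader = gguf.GGUFReader("model.gguf")
    raw = reader.tensors[0]                      # a gguf ReaderTensor

    p = ParameterGGUF(raw)                       # wraps the raw quantized bytes
    w = p.dequantize_as_pytorch_parameter()      # plain torch.nn.Parameter
    assert w.shape == p.shape                    # logical (GGML-reversed) shape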