import gguf
import argparse
import logging
import sys
import torch
import json
import os
import numpy as np
from typing import cast, ContextManager, Any, Iterator
from pathlib import Path
from torch import Tensor

logger = logging.getLogger("gemma3-mmproj")


# (copied from convert_hf_to_gguf.py)
# tree of lazy tensors
class LazyTorchTensor(gguf.LazyBase):
    _tensor_type = torch.Tensor

    # to keep the type-checker happy
    dtype: torch.dtype
    shape: torch.Size

    # only used when converting a torch.Tensor to a np.ndarray
    _dtype_map: dict[torch.dtype, type] = {
        torch.float16: np.float16,
        torch.float32: np.float32,
    }

    # used for safetensors slices
    # ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046
    # TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734
    _dtype_str_map: dict[str, torch.dtype] = {
        "F64": torch.float64,
        "F32": torch.float32,
        "BF16": torch.bfloat16,
        "F16": torch.float16,
        # "U64": torch.uint64,
        "I64": torch.int64,
        # "U32": torch.uint32,
        "I32": torch.int32,
        # "U16": torch.uint16,
        "I16": torch.int16,
        "U8": torch.uint8,
        "I8": torch.int8,
        "BOOL": torch.bool,
        "F8_E4M3": torch.float8_e4m3fn,
        "F8_E5M2": torch.float8_e5m2,
    }

    def numpy(self) -> gguf.LazyNumpyTensor:
        dtype = self._dtype_map[self.dtype]
        return gguf.LazyNumpyTensor(
            meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape),
            args=(self,),
            func=(lambda s: s.numpy())
        )

    @classmethod
    def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: tuple[int, ...]) -> Tensor:
        return torch.empty(size=shape, dtype=dtype, device="meta")

    @classmethod
    def from_safetensors_slice(cls, st_slice: Any) -> Tensor:
        dtype = cls._dtype_str_map[st_slice.get_dtype()]
        shape: tuple[int, ...] = tuple(st_slice.get_shape())
        lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[:])
        return cast(torch.Tensor, lazy)

    @classmethod
    def __torch_function__(cls, func, types, args=(), kwargs=None):
        del types  # unused

        if kwargs is None:
            kwargs = {}

        if func is torch.Tensor.numpy:
            return args[0].numpy()

        return cls._wrap_fn(func)(*args, **kwargs)


class Gemma3VisionTower:
    hparams: dict
    gguf_writer: gguf.GGUFWriter
    fname_out: Path
    ftype: gguf.LlamaFileType

    @staticmethod
    def load_hparams(dir_model: Path):
        with open(dir_model / "config.json", "r", encoding="utf-8") as f:
            return json.load(f)

    @staticmethod
    def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]:
        part_names: list[str] = []
        for filename in os.listdir(dir_model):
            if filename.startswith(prefix) and filename.endswith(suffix):
                part_names.append(filename)

        part_names.sort()

        return part_names

    def __init__(self,
                 dir_model: Path,
                 fname_out: Path,
                 ftype: gguf.LlamaFileType,
                 is_big_endian: bool,):
        hparams = Gemma3VisionTower.load_hparams(dir_model)
        self.hparams = hparams
        self.fname_out = fname_out
        self.ftype = ftype
        endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
        self.gguf_writer = gguf.GGUFWriter(path=None, arch="clip", endianess=endianess)

        text_config = hparams["text_config"]
        vision_config = hparams["vision_config"]
        assert hparams["architectures"][0] == "Gemma3ForConditionalGeneration"
        assert text_config is not None
        assert vision_config is not None

        self.gguf_writer.add_string ("clip.projector_type", "gemma3")
        self.gguf_writer.add_bool   ("clip.has_text_encoder", False)
        self.gguf_writer.add_bool   ("clip.has_vision_encoder", True)
        self.gguf_writer.add_bool   ("clip.has_llava_projector", False)  # legacy
        self.gguf_writer.add_uint32 ("clip.vision.image_size", vision_config["image_size"])
        self.gguf_writer.add_uint32 ("clip.vision.patch_size", vision_config["patch_size"])
        self.gguf_writer.add_uint32 ("clip.vision.embedding_length", vision_config["hidden_size"])
        self.gguf_writer.add_uint32 ("clip.vision.feed_forward_length", vision_config["intermediate_size"])
        self.gguf_writer.add_uint32 ("clip.vision.projection_dim", text_config["hidden_size"])
        self.gguf_writer.add_uint32 ("clip.vision.block_count", vision_config["num_hidden_layers"])
        self.gguf_writer.add_uint32 ("clip.vision.attention.head_count", vision_config["num_attention_heads"])
        self.gguf_writer.add_float32("clip.vision.attention.layer_norm_epsilon", vision_config.get("layer_norm_eps", 1e-6))

        # default values taken from HF transformers code
        self.gguf_writer.add_array  ("clip.vision.image_mean", [0.5, 0.5, 0.5])
        self.gguf_writer.add_array  ("clip.vision.image_std",  [0.5, 0.5, 0.5])
        self.gguf_writer.add_bool   ("clip.use_gelu", True)

        # load tensors
        for name, data_torch in self.get_tensors(dir_model):
            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)
            self.add_tensor(name, data_torch)

    def get_tensors(self, dir_model: Path) -> Iterator[tuple[str, Tensor]]:
        part_names = Gemma3VisionTower.get_model_part_names(dir_model, "model", ".safetensors")
        tensor_names_from_parts: set[str] = set()
        for part_name in part_names:
            logger.info(f"gguf: loading model part '{part_name}'")
            from safetensors import safe_open
            ctx = cast(ContextManager[Any], safe_open(dir_model / part_name, framework="pt", device="cpu"))
            with ctx as model_part:
                tensor_names_from_parts.update(model_part.keys())

                for name in model_part.keys():
                    data = model_part.get_slice(name)
                    data = LazyTorchTensor.from_safetensors_slice(data)
                    yield name, data

    def add_tensor(self, name: str, data_torch: Tensor):
        is_1d = len(data_torch.shape) == 1
        is_embd = ".embeddings." in name
        old_dtype = data_torch.dtype
        can_quantize = not is_1d and not is_embd
        data_qtype = gguf.GGMLQuantizationType.F32

        # this is to support older checkpoints
        # TODO: remove this when we have the final model
        name = name.replace("vision_model.vision_model.", "vision_tower.vision_model.")
        name = name.replace("multimodal_projector.", "multi_modal_projector.")

        # filter only vision tensors
        if not name.startswith("vision_tower.vision_model.") and not name.startswith("multi_modal_projector."):
            return

        # prefix
        name = name.replace("vision_tower.vision_model.encoder.layers.", "v.blk.")
        name = name.replace("vision_tower.vision_model.", "v.")
        # projector and input embd
        name = name.replace(".embeddings.patch_embedding.", ".patch_embd.")
        name = name.replace(".embeddings.position_embedding.", ".position_embd.")
        name = name.replace(
            "multi_modal_projector.mm_input_projection_weight",
            "mm.input_projection.weight"
        )
        name = name.replace(
            "multi_modal_projector.mm_soft_emb_norm.weight",
            "mm.soft_emb_norm.weight"
        )
        name = name.replace("post_layernorm.", "post_ln.")
        # each block
        name = name.replace(".self_attn.k_proj.",   ".attn_k.")
        name = name.replace(".self_attn.v_proj.",   ".attn_v.")
        name = name.replace(".self_attn.q_proj.",   ".attn_q.")
        name = name.replace(".self_attn.out_proj.", ".attn_out.")
        name = name.replace(".layer_norm1.", ".ln1.")
        name = name.replace(".layer_norm2.", ".ln2.")
        name = name.replace(".mlp.fc1.", ".ffn_down.")
        name = name.replace(".mlp.fc2.", ".ffn_up.")

        if can_quantize:
            if self.ftype == gguf.LlamaFileType.ALL_F32:
                data_qtype = gguf.GGMLQuantizationType.F32
            elif self.ftype == gguf.LlamaFileType.MOSTLY_F16:
                data_qtype = gguf.GGMLQuantizationType.F16
            elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
                data_qtype = gguf.GGMLQuantizationType.BF16
            elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
                data_qtype = gguf.GGMLQuantizationType.Q8_0
            else:
                raise ValueError(f"Unsupported file type: {self.ftype}")

        # correct the norm value; only this "soft_emb_norm" needs to be corrected, as it is part of the Gemma projector
        # the other norm values are part of the SigLIP model and are already correct
        # ref code: Gemma3RMSNorm
        if "soft_emb_norm.weight" in name:
            logger.info(f"Correcting norm value for '{name}'")
            data_torch = data_torch + 1

        data = data_torch.numpy()

        try:
            data = gguf.quants.quantize(data, data_qtype)
        except Exception as e:
            logger.error(f"Error quantizing tensor '{name}': {e}, fallback to F16")
            data_qtype = gguf.GGMLQuantizationType.F16
            data = gguf.quants.quantize(data, data_qtype)

        # reverse shape to make it similar to the internal ggml dimension order
        shape_str = f"{{{', '.join(str(n) for n in reversed(data_torch.shape))}}}"
        logger.info(f"{f'%-32s' % f'{name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")

        self.gguf_writer.add_tensor(name, data, raw_dtype=data_qtype)

    def write(self):
        self.gguf_writer.write_header_to_file(path=self.fname_out)
        self.gguf_writer.write_kv_data_to_file()
        self.gguf_writer.write_tensors_to_file(progress=True)
        self.gguf_writer.close()


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Convert Gemma 3 vision tower safetensors to GGUF format",)
    parser.add_argument(
        "--outfile", type=Path, default="mmproj.gguf",
        help="path to write to",
    )
    parser.add_argument(
        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0"], default="f16",
        help="output format",
    )
    parser.add_argument(
        "--bigendian", action="store_true",
        help="model is executed on big endian machine",
    )
    parser.add_argument(
        "model", type=Path,
        help="directory containing model file",
        nargs="?",
    )
file", nargs="?", ) parser.add_argument( "--verbose", action="store_true", help="increase output verbosity", ) args = parser.parse_args() if args.model is None: parser.error("the following arguments are required: model") return args def main() -> None: args = parse_args() if args.verbose: logging.basicConfig(level=logging.DEBUG) else: logging.basicConfig(level=logging.INFO) dir_model = args.model if not dir_model.is_dir(): logger.error(f'Error: {args.model} is not a directory') sys.exit(1) ftype_map: dict[str, gguf.LlamaFileType] = { "f32": gguf.LlamaFileType.ALL_F32, "f16": gguf.LlamaFileType.MOSTLY_F16, "bf16": gguf.LlamaFileType.MOSTLY_BF16, "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0, } logger.info(f"Loading model: {dir_model.name}") with torch.inference_mode(): gemma3_vision_tower = Gemma3VisionTower( dir_model=dir_model, fname_out=args.outfile, ftype=ftype_map[args.outtype], is_big_endian=args.bigendian, ) gemma3_vision_tower.write() if __name__ == '__main__': main()