#ifndef __CLIP_HPP__ #define __CLIP_HPP__ #include "ggml_extend.hpp" #include "model.h" /*================================================== CLIPTokenizer ===================================================*/ std::pair, std::string> extract_and_remove_lora(std::string text) { std::regex re("]+)>"); std::smatch matches; std::unordered_map filename2multiplier; while (std::regex_search(text, matches, re)) { std::string filename = matches[1].str(); float multiplier = std::stof(matches[2].str()); text = std::regex_replace(text, re, "", std::regex_constants::format_first_only); if (multiplier == 0.f) { continue; } if (filename2multiplier.find(filename) == filename2multiplier.end()) { filename2multiplier[filename] = multiplier; } else { filename2multiplier[filename] += multiplier; } } return std::make_pair(filename2multiplier, text); } std::vector> bytes_to_unicode() { std::vector> byte_unicode_pairs; std::set byte_set; for (int b = static_cast('!'); b <= static_cast('~'); ++b) { byte_set.insert(b); byte_unicode_pairs.push_back(std::pair(b, unicode_value_to_utf32(b))); } for (int b = 161; b <= 172; ++b) { byte_set.insert(b); byte_unicode_pairs.push_back(std::pair(b, unicode_value_to_utf32(b))); } for (int b = 174; b <= 255; ++b) { byte_set.insert(b); byte_unicode_pairs.push_back(std::pair(b, unicode_value_to_utf32(b))); } int n = 0; for (int b = 0; b < 256; ++b) { if (byte_set.find(b) == byte_set.end()) { byte_unicode_pairs.push_back(std::pair(b, unicode_value_to_utf32(n + 256))); ++n; } } // LOG_DEBUG("byte_unicode_pairs %d", byte_unicode_pairs.size()); return byte_unicode_pairs; } // Ref: https://github.com/openai/CLIP/blob/main/clip/simple_tokenizer.py typedef std::function&)> on_new_token_cb_t; class CLIPTokenizer { private: std::map byte_encoder; std::map byte_decoder; std::map encoder; std::map decoder; std::map, int> bpe_ranks; std::regex pat; int encoder_len; int bpe_len; public: const std::string UNK_TOKEN = "<|endoftext|>"; const std::string BOS_TOKEN = "<|startoftext|>"; const std::string EOS_TOKEN = "<|endoftext|>"; const std::string PAD_TOKEN = "<|endoftext|>"; const int UNK_TOKEN_ID = 49407; const int BOS_TOKEN_ID = 49406; const int EOS_TOKEN_ID = 49407; const int PAD_TOKEN_ID = 49407; private: static std::string strip(const std::string& str) { std::string::size_type start = str.find_first_not_of(" \t\n\r\v\f"); std::string::size_type end = str.find_last_not_of(" \t\n\r\v\f"); if (start == std::string::npos) { // String contains only whitespace characters return ""; } return str.substr(start, end - start + 1); } static std::string whitespace_clean(std::string text) { text = std::regex_replace(text, std::regex(R"(\s+)"), " "); text = strip(text); return text; } static std::set> get_pairs(const std::vector& subwords) { std::set> pairs; if (subwords.size() == 0) { return pairs; } std::u32string prev_subword = subwords[0]; for (int i = 1; i < subwords.size(); i++) { std::u32string subword = subwords[i]; std::pair pair(prev_subword, subword); pairs.insert(pair); prev_subword = subword; } return pairs; } public: CLIPTokenizer(int pad_token_id = 49407, const std::string& merges_utf8_str = "") : PAD_TOKEN_ID(pad_token_id) { if (merges_utf8_str.size() > 0) { load_from_merges(merges_utf8_str); } else { load_from_merges(ModelLoader::load_merges()); } } void load_from_merges(const std::string& merges_utf8_str) { auto byte_unicode_pairs = bytes_to_unicode(); // printf("byte_unicode_pairs have %lu pairs \n", byte_unicode_pairs.size()); byte_encoder = std::map(byte_unicode_pairs.begin(), byte_unicode_pairs.end()); for (auto& pair : byte_unicode_pairs) { byte_decoder[pair.second] = pair.first; } // for (auto & pair: byte_unicode_pairs) { // std::cout << pair.first << ": " << pair.second << std::endl; // } std::vector merges; size_t start = 0; size_t pos; std::u32string merges_utf32_str = utf8_to_utf32(merges_utf8_str); while ((pos = merges_utf32_str.find('\n', start)) != std::string::npos) { merges.push_back(merges_utf32_str.substr(start, pos - start)); start = pos + 1; } // LOG_DEBUG("merges size %llu", merges.size()); GGML_ASSERT(merges.size() == 48895); merges = std::vector(merges.begin() + 1, merges.end()); std::vector> merge_pairs; for (const auto& merge : merges) { size_t space_pos = merge.find(' '); merge_pairs.emplace_back(merge.substr(0, space_pos), merge.substr(space_pos + 1)); // LOG_DEBUG("%s", utf32_to_utf8(merge.substr(space_pos + 1)).c_str()); // printf("%s :: %s | %s \n", utf32_to_utf8(merge).c_str(), utf32_to_utf8(merge.substr(0, space_pos)).c_str(), // utf32_to_utf8(merge.substr(space_pos + 1)).c_str()); } std::vector vocab; for (const auto& pair : byte_unicode_pairs) { vocab.push_back(pair.second); } for (const auto& pair : byte_unicode_pairs) { vocab.push_back(pair.second + utf8_to_utf32("")); } for (const auto& merge : merge_pairs) { vocab.push_back(merge.first + merge.second); } vocab.push_back(utf8_to_utf32("<|startoftext|>")); vocab.push_back(utf8_to_utf32("<|endoftext|>")); LOG_DEBUG("vocab size: %llu", vocab.size()); int i = 0; for (const auto& token : vocab) { encoder[token] = i; decoder[i] = token; i++; } encoder_len = i; auto it = encoder.find(utf8_to_utf32("img")); if (it != encoder.end()) { LOG_DEBUG(" trigger word img already in vocab"); } else { LOG_DEBUG(" trigger word img not in vocab yet"); } int rank = 0; for (const auto& merge : merge_pairs) { bpe_ranks[merge] = rank++; } bpe_len = rank; }; void add_token(const std::string& text) { std::u32string token = utf8_to_utf32(text); auto it = encoder.find(token); if (it != encoder.end()) { encoder[token] = encoder_len; decoder[encoder_len] = token; encoder_len++; } } std::u32string bpe(const std::u32string& token) { std::vector word; for (int i = 0; i < token.size() - 1; i++) { word.emplace_back(1, token[i]); } word.push_back(token.substr(token.size() - 1) + utf8_to_utf32("")); std::set> pairs = get_pairs(word); if (pairs.empty()) { return token + utf8_to_utf32(""); } while (true) { auto min_pair_iter = std::min_element(pairs.begin(), pairs.end(), [&](const std::pair& a, const std::pair& b) { if (bpe_ranks.find(a) == bpe_ranks.end()) { return false; } else if (bpe_ranks.find(b) == bpe_ranks.end()) { return true; } return bpe_ranks.at(a) < bpe_ranks.at(b); }); const std::pair& bigram = *min_pair_iter; if (bpe_ranks.find(bigram) == bpe_ranks.end()) { break; } std::u32string first = bigram.first; std::u32string second = bigram.second; std::vector new_word; int32_t i = 0; while (i < word.size()) { auto it = std::find(word.begin() + i, word.end(), first); if (it == word.end()) { new_word.insert(new_word.end(), word.begin() + i, word.end()); break; } new_word.insert(new_word.end(), word.begin() + i, it); i = static_cast(std::distance(word.begin(), it)); if (word[i] == first && i < static_cast(word.size()) - 1 && word[i + 1] == second) { new_word.push_back(first + second); i += 2; } else { new_word.push_back(word[i]); i += 1; } } word = new_word; if (word.size() == 1) { break; } pairs = get_pairs(word); } std::u32string result; for (int i = 0; i < word.size(); i++) { result += word[i]; if (i != word.size() - 1) { result += utf8_to_utf32(" "); } } return result; } std::vector tokenize(std::string text, on_new_token_cb_t on_new_token_cb, size_t max_length = 0, bool padding = false) { std::vector tokens = encode(text, on_new_token_cb); tokens.insert(tokens.begin(), BOS_TOKEN_ID); if (max_length > 0) { if (tokens.size() > max_length - 1) { tokens.resize(max_length - 1); tokens.push_back(EOS_TOKEN_ID); } else { tokens.push_back(EOS_TOKEN_ID); if (padding) { tokens.insert(tokens.end(), max_length - tokens.size(), PAD_TOKEN_ID); } } } return tokens; } void pad_tokens(std::vector& tokens, std::vector& weights, size_t max_length = 0, bool padding = false) { if (max_length > 0 && padding) { size_t n = std::ceil(tokens.size() * 1.0 / (max_length - 2)); if (n == 0) { n = 1; } size_t length = max_length * n; LOG_DEBUG("token length: %llu", length); std::vector new_tokens; std::vector new_weights; new_tokens.push_back(BOS_TOKEN_ID); new_weights.push_back(1.0); int token_idx = 0; for (int i = 1; i < length; i++) { if (token_idx >= tokens.size()) { break; } if (i % max_length == 0) { new_tokens.push_back(BOS_TOKEN_ID); new_weights.push_back(1.0); } else if (i % max_length == max_length - 1) { new_tokens.push_back(EOS_TOKEN_ID); new_weights.push_back(1.0); } else { new_tokens.push_back(tokens[token_idx]); new_weights.push_back(weights[token_idx]); token_idx++; } } new_tokens.push_back(EOS_TOKEN_ID); new_weights.push_back(1.0); tokens = new_tokens; weights = new_weights; if (padding) { tokens.insert(tokens.end(), length - tokens.size(), PAD_TOKEN_ID); weights.insert(weights.end(), length - weights.size(), 1.0); } } } std::string clean_up_tokenization(std::string& text) { std::regex pattern(R"( ,)"); // Replace " ," with "," std::string result = std::regex_replace(text, pattern, ","); return result; } std::string decode(const std::vector& tokens) { std::string text = ""; for (int t : tokens) { if (t == 49406 || t == 49407) continue; std::u32string ts = decoder[t]; // printf("%d, %s \n", t, utf32_to_utf8(ts).c_str()); std::string s = utf32_to_utf8(ts); if (s.length() >= 4) { if (ends_with(s, "")) { text += s.replace(s.length() - 4, s.length() - 1, "") + " "; } else { text += s; } } else { text += " " + s; } } // std::vector bytes; // for (auto c : text){ // bytes.push_back(byte_decoder[c]); // } // std::string s((char *)bytes.data()); // std::string s = ""; text = clean_up_tokenization(text); return trim(text); } std::vector encode(std::string text, on_new_token_cb_t on_new_token_cb) { std::string original_text = text; std::vector bpe_tokens; text = whitespace_clean(text); std::transform(text.begin(), text.end(), text.begin(), [](unsigned char c) { return std::tolower(c); }); std::regex pat(R"(<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[[:alpha:]]+|[[:digit:]]|[^[:space:][:alpha:][:digit:]]+)", std::regex::icase); std::smatch matches; std::string str = text; std::vector token_strs; while (std::regex_search(str, matches, pat)) { bool skip = on_new_token_cb(str, bpe_tokens); if (skip) { continue; } for (auto& token : matches) { std::string token_str = token.str(); std::u32string utf32_token; for (int i = 0; i < token_str.length(); i++) { unsigned char b = token_str[i]; utf32_token += byte_encoder[b]; } auto bpe_strs = bpe(utf32_token); size_t start = 0; size_t pos; while ((pos = bpe_strs.find(' ', start)) != std::u32string::npos) { auto bpe_str = bpe_strs.substr(start, pos - start); bpe_tokens.push_back(encoder[bpe_str]); token_strs.push_back(utf32_to_utf8(bpe_str)); start = pos + 1; } auto bpe_str = bpe_strs.substr(start, bpe_strs.size() - start); bpe_tokens.push_back(encoder[bpe_str]); token_strs.push_back(utf32_to_utf8(bpe_str)); } str = matches.suffix(); } std::stringstream ss; ss << "["; for (auto token : token_strs) { ss << "\"" << token << "\", "; } ss << "]"; // LOG_DEBUG("split prompt \"%s\" to tokens %s", original_text.c_str(), ss.str().c_str()); // printf("split prompt \"%s\" to tokens %s \n", original_text.c_str(), ss.str().c_str()); return bpe_tokens; } }; /*================================================ FrozenCLIPEmbedder ================================================*/ // Ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/clip/modeling_clip.py struct CLIPMLP : public GGMLBlock { protected: bool use_gelu; public: CLIPMLP(int64_t d_model, int64_t intermediate_size) { blocks["fc1"] = std::shared_ptr(new Linear(d_model, intermediate_size)); blocks["fc2"] = std::shared_ptr(new Linear(intermediate_size, d_model)); if (d_model == 1024 || d_model == 1280) { // SD 2.x use_gelu = true; } else { // SD 1.x use_gelu = false; } } struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { // x: [N, n_token, d_model] auto fc1 = std::dynamic_pointer_cast(blocks["fc1"]); auto fc2 = std::dynamic_pointer_cast(blocks["fc2"]); x = fc1->forward(ctx, x); if (use_gelu) { x = ggml_gelu_inplace(ctx, x); } else { x = ggml_gelu_quick_inplace(ctx, x); } x = fc2->forward(ctx, x); return x; } }; struct CLIPLayer : public GGMLBlock { protected: int64_t d_model; // hidden_size/embed_dim int64_t n_head; int64_t intermediate_size; public: CLIPLayer(int64_t d_model, int64_t n_head, int64_t intermediate_size) : d_model(d_model), n_head(n_head), intermediate_size(intermediate_size) { blocks["self_attn"] = std::shared_ptr(new MultiheadAttention(d_model, n_head, true, true)); blocks["layer_norm1"] = std::shared_ptr(new LayerNorm(d_model)); blocks["layer_norm2"] = std::shared_ptr(new LayerNorm(d_model)); blocks["mlp"] = std::shared_ptr(new CLIPMLP(d_model, intermediate_size)); } struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, bool mask = true) { // x: [N, n_token, d_model] auto self_attn = std::dynamic_pointer_cast(blocks["self_attn"]); auto layer_norm1 = std::dynamic_pointer_cast(blocks["layer_norm1"]); auto layer_norm2 = std::dynamic_pointer_cast(blocks["layer_norm2"]); auto mlp = std::dynamic_pointer_cast(blocks["mlp"]); x = ggml_add(ctx, x, self_attn->forward(ctx, layer_norm1->forward(ctx, x), mask)); x = ggml_add(ctx, x, mlp->forward(ctx, layer_norm2->forward(ctx, x))); return x; } }; struct CLIPEncoder : public GGMLBlock { protected: int64_t n_layer; public: CLIPEncoder(int64_t n_layer, int64_t d_model, int64_t n_head, int64_t intermediate_size) : n_layer(n_layer) { for (int i = 0; i < n_layer; i++) { std::string name = "layers." + std::to_string(i); blocks[name] = std::shared_ptr(new CLIPLayer(d_model, n_head, intermediate_size)); } } struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, int clip_skip = -1, bool mask = true) { // x: [N, n_token, d_model] int layer_idx = n_layer - 1; // LOG_DEBUG("clip_skip %d", clip_skip); if (clip_skip > 0) { layer_idx = n_layer - clip_skip; } for (int i = 0; i < n_layer; i++) { // LOG_DEBUG("layer %d", i); if (i == layer_idx + 1) { break; } std::string name = "layers." + std::to_string(i); auto layer = std::dynamic_pointer_cast(blocks[name]); x = layer->forward(ctx, x, mask); // [N, n_token, d_model] // LOG_DEBUG("layer %d", i); } return x; } }; class CLIPEmbeddings : public GGMLBlock { protected: int64_t embed_dim; int64_t vocab_size; int64_t num_positions; void init_params(struct ggml_context* ctx, std::map& tensor_types, const std::string prefix = "") { enum ggml_type token_wtype = (tensor_types.find(prefix + "token_embedding.weight") != tensor_types.end()) ? tensor_types[prefix + "token_embedding.weight"] : GGML_TYPE_F32; enum ggml_type position_wtype = GGML_TYPE_F32; //(tensor_types.find(prefix + "position_embedding.weight") != tensor_types.end()) ? tensor_types[prefix + "position_embedding.weight"] : GGML_TYPE_F32; params["token_embedding.weight"] = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size); params["position_embedding.weight"] = ggml_new_tensor_2d(ctx, position_wtype, embed_dim, num_positions); } public: CLIPEmbeddings(int64_t embed_dim, int64_t vocab_size = 49408, int64_t num_positions = 77) : embed_dim(embed_dim), vocab_size(vocab_size), num_positions(num_positions) { } struct ggml_tensor* get_token_embed_weight() { return params["token_embedding.weight"]; } struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* input_ids, struct ggml_tensor* custom_embed_weight) { // input_ids: [N, n_token] auto token_embed_weight = params["token_embedding.weight"]; auto position_embed_weight = params["position_embedding.weight"]; GGML_ASSERT(input_ids->ne[0] == position_embed_weight->ne[1]); input_ids = ggml_reshape_3d(ctx, input_ids, input_ids->ne[0], 1, input_ids->ne[1]); auto token_embedding = ggml_get_rows(ctx, custom_embed_weight != NULL ? custom_embed_weight : token_embed_weight, input_ids); token_embedding = ggml_reshape_3d(ctx, token_embedding, token_embedding->ne[0], token_embedding->ne[1], token_embedding->ne[3]); // token_embedding + position_embedding auto x = ggml_add(ctx, token_embedding, position_embed_weight); // [N, n_token, embed_dim] return x; } }; class CLIPVisionEmbeddings : public GGMLBlock { protected: int64_t embed_dim; int64_t num_channels; int64_t patch_size; int64_t image_size; int64_t num_patches; int64_t num_positions; void init_params(struct ggml_context* ctx, std::map& tensor_types, const std::string prefix = "") { enum ggml_type patch_wtype = GGML_TYPE_F16; // tensor_types.find(prefix + "patch_embedding.weight") != tensor_types.end() ? tensor_types[prefix + "patch_embedding.weight"] : GGML_TYPE_F16; enum ggml_type class_wtype = GGML_TYPE_F32; // tensor_types.find(prefix + "class_embedding") != tensor_types.end() ? tensor_types[prefix + "class_embedding"] : GGML_TYPE_F32; enum ggml_type position_wtype = GGML_TYPE_F32; // tensor_types.find(prefix + "position_embedding.weight") != tensor_types.end() ? tensor_types[prefix + "position_embedding.weight"] : GGML_TYPE_F32; params["patch_embedding.weight"] = ggml_new_tensor_4d(ctx, patch_wtype, patch_size, patch_size, num_channels, embed_dim); params["class_embedding"] = ggml_new_tensor_1d(ctx, class_wtype, embed_dim); params["position_embedding.weight"] = ggml_new_tensor_2d(ctx, position_wtype, embed_dim, num_positions); } public: CLIPVisionEmbeddings(int64_t embed_dim, int64_t num_channels = 3, int64_t patch_size = 14, int64_t image_size = 224) : embed_dim(embed_dim), num_channels(num_channels), patch_size(patch_size), image_size(image_size) { num_patches = (image_size / patch_size) * (image_size / patch_size); num_positions = num_patches + 1; } struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* pixel_values) { // pixel_values: [N, num_channels, image_size, image_size] // return: [N, num_positions, embed_dim] GGML_ASSERT(pixel_values->ne[0] == image_size && pixel_values->ne[1] == image_size && pixel_values->ne[2] == num_channels); auto patch_embed_weight = params["patch_embedding.weight"]; auto class_embed_weight = params["class_embedding"]; auto position_embed_weight = params["position_embedding.weight"]; // concat(patch_embedding, class_embedding) + position_embedding struct ggml_tensor* patch_embedding; int64_t N = pixel_values->ne[3]; patch_embedding = ggml_nn_conv_2d(ctx, pixel_values, patch_embed_weight, NULL, patch_size, patch_size); // [N, embed_dim, image_size // pacht_size, image_size // pacht_size] patch_embedding = ggml_reshape_3d(ctx, patch_embedding, num_patches, embed_dim, N); // [N, embed_dim, num_patches] patch_embedding = ggml_cont(ctx, ggml_permute(ctx, patch_embedding, 1, 0, 2, 3)); // [N, num_patches, embed_dim] patch_embedding = ggml_reshape_4d(ctx, patch_embedding, 1, embed_dim, num_patches, N); // [N, num_patches, embed_dim, 1] struct ggml_tensor* class_embedding = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, embed_dim, N); class_embedding = ggml_repeat(ctx, class_embed_weight, class_embedding); // [N, embed_dim] class_embedding = ggml_reshape_4d(ctx, class_embedding, 1, embed_dim, 1, N); // [N, 1, embed_dim, 1] struct ggml_tensor* x = ggml_concat(ctx, class_embedding, patch_embedding, 2); // [N, num_positions, embed_dim, 1] x = ggml_reshape_3d(ctx, x, embed_dim, num_positions, N); // [N, num_positions, embed_dim] x = ggml_add(ctx, x, position_embed_weight); return x; // [N, num_positions, embed_dim] } }; // OPENAI_CLIP_VIT_L_14: https://huggingface.co/openai/clip-vit-large-patch14/blob/main/config.json // OPEN_CLIP_VIT_H_14: https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K/blob/main/config.json // OPEN_CLIP_VIT_BIGG_14: https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k/blob/main/config.json (CLIPTextModelWithProjection) enum CLIPVersion { OPENAI_CLIP_VIT_L_14, // SD 1.x and SDXL OPEN_CLIP_VIT_H_14, // SD 2.x OPEN_CLIP_VIT_BIGG_14, // SDXL }; class CLIPTextModel : public GGMLBlock { protected: void init_params(struct ggml_context* ctx, std::map& tensor_types, const std::string prefix = "") { if (version == OPEN_CLIP_VIT_BIGG_14) { enum ggml_type wtype = GGML_TYPE_F32; // tensor_types.find(prefix + "text_projection") != tensor_types.end() ? tensor_types[prefix + "text_projection"] : GGML_TYPE_F32; params["text_projection"] = ggml_new_tensor_2d(ctx, wtype, projection_dim, hidden_size); } } public: CLIPVersion version = OPENAI_CLIP_VIT_L_14; // network hparams int32_t vocab_size = 49408; int32_t n_token = 77; // max_position_embeddings int32_t hidden_size = 768; int32_t intermediate_size = 3072; int32_t n_head = 12; int32_t n_layer = 12; // num_hidden_layers int32_t projection_dim = 1280; // only for OPEN_CLIP_VIT_BIGG_14 int32_t clip_skip = -1; bool with_final_ln = true; CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14, int clip_skip_value = -1, bool with_final_ln = true) : version(version), with_final_ln(with_final_ln) { if (version == OPEN_CLIP_VIT_H_14) { hidden_size = 1024; intermediate_size = 4096; n_head = 16; n_layer = 24; } else if (version == OPEN_CLIP_VIT_BIGG_14) { // CLIPTextModelWithProjection hidden_size = 1280; intermediate_size = 5120; n_head = 20; n_layer = 32; } set_clip_skip(clip_skip_value); blocks["embeddings"] = std::shared_ptr(new CLIPEmbeddings(hidden_size, vocab_size, n_token)); blocks["encoder"] = std::shared_ptr(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size)); blocks["final_layer_norm"] = std::shared_ptr(new LayerNorm(hidden_size)); } void set_clip_skip(int skip) { if (skip <= 0) { return; } clip_skip = skip; } struct ggml_tensor* get_token_embed_weight() { auto embeddings = std::dynamic_pointer_cast(blocks["embeddings"]); return embeddings->get_token_embed_weight(); } struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* input_ids, struct ggml_tensor* tkn_embeddings, size_t max_token_idx = 0, bool return_pooled = false) { // input_ids: [N, n_token] auto embeddings = std::dynamic_pointer_cast(blocks["embeddings"]); auto encoder = std::dynamic_pointer_cast(blocks["encoder"]); auto final_layer_norm = std::dynamic_pointer_cast(blocks["final_layer_norm"]); auto x = embeddings->forward(ctx, input_ids, tkn_embeddings); // [N, n_token, hidden_size] x = encoder->forward(ctx, x, return_pooled ? -1 : clip_skip, true); if (return_pooled || with_final_ln) { x = final_layer_norm->forward(ctx, x); } if (return_pooled) { auto text_projection = params["text_projection"]; ggml_tensor* pooled = ggml_view_1d(ctx, x, hidden_size, x->nb[1] * max_token_idx); if (text_projection != NULL) { pooled = ggml_nn_linear(ctx, pooled, text_projection, NULL); } else { LOG_DEBUG("Missing text_projection matrix, assuming identity..."); } return pooled; // [hidden_size, 1, 1] } return x; // [N, n_token, hidden_size] } }; class CLIPVisionModel : public GGMLBlock { public: // network hparams int32_t num_channels = 3; int32_t patch_size = 14; int32_t image_size = 224; int32_t num_positions = 257; // (image_size / patch_size)^2 + 1 int32_t hidden_size = 1024; int32_t intermediate_size = 4096; int32_t n_head = 16; int32_t n_layer = 24; public: CLIPVisionModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14) { if (version == OPEN_CLIP_VIT_H_14) { hidden_size = 1280; intermediate_size = 5120; n_head = 16; n_layer = 32; } else if (version == OPEN_CLIP_VIT_BIGG_14) { hidden_size = 1664; intermediate_size = 8192; n_head = 16; n_layer = 48; } blocks["embeddings"] = std::shared_ptr(new CLIPVisionEmbeddings(hidden_size, num_channels, patch_size, image_size)); blocks["pre_layernorm"] = std::shared_ptr(new LayerNorm(hidden_size)); blocks["encoder"] = std::shared_ptr(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size)); blocks["post_layernorm"] = std::shared_ptr(new LayerNorm(hidden_size)); } struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* pixel_values, bool return_pooled = true) { // pixel_values: [N, num_channels, image_size, image_size] auto embeddings = std::dynamic_pointer_cast(blocks["embeddings"]); auto pre_layernorm = std::dynamic_pointer_cast(blocks["pre_layernorm"]); auto encoder = std::dynamic_pointer_cast(blocks["encoder"]); auto post_layernorm = std::dynamic_pointer_cast(blocks["post_layernorm"]); auto x = embeddings->forward(ctx, pixel_values); // [N, num_positions, embed_dim] x = pre_layernorm->forward(ctx, x); x = encoder->forward(ctx, x, -1, false); // print_ggml_tensor(x, true, "ClipVisionModel x: "); auto last_hidden_state = x; x = post_layernorm->forward(ctx, x); // [N, n_token, hidden_size] GGML_ASSERT(x->ne[3] == 1); if (return_pooled) { ggml_tensor* pooled = ggml_cont(ctx, ggml_view_2d(ctx, x, x->ne[0], x->ne[2], x->nb[2], 0)); return pooled; // [N, hidden_size] } else { // return x; // [N, n_token, hidden_size] return last_hidden_state; // [N, n_token, hidden_size] } } }; class CLIPProjection : public UnaryBlock { protected: int64_t in_features; int64_t out_features; bool transpose_weight; void init_params(struct ggml_context* ctx, std::map& tensor_types, const std::string prefix = "") { enum ggml_type wtype = tensor_types.find(prefix + "weight") != tensor_types.end() ? tensor_types[prefix + "weight"] : GGML_TYPE_F32; if (transpose_weight) { params["weight"] = ggml_new_tensor_2d(ctx, wtype, out_features, in_features); } else { params["weight"] = ggml_new_tensor_2d(ctx, wtype, in_features, out_features); } } public: CLIPProjection(int64_t in_features, int64_t out_features, bool transpose_weight = false) : in_features(in_features), out_features(out_features), transpose_weight(transpose_weight) {} struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { struct ggml_tensor* w = params["weight"]; if (transpose_weight) { w = ggml_cont(ctx, ggml_transpose(ctx, w)); } return ggml_nn_linear(ctx, x, w, NULL); } }; class CLIPVisionModelProjection : public GGMLBlock { public: int32_t hidden_size = 1024; int32_t projection_dim = 768; int32_t image_size = 224; public: CLIPVisionModelProjection(CLIPVersion version = OPENAI_CLIP_VIT_L_14, bool transpose_proj_w = false) { if (version == OPEN_CLIP_VIT_H_14) { hidden_size = 1280; projection_dim = 1024; } else if (version == OPEN_CLIP_VIT_BIGG_14) { hidden_size = 1664; } blocks["vision_model"] = std::shared_ptr(new CLIPVisionModel(version)); blocks["visual_projection"] = std::shared_ptr(new CLIPProjection(hidden_size, projection_dim, transpose_proj_w)); } struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* pixel_values) { // pixel_values: [N, num_channels, image_size, image_size] // return: [N, projection_dim] auto vision_model = std::dynamic_pointer_cast(blocks["vision_model"]); auto visual_projection = std::dynamic_pointer_cast(blocks["visual_projection"]); auto x = vision_model->forward(ctx, pixel_values); // [N, hidden_size] x = visual_projection->forward(ctx, x); // [N, projection_dim] return x; // [N, projection_dim] } }; struct CLIPTextModelRunner : public GGMLRunner { CLIPTextModel model; CLIPTextModelRunner(ggml_backend_t backend, std::map& tensor_types, const std::string prefix, CLIPVersion version = OPENAI_CLIP_VIT_L_14, int clip_skip_value = 1, bool with_final_ln = true) : GGMLRunner(backend), model(version, clip_skip_value, with_final_ln) { model.init(params_ctx, tensor_types, prefix); } std::string get_desc() { return "clip"; } void set_clip_skip(int clip_skip) { model.set_clip_skip(clip_skip); } void get_param_tensors(std::map& tensors, const std::string prefix) { model.get_param_tensors(tensors, prefix); } struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* input_ids, struct ggml_tensor* embeddings, size_t max_token_idx = 0, bool return_pooled = false) { size_t N = input_ids->ne[1]; size_t n_token = input_ids->ne[0]; if (input_ids->ne[0] > model.n_token) { GGML_ASSERT(input_ids->ne[0] % model.n_token == 0); input_ids = ggml_reshape_2d(ctx, input_ids, model.n_token, input_ids->ne[0] / model.n_token); } return model.forward(ctx, input_ids, embeddings, max_token_idx, return_pooled); } struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids, int num_custom_embeddings = 0, void* custom_embeddings_data = NULL, size_t max_token_idx = 0, bool return_pooled = false) { struct ggml_cgraph* gf = ggml_new_graph(compute_ctx); input_ids = to_backend(input_ids); struct ggml_tensor* embeddings = NULL; if (num_custom_embeddings > 0 && custom_embeddings_data != NULL) { auto token_embed_weight = model.get_token_embed_weight(); auto custom_embeddings = ggml_new_tensor_2d(compute_ctx, token_embed_weight->type, model.hidden_size, num_custom_embeddings); set_backend_tensor_data(custom_embeddings, custom_embeddings_data); // concatenate custom embeddings embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 1); } struct ggml_tensor* hidden_states = forward(compute_ctx, input_ids, embeddings, max_token_idx, return_pooled); ggml_build_forward_expand(gf, hidden_states); return gf; } void compute(const int n_threads, struct ggml_tensor* input_ids, int num_custom_embeddings, void* custom_embeddings_data, size_t max_token_idx, bool return_pooled, ggml_tensor** output, ggml_context* output_ctx = NULL) { auto get_graph = [&]() -> struct ggml_cgraph* { return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled); }; GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx); } }; #endif // __CLIP_HPP__