|
#include "ggml_extend.hpp" |
|
|
|
#include "model.h" |
|
#include "rng.hpp" |
|
#include "rng_philox.hpp" |
|
#include "stable-diffusion.h" |
|
#include "util.h" |
|
|
|
#include "conditioner.hpp" |
|
#include "control.hpp" |
|
#include "denoiser.hpp" |
|
#include "diffusion_model.hpp" |
|
#include "esrgan.hpp" |
|
#include "lora.hpp" |
|
#include "pmid.hpp" |
|
#include "tae.hpp" |
|
#include "vae.hpp" |
|
|
|
#include "stb_image.h" |
|
|
|
#include <cinttypes>
|
static std::string pending_apply_lora_fname = ""; |
|
static float pending_apply_lora_power = 1.0f; |
|
|
|
const char* model_version_to_str[] = { |
|
"SD 1.x", |
|
"SD 2.x", |
|
"SDXL", |
|
"SVD", |
|
"SD3.x", |
|
"Flux"}; |
|
|
|
const char* sampling_methods_str[] = { |
|
"Euler A", |
|
"Euler", |
|
"Heun", |
|
"DPM2", |
|
"DPM++ (2s)", |
|
"DPM++ (2M)", |
|
"modified DPM++ (2M)", |
|
"iPNDM", |
|
"iPNDM_v", |
|
"LCM", |
|
}; |
|
|
|
|
|
|
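// Precompute the DDPM "scaled_linear" schedule: betas are interpolated linearly in sqrt space
// between linear_start and linear_end, and alphas_cumprod[t] is the running product of (1 - beta_t).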
|
void calculate_alphas_cumprod(float* alphas_cumprod, |
|
float linear_start = 0.00085f, |
|
                              float linear_end   = 0.0120f,
|
int timesteps = TIMESTEPS) { |
|
float ls_sqrt = sqrtf(linear_start); |
|
float le_sqrt = sqrtf(linear_end); |
|
float amount = le_sqrt - ls_sqrt; |
|
float product = 1.0f; |
|
for (int i = 0; i < timesteps; i++) { |
|
float beta = ls_sqrt + amount * ((float)i / (timesteps - 1)); |
|
product *= 1.0f - powf(beta, 2.0f); |
|
alphas_cumprod[i] = product; |
|
} |
|
} |
|
|
|
|
|
|
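// Core pipeline state: compute backends, per-component weight types, and the sub-models
// (conditioner, diffusion model, VAE/TAESD, ControlNet, PhotoMaker) plus LoRA bookkeeping.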
|
class StableDiffusionGGML { |
|
public: |
|
ggml_backend_t backend = NULL; |
|
ggml_backend_t clip_backend = NULL; |
|
ggml_backend_t control_net_backend = NULL; |
|
ggml_backend_t vae_backend = NULL; |
|
ggml_type model_wtype = GGML_TYPE_COUNT; |
|
ggml_type conditioner_wtype = GGML_TYPE_COUNT; |
|
ggml_type diffusion_model_wtype = GGML_TYPE_COUNT; |
|
ggml_type vae_wtype = GGML_TYPE_COUNT; |
|
|
|
SDVersion version; |
|
bool vae_decode_only = false; |
|
bool free_params_immediately = false; |
|
|
|
std::shared_ptr<RNG> rng = std::make_shared<STDDefaultRNG>(); |
|
int n_threads = -1; |
|
float scale_factor = 0.18215f; |
|
|
|
std::shared_ptr<Conditioner> cond_stage_model; |
|
std::shared_ptr<FrozenCLIPVisionEmbedder> clip_vision; |
|
std::shared_ptr<DiffusionModel> diffusion_model; |
|
std::shared_ptr<AutoEncoderKL> first_stage_model; |
|
std::shared_ptr<TinyAutoEncoder> tae_first_stage; |
|
std::shared_ptr<ControlNet> control_net; |
|
std::shared_ptr<PhotoMakerIDEncoder> pmid_model; |
|
std::shared_ptr<LoraModel> pmid_lora; |
|
std::shared_ptr<PhotoMakerIDEmbed> pmid_id_embeds; |
|
|
|
std::string taesd_path; |
|
bool use_tiny_autoencoder = false; |
|
bool vae_tiling = false; |
|
bool stacked_id = false; |
|
|
|
std::map<std::string, struct ggml_tensor*> tensors; |
|
|
|
std::string lora_model_dir; |
|
|
|
std::unordered_map<std::string, float> curr_lora_state; |
|
|
|
std::shared_ptr<Denoiser> denoiser = std::make_shared<CompVisDenoiser>(); |
|
|
|
StableDiffusionGGML() = default; |
|
|
|
StableDiffusionGGML(int n_threads, |
|
bool vae_decode_only, |
|
bool free_params_immediately, |
|
std::string lora_model_dir, |
|
rng_type_t rng_type) |
|
: n_threads(n_threads), |
|
vae_decode_only(vae_decode_only), |
|
free_params_immediately(free_params_immediately), |
|
lora_model_dir(lora_model_dir) { |
|
if (rng_type == STD_DEFAULT_RNG) { |
|
rng = std::make_shared<STDDefaultRNG>(); |
|
} else if (rng_type == CUDA_RNG) { |
|
rng = std::make_shared<PhiloxRNG>(); |
|
} |
|
} |
|
|
|
~StableDiffusionGGML() { |
|
if (clip_backend != backend) { |
|
ggml_backend_free(clip_backend); |
|
} |
|
if (control_net_backend != backend) { |
|
ggml_backend_free(control_net_backend); |
|
} |
|
if (vae_backend != backend) { |
|
ggml_backend_free(vae_backend); |
|
} |
|
ggml_backend_free(backend); |
|
} |
|
|
|
bool load_from_file(const std::string& model_path, |
|
const std::string& clip_l_path, |
|
const std::string& clip_g_path, |
|
const std::string& t5xxl_path, |
|
const std::string& diffusion_model_path, |
|
const std::string& vae_path, |
|
                        const std::string& control_net_path,
                        const std::string& embeddings_path,
                        const std::string& id_embeddings_path,
|
const std::string& taesd_path, |
|
bool vae_tiling_, |
|
ggml_type wtype, |
|
schedule_t schedule, |
|
bool clip_on_cpu, |
|
bool control_net_cpu, |
|
bool vae_on_cpu, |
|
bool diffusion_flash_attn) { |
|
use_tiny_autoencoder = taesd_path.size() > 0; |
|
std::string taesd_path_fixed = taesd_path; |
|
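        // Pick a compute backend based on compile-time options; fall back to CPU if no GPU backend initialized.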
#ifdef SD_USE_CUBLAS |
|
LOG_DEBUG("Using CUDA backend"); |
|
backend = ggml_backend_cuda_init(0); |
|
#endif |
|
#ifdef SD_USE_METAL |
|
LOG_DEBUG("Using Metal backend"); |
|
backend = ggml_backend_metal_init(); |
|
#endif |
|
#ifdef SD_USE_VULKAN |
|
LOG_DEBUG("Using Vulkan backend"); |
|
for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) { |
|
backend = ggml_backend_vk_init(device); |
|
} |
|
if (!backend) { |
|
LOG_WARN("Failed to initialize Vulkan backend"); |
|
} |
|
#endif |
|
#ifdef SD_USE_SYCL |
|
LOG_DEBUG("Using SYCL backend"); |
|
backend = ggml_backend_sycl_init(0); |
|
#endif |
|
|
|
if (!backend) { |
|
LOG_DEBUG("Using CPU backend"); |
|
backend = ggml_backend_cpu_init(); |
|
} |
|
|
|
ModelLoader model_loader; |
|
|
|
vae_tiling = vae_tiling_; |
|
|
|
if (model_path.size() > 0) { |
|
LOG_INFO("loading model from '%s'", model_path.c_str()); |
|
if (!model_loader.init_from_file(model_path)) { |
|
LOG_ERROR("init model loader from file failed: '%s'", model_path.c_str()); |
|
} |
|
} |
|
|
|
if (clip_l_path.size() > 0) { |
|
LOG_INFO("loading clip_l from '%s'", clip_l_path.c_str()); |
|
if (!model_loader.init_from_file(clip_l_path, "text_encoders.clip_l.transformer.")) { |
|
LOG_WARN("loading clip_l from '%s' failed", clip_l_path.c_str()); |
|
} |
|
} |
|
|
|
if (clip_g_path.size() > 0) { |
|
LOG_INFO("loading clip_g from '%s'", clip_g_path.c_str()); |
|
if (!model_loader.init_from_file(clip_g_path, "text_encoders.clip_g.transformer.")) { |
|
LOG_WARN("loading clip_g from '%s' failed", clip_g_path.c_str()); |
|
} |
|
} |
|
|
|
if (t5xxl_path.size() > 0) { |
|
LOG_INFO("loading t5xxl from '%s'", t5xxl_path.c_str()); |
|
if (!model_loader.init_from_file(t5xxl_path, "text_encoders.t5xxl.transformer.")) { |
|
LOG_WARN("loading t5xxl from '%s' failed", t5xxl_path.c_str()); |
|
} |
|
} |
|
|
|
if (diffusion_model_path.size() > 0) { |
|
LOG_INFO("loading diffusion model from '%s'", diffusion_model_path.c_str()); |
|
if (!model_loader.init_from_file(diffusion_model_path, "model.diffusion_model.")) { |
|
LOG_WARN("loading diffusion model from '%s' failed", diffusion_model_path.c_str()); |
|
} |
|
} |
|
|
|
if (vae_path.size() > 0) { |
|
LOG_INFO("loading vae from '%s'", vae_path.c_str()); |
|
if (!model_loader.init_from_file(vae_path, "vae.")) { |
|
LOG_WARN("loading vae from '%s' failed", vae_path.c_str()); |
|
} |
|
} |
|
|
|
version = model_loader.get_sd_version(); |
|
|
|
if (version == VERSION_COUNT && model_path.size() > 0 && clip_l_path.size() > 0 && diffusion_model_path.size() == 0 && t5xxl_path.size() > 0) { |
|
            bool ends_with_safetensors = model_path.size() >= 12 &&
                                         model_path.compare(model_path.size() - 12, 12, ".safetensors") == 0;
            if (ends_with_safetensors && !model_loader.has_diffusion_model_tensors()) {
                LOG_INFO("SD diffusion model tensors missing! Falling back to alternative tensor names...");
|
if (!model_loader.init_from_file(model_path, "model.diffusion_model.")) { |
|
LOG_WARN("loading diffusion model from '%s' failed", model_path.c_str()); |
|
} |
|
version = model_loader.get_sd_version(); |
|
} |
|
} |
|
|
|
if (version == VERSION_COUNT) { |
|
LOG_ERROR("Error: get SD version from file failed: '%s'", model_path.c_str()); |
|
return false; |
|
} |
|
|
|
LOG_INFO("Version: %s ", model_version_to_str[version]); |
|
|
|
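        // When a tiny autoencoder is requested, swap the TAESD weights file for the variant
        // that matches the detected model family (SDXL / Flux / SD3).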
        if (use_tiny_autoencoder) {
            std::string to_search  = "taesd.embd";
            std::string to_replace = "";
            if (version == VERSION_SDXL) {
                to_replace = "taesd_xl.embd";
            } else if (version == VERSION_FLUX) {
                to_replace = "taesd_f.embd";
            } else if (version == VERSION_SD3) {
                to_replace = "taesd_3.embd";
            }

            if (to_replace != "") {
                size_t pos = taesd_path_fixed.find(to_search);
                if (pos != std::string::npos) {
                    taesd_path_fixed.replace(pos, to_search.length(), to_replace);
                }
            }
        }
|
|
|
if (wtype == GGML_TYPE_COUNT) { |
|
model_wtype = model_loader.get_sd_wtype(); |
|
if (model_wtype == GGML_TYPE_COUNT) { |
|
model_wtype = GGML_TYPE_F32; |
|
LOG_WARN("can not get mode wtype frome weight, use f32"); |
|
} |
|
conditioner_wtype = model_loader.get_conditioner_wtype(); |
|
if (conditioner_wtype == GGML_TYPE_COUNT) { |
|
conditioner_wtype = wtype; |
|
} |
|
diffusion_model_wtype = model_loader.get_diffusion_model_wtype(); |
|
if (diffusion_model_wtype == GGML_TYPE_COUNT) { |
|
diffusion_model_wtype = wtype; |
|
} |
|
vae_wtype = model_loader.get_vae_wtype(); |
|
|
|
if (vae_wtype == GGML_TYPE_COUNT) { |
|
vae_wtype = wtype; |
|
} |
|
} else { |
|
model_wtype = wtype; |
|
conditioner_wtype = wtype; |
|
diffusion_model_wtype = wtype; |
|
vae_wtype = wtype; |
|
model_loader.set_wtype_override(wtype); |
|
} |
|
|
|
if (version == VERSION_SDXL) { |
|
vae_wtype = GGML_TYPE_F32; |
|
model_loader.set_wtype_override(GGML_TYPE_F32, "vae."); |
|
} |
|
|
|
LOG_INFO("Weight type: %s", model_wtype != SD_TYPE_COUNT ? ggml_type_name(model_wtype) : "??"); |
|
LOG_INFO("Conditioner weight type: %s", conditioner_wtype != SD_TYPE_COUNT ? ggml_type_name(conditioner_wtype) : "??"); |
|
LOG_INFO("Diffusion model weight type: %s", diffusion_model_wtype != SD_TYPE_COUNT ? ggml_type_name(diffusion_model_wtype) : "??"); |
|
LOG_INFO("VAE weight type: %s", vae_wtype != SD_TYPE_COUNT ? ggml_type_name(vae_wtype) : "??"); |
|
|
|
LOG_DEBUG("ggml tensor size = %d bytes", (int)sizeof(ggml_tensor)); |
|
|
|
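        // Latent scale factor differs per model family: 0.18215 (SD 1.x/2.x, the default),
        // 0.13025 (SDXL), 1.5305 (SD3), 0.3611 (Flux).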
if (version == VERSION_SDXL) { |
|
scale_factor = 0.13025f; |
|
if (vae_path.size() == 0 && taesd_path_fixed.size() == 0) { |
|
LOG_WARN( |
|
"!!!It looks like you are using SDXL model. " |
|
"If you find that the generated images are completely black, " |
|
"try specifying SDXL VAE FP16 Fix with the --vae parameter. " |
|
"You can find it here: https://huggingface.co/madebyollin/sdxl-vae-fp16-fix/blob/main/sdxl_vae.safetensors"); |
|
} |
|
} else if (sd_version_is_sd3(version)) { |
|
scale_factor = 1.5305f; |
|
} else if (sd_version_is_flux(version)) { |
|
            scale_factor = 0.3611f;
|
|
|
} |
|
|
|
if (version == VERSION_SVD) { |
|
clip_vision = std::make_shared<FrozenCLIPVisionEmbedder>(backend, model_loader.tensor_storages_types); |
|
clip_vision->alloc_params_buffer(); |
|
clip_vision->get_param_tensors(tensors); |
|
|
|
diffusion_model = std::make_shared<UNetModel>(backend, model_loader.tensor_storages_types, version); |
|
diffusion_model->alloc_params_buffer(); |
|
diffusion_model->get_param_tensors(tensors); |
|
|
|
first_stage_model = std::make_shared<AutoEncoderKL>(backend, model_loader.tensor_storages_types, "first_stage_model", vae_decode_only, true, version); |
|
LOG_DEBUG("vae_decode_only %d", vae_decode_only); |
|
first_stage_model->alloc_params_buffer(); |
|
first_stage_model->get_param_tensors(tensors, "first_stage_model"); |
|
} else { |
|
clip_backend = backend; |
|
bool use_t5xxl = false; |
|
if (sd_version_is_dit(version)) { |
|
use_t5xxl = true; |
|
} |
|
if (!ggml_backend_is_cpu(backend) && use_t5xxl && conditioner_wtype != GGML_TYPE_F32) { |
|
clip_on_cpu = true; |
|
LOG_INFO("set clip_on_cpu to true"); |
|
} |
|
if (clip_on_cpu && !ggml_backend_is_cpu(backend)) { |
|
LOG_INFO("CLIP: Using CPU backend"); |
|
clip_backend = ggml_backend_cpu_init(); |
|
} |
|
if (diffusion_flash_attn) { |
|
LOG_INFO("Using flash attention in the diffusion model"); |
|
} |
|
if (sd_version_is_sd3(version)) { |
|
if (diffusion_flash_attn) { |
|
LOG_WARN("flash attention in this diffusion model is currently unsupported!"); |
|
} |
|
cond_stage_model = std::make_shared<SD3CLIPEmbedder>(clip_backend, model_loader.tensor_storages_types); |
|
diffusion_model = std::make_shared<MMDiTModel>(backend, model_loader.tensor_storages_types); |
|
} else if (sd_version_is_flux(version)) { |
|
cond_stage_model = std::make_shared<FluxCLIPEmbedder>(clip_backend, model_loader.tensor_storages_types); |
|
diffusion_model = std::make_shared<FluxModel>(backend, model_loader.tensor_storages_types, diffusion_flash_attn); |
|
} else { |
|
if (id_embeddings_path.find("v2") != std::string::npos) { |
|
cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend, model_loader.tensor_storages_types, embeddings_path, version, PM_VERSION_2); |
|
} else { |
|
cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend, model_loader.tensor_storages_types, embeddings_path, version); |
|
} |
|
diffusion_model = std::make_shared<UNetModel>(backend, model_loader.tensor_storages_types, version, diffusion_flash_attn); |
|
} |
|
|
|
cond_stage_model->alloc_params_buffer(); |
|
cond_stage_model->get_param_tensors(tensors); |
|
|
|
diffusion_model->alloc_params_buffer(); |
|
diffusion_model->get_param_tensors(tensors); |
|
|
|
if (!use_tiny_autoencoder) { |
|
if (vae_on_cpu && !ggml_backend_is_cpu(backend)) { |
|
LOG_INFO("VAE Autoencoder: Using CPU backend"); |
|
vae_backend = ggml_backend_cpu_init(); |
|
} else { |
|
vae_backend = backend; |
|
} |
|
first_stage_model = std::make_shared<AutoEncoderKL>(vae_backend, model_loader.tensor_storages_types, "first_stage_model", vae_decode_only, false, version); |
|
first_stage_model->alloc_params_buffer(); |
|
first_stage_model->get_param_tensors(tensors, "first_stage_model"); |
|
} else { |
|
tae_first_stage = std::make_shared<TinyAutoEncoder>(backend, model_loader.tensor_storages_types, "decoder.layers", vae_decode_only, version); |
|
} |
|
|
|
|
|
if (control_net_path.size() > 0) { |
|
ggml_backend_t controlnet_backend = NULL; |
|
if (control_net_cpu && !ggml_backend_is_cpu(backend)) { |
|
LOG_DEBUG("ControlNet: Using CPU backend"); |
|
controlnet_backend = ggml_backend_cpu_init(); |
|
} else { |
|
controlnet_backend = backend; |
|
} |
|
control_net = std::make_shared<ControlNet>(controlnet_backend, model_loader.tensor_storages_types, version); |
|
} |
|
|
|
if (id_embeddings_path.find("v2") != std::string::npos) { |
|
pmid_model = std::make_shared<PhotoMakerIDEncoder>(backend, model_loader.tensor_storages_types, "pmid", version, PM_VERSION_2); |
|
LOG_INFO("using PhotoMaker Version 2"); |
|
} else { |
|
pmid_model = std::make_shared<PhotoMakerIDEncoder>(backend, model_loader.tensor_storages_types, "pmid", version); |
|
} |
|
if (id_embeddings_path.size() > 0) { |
|
pmid_lora = std::make_shared<LoraModel>(backend, id_embeddings_path, ""); |
|
if (!pmid_lora->load_from_file(true)) { |
|
LOG_WARN("load photomaker lora tensors from %s failed", id_embeddings_path.c_str()); |
|
return false; |
|
} |
|
LOG_INFO("loading stacked ID embedding (PHOTOMAKER) model file from '%s'", id_embeddings_path.c_str()); |
|
if (!model_loader.init_from_file(id_embeddings_path, "pmid.")) { |
|
LOG_WARN("loading stacked ID embedding from '%s' failed", id_embeddings_path.c_str()); |
|
} else { |
|
stacked_id = true; |
|
} |
|
} |
|
if (stacked_id) { |
|
if (!pmid_model->alloc_params_buffer()) { |
|
LOG_ERROR(" pmid model params buffer allocation failed"); |
|
return false; |
|
} |
|
pmid_model->get_param_tensors(tensors, "pmid"); |
|
} |
|
} |
|
|
|
struct ggml_init_params params; |
|
params.mem_size = static_cast<size_t>(10 * 1024) * 1024; |
|
params.mem_buffer = NULL; |
|
params.no_alloc = false; |
|
|
|
struct ggml_context* ctx = ggml_init(params); |
|
GGML_ASSERT(ctx != NULL); |
|
ggml_tensor* alphas_cumprod_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, TIMESTEPS); |
|
calculate_alphas_cumprod((float*)alphas_cumprod_tensor->data); |
|
|
|
|
|
LOG_DEBUG("loading weights"); |
|
|
|
int64_t t0 = ggml_time_ms(); |
|
|
|
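        // Skip tensors that will never be used: the full VAE when TAESD replaces it, LoRA tensors handled
        // separately for PhotoMaker, and the VAE encoder/quant layers in decode-only mode.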
std::set<std::string> ignore_tensors; |
|
tensors["alphas_cumprod"] = alphas_cumprod_tensor; |
|
if (use_tiny_autoencoder) { |
|
ignore_tensors.insert("first_stage_model."); |
|
} |
|
if (stacked_id) { |
|
ignore_tensors.insert("lora."); |
|
} |
|
|
|
if (vae_decode_only) { |
|
ignore_tensors.insert("first_stage_model.encoder"); |
|
ignore_tensors.insert("first_stage_model.quant"); |
|
} |
|
if (version == VERSION_SVD) { |
|
ignore_tensors.insert("conditioner.embedders.3"); |
|
} |
|
bool success = model_loader.load_tensors(tensors, backend, ignore_tensors); |
|
if (!success) { |
|
LOG_ERROR("load tensors from model loader failed"); |
|
ggml_free(ctx); |
|
return false; |
|
} |
|
|
|
|
|
|
|
if (version == VERSION_SVD) { |
|
|
|
|
|
|
|
} else { |
|
size_t clip_params_mem_size = cond_stage_model->get_params_buffer_size(); |
|
size_t unet_params_mem_size = diffusion_model->get_params_buffer_size(); |
|
size_t vae_params_mem_size = 0; |
|
if (!use_tiny_autoencoder) { |
|
vae_params_mem_size = first_stage_model->get_params_buffer_size(); |
|
} else { |
|
if (!tae_first_stage->load_from_file(taesd_path_fixed)) { |
|
return false; |
|
} |
|
vae_params_mem_size = tae_first_stage->get_params_buffer_size(); |
|
} |
|
size_t control_net_params_mem_size = 0; |
|
if (control_net) { |
|
if (!control_net->load_from_file(control_net_path)) { |
|
return false; |
|
} |
|
control_net_params_mem_size = control_net->get_params_buffer_size(); |
|
} |
|
size_t pmid_params_mem_size = 0; |
|
if (stacked_id) { |
|
pmid_params_mem_size = pmid_model->get_params_buffer_size(); |
|
} |
|
|
|
size_t total_params_ram_size = 0; |
|
size_t total_params_vram_size = 0; |
|
if (ggml_backend_is_cpu(clip_backend)) { |
|
total_params_ram_size += clip_params_mem_size + pmid_params_mem_size; |
|
} else { |
|
total_params_vram_size += clip_params_mem_size + pmid_params_mem_size; |
|
} |
|
|
|
if (ggml_backend_is_cpu(backend)) { |
|
total_params_ram_size += unet_params_mem_size; |
|
} else { |
|
total_params_vram_size += unet_params_mem_size; |
|
} |
|
|
|
if (ggml_backend_is_cpu(vae_backend)) { |
|
total_params_ram_size += vae_params_mem_size; |
|
} else { |
|
total_params_vram_size += vae_params_mem_size; |
|
} |
|
|
|
if (ggml_backend_is_cpu(control_net_backend)) { |
|
total_params_ram_size += control_net_params_mem_size; |
|
} else { |
|
total_params_vram_size += control_net_params_mem_size; |
|
} |
|
|
|
size_t total_params_size = total_params_ram_size + total_params_vram_size; |
|
LOG_INFO( |
|
"total params memory size = %.2fMB (VRAM %.2fMB, RAM %.2fMB): " |
|
"clip %.2fMB(%s), unet %.2fMB(%s), vae %.2fMB(%s), controlnet %.2fMB(%s), pmid %.2fMB(%s)", |
|
total_params_size / 1024.0 / 1024.0, |
|
total_params_vram_size / 1024.0 / 1024.0, |
|
total_params_ram_size / 1024.0 / 1024.0, |
|
clip_params_mem_size / 1024.0 / 1024.0, |
|
ggml_backend_is_cpu(clip_backend) ? "RAM" : "VRAM", |
|
unet_params_mem_size / 1024.0 / 1024.0, |
|
ggml_backend_is_cpu(backend) ? "RAM" : "VRAM", |
|
vae_params_mem_size / 1024.0 / 1024.0, |
|
ggml_backend_is_cpu(vae_backend) ? "RAM" : "VRAM", |
|
control_net_params_mem_size / 1024.0 / 1024.0, |
|
ggml_backend_is_cpu(control_net_backend) ? "RAM" : "VRAM", |
|
pmid_params_mem_size / 1024.0 / 1024.0, |
|
ggml_backend_is_cpu(clip_backend) ? "RAM" : "VRAM"); |
|
} |
|
|
|
int64_t t1 = ggml_time_ms(); |
|
LOG_INFO("loading model from '%s' completed, taking %.2fs", model_path.c_str(), (t1 - t0) * 1.0f / 1000); |
|
|
|
|
|
bool is_using_v_parameterization = false; |
|
if (version == VERSION_SD2) { |
|
if (is_using_v_parameterization_for_sd2(ctx)) { |
|
is_using_v_parameterization = true; |
|
} |
|
} else if (version == VERSION_SVD) { |
|
|
|
is_using_v_parameterization = true; |
|
} |
|
|
|
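        // Pick the denoiser family: flow matching for SD3/Flux, v-prediction for SD2 v-models and SVD,
        // plain eps-prediction (CompVis) otherwise.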
if (sd_version_is_sd3(version)) { |
|
LOG_INFO("running in FLOW mode"); |
|
denoiser = std::make_shared<DiscreteFlowDenoiser>(); |
|
} else if (sd_version_is_flux(version)) { |
|
LOG_INFO("running in Flux FLOW mode"); |
|
float shift = 1.0f; |
|
for (auto pair : model_loader.tensor_storages_types) { |
|
if (pair.first.find("model.diffusion_model.guidance_in.in_layer.weight") != std::string::npos) { |
|
shift = 1.15f; |
|
break; |
|
} |
|
} |
|
denoiser = std::make_shared<FluxFlowDenoiser>(shift); |
|
} else if (is_using_v_parameterization) { |
|
LOG_INFO("running in v-prediction mode"); |
|
denoiser = std::make_shared<CompVisVDenoiser>(); |
|
} else { |
|
LOG_INFO("running in eps-prediction mode"); |
|
} |
|
|
|
if (schedule != DEFAULT) { |
|
switch (schedule) { |
|
case DISCRETE: |
|
LOG_INFO("running with discrete schedule"); |
|
denoiser->schedule = std::make_shared<DiscreteSchedule>(); |
|
break; |
|
case KARRAS: |
|
LOG_INFO("running with Karras schedule"); |
|
denoiser->schedule = std::make_shared<KarrasSchedule>(); |
|
break; |
|
case EXPONENTIAL: |
|
LOG_INFO("running exponential schedule"); |
|
denoiser->schedule = std::make_shared<ExponentialSchedule>(); |
|
break; |
|
case AYS: |
|
LOG_INFO("Running with Align-Your-Steps schedule"); |
|
denoiser->schedule = std::make_shared<AYSSchedule>(); |
|
denoiser->schedule->version = version; |
|
break; |
|
case GITS: |
|
LOG_INFO("Running with GITS schedule"); |
|
denoiser->schedule = std::make_shared<GITSSchedule>(); |
|
denoiser->schedule->version = version; |
|
break; |
|
case DEFAULT: |
|
|
|
break; |
|
default: |
|
LOG_ERROR("Unknown schedule %i", schedule); |
|
abort(); |
|
} |
|
} |
|
|
|
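        // For CompVis-style denoisers, derive sigmas from the schedule: sigma_t = sqrt((1 - alphas_cumprod[t]) / alphas_cumprod[t]).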
auto comp_vis_denoiser = std::dynamic_pointer_cast<CompVisDenoiser>(denoiser); |
|
if (comp_vis_denoiser) { |
|
for (int i = 0; i < TIMESTEPS; i++) { |
|
comp_vis_denoiser->sigmas[i] = std::sqrt((1 - ((float*)alphas_cumprod_tensor->data)[i]) / ((float*)alphas_cumprod_tensor->data)[i]); |
|
comp_vis_denoiser->log_sigmas[i] = std::log(comp_vis_denoiser->sigmas[i]); |
|
} |
|
} |
|
|
|
LOG_DEBUG("finished loaded file"); |
|
ggml_free(ctx); |
|
return true; |
|
} |
|
|
|
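    // Heuristic: run one UNet step on a constant latent at t = 999. With eps-prediction the output stays
    // close to the input, while v-prediction drives the mean of (output - input) strongly negative.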
bool is_using_v_parameterization_for_sd2(ggml_context* work_ctx) { |
|
struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1); |
|
ggml_set_f32(x_t, 0.5); |
|
struct ggml_tensor* c = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 1024, 2, 1, 1); |
|
ggml_set_f32(c, 0.5); |
|
|
|
struct ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1); |
|
ggml_set_f32(timesteps, 999); |
|
int64_t t0 = ggml_time_ms(); |
|
struct ggml_tensor* out = ggml_dup_tensor(work_ctx, x_t); |
|
diffusion_model->compute(n_threads, x_t, timesteps, c, NULL, NULL, NULL, -1, {}, 0.f, &out); |
|
diffusion_model->free_compute_buffer(); |
|
|
|
double result = 0.f; |
|
{ |
|
float* vec_x = (float*)x_t->data; |
|
float* vec_out = (float*)out->data; |
|
|
|
int64_t n = ggml_nelements(out); |
|
|
|
for (int i = 0; i < n; i++) { |
|
result += ((double)vec_out[i] - (double)vec_x[i]); |
|
} |
|
result /= n; |
|
} |
|
int64_t t1 = ggml_time_ms(); |
|
LOG_DEBUG("check is_using_v_parameterization_for_sd2, taking %.2fs", (t1 - t0) * 1.0f / 1000); |
|
return result < -1; |
|
} |
|
|
|
void set_pending_lora(const std::string& lora_path, float multiplier) { |
|
pending_apply_lora_fname = lora_path; |
|
pending_apply_lora_power = multiplier; |
|
} |
|
|
|
void apply_lora_from_file(const std::string& lora_path, float multiplier) { |
|
int64_t t0 = ggml_time_ms(); |
|
std::string st_file_path = lora_path; |
|
std::string file_path; |
|
if (file_exists(st_file_path)) { |
|
file_path = st_file_path; |
|
} else { |
|
LOG_WARN("can not find %s for lora %s", st_file_path.c_str(), lora_path.c_str()); |
|
return; |
|
} |
|
LoraModel lora(backend, file_path); |
|
if (!lora.load_from_file()) { |
|
LOG_WARN("load lora tensors from %s failed", file_path.c_str()); |
|
return; |
|
} |
|
|
|
lora.multiplier = multiplier; |
|
lora.apply(tensors, n_threads); |
|
lora.free_params_buffer(); |
|
|
|
int64_t t1 = ggml_time_ms(); |
|
|
|
LOG_INFO("lora '%s' applied, taking %.2fs", |
|
lora_path.c_str(), |
|
(t1 - t0) * 1.0f / 1000); |
|
} |
|
|
|
void apply_lora(const std::string& lora_name, float multiplier) { |
|
int64_t t0 = ggml_time_ms(); |
|
std::string st_file_path = path_join(lora_model_dir, lora_name + ".safetensors"); |
|
std::string ckpt_file_path = path_join(lora_model_dir, lora_name + ".ckpt"); |
|
std::string file_path; |
|
if (file_exists(st_file_path)) { |
|
file_path = st_file_path; |
|
} else if (file_exists(ckpt_file_path)) { |
|
file_path = ckpt_file_path; |
|
} else { |
|
LOG_WARN("can not find %s or %s for lora %s", st_file_path.c_str(), ckpt_file_path.c_str(), lora_name.c_str()); |
|
return; |
|
} |
|
LoraModel lora(backend, file_path); |
|
if (!lora.load_from_file()) { |
|
LOG_WARN("load lora tensors from %s failed", file_path.c_str()); |
|
return; |
|
} |
|
|
|
lora.multiplier = multiplier; |
|
lora.apply(tensors, n_threads); |
|
lora.free_params_buffer(); |
|
|
|
int64_t t1 = ggml_time_ms(); |
|
|
|
LOG_INFO("lora '%s' applied, taking %.2fs", lora_name.c_str(), (t1 - t0) * 1.0f / 1000); |
|
} |
|
|
|
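    // Apply only the difference between the requested multipliers and the currently applied state,
    // so repeated calls with the same LoRA set do not double-apply.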
void apply_loras(const std::unordered_map<std::string, float>& lora_state) { |
|
if (lora_state.size() > 0 && model_wtype != GGML_TYPE_F16 && model_wtype != GGML_TYPE_F32) { |
|
LOG_WARN("In quantized models when applying LoRA, the images have poor quality."); |
|
} |
|
std::unordered_map<std::string, float> lora_state_diff; |
|
for (auto& kv : lora_state) { |
|
const std::string& lora_name = kv.first; |
|
float multiplier = kv.second; |
|
|
|
if (curr_lora_state.find(lora_name) != curr_lora_state.end()) { |
|
float curr_multiplier = curr_lora_state[lora_name]; |
|
float multiplier_diff = multiplier - curr_multiplier; |
|
if (multiplier_diff != 0.f) { |
|
lora_state_diff[lora_name] = multiplier_diff; |
|
} |
|
} else { |
|
lora_state_diff[lora_name] = multiplier; |
|
} |
|
} |
|
|
|
LOG_INFO("Attempting to apply %lu LoRAs", lora_state.size()); |
|
|
|
for (auto& kv : lora_state_diff) { |
|
apply_lora(kv.first, kv.second); |
|
} |
|
|
|
curr_lora_state = lora_state; |
|
} |
|
|
|
ggml_tensor* id_encoder(ggml_context* work_ctx, |
|
ggml_tensor* init_img, |
|
ggml_tensor* prompts_embeds, |
|
ggml_tensor* id_embeds, |
|
std::vector<bool>& class_tokens_mask) { |
|
ggml_tensor* res = NULL; |
|
pmid_model->compute(n_threads, init_img, prompts_embeds, id_embeds, class_tokens_mask, &res, work_ctx); |
|
return res; |
|
} |
|
|
|
SDCondition get_svd_condition(ggml_context* work_ctx, |
|
sd_image_t init_image, |
|
int width, |
|
int height, |
|
int fps = 6, |
|
int motion_bucket_id = 127, |
|
float augmentation_level = 0.f, |
|
bool force_zero_embeddings = false) { |
|
|
|
int64_t t0 = ggml_time_ms(); |
|
struct ggml_tensor* c_crossattn = NULL; |
|
{ |
|
if (force_zero_embeddings) { |
|
c_crossattn = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, clip_vision->vision_model.projection_dim); |
|
ggml_set_f32(c_crossattn, 0.f); |
|
} else { |
|
sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(init_image); |
|
sd_image_f32_t resized_image = clip_preprocess(image, clip_vision->vision_model.image_size); |
|
free(image.data); |
|
image.data = NULL; |
|
|
|
ggml_tensor* pixel_values = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, resized_image.width, resized_image.height, 3, 1); |
|
sd_image_f32_to_tensor(resized_image.data, pixel_values, false); |
|
free(resized_image.data); |
|
resized_image.data = NULL; |
|
|
|
|
|
clip_vision->compute(n_threads, pixel_values, &c_crossattn, work_ctx); |
|
|
|
} |
|
} |
|
|
|
|
|
struct ggml_tensor* c_concat = NULL; |
|
{ |
|
if (force_zero_embeddings) { |
|
c_concat = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 4, 1); |
|
ggml_set_f32(c_concat, 0.f); |
|
} else { |
|
ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); |
|
|
|
if (width != init_image.width || height != init_image.height) { |
|
sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(init_image); |
|
sd_image_f32_t resized_image = resize_sd_image_f32_t(image, width, height); |
|
free(image.data); |
|
image.data = NULL; |
|
sd_image_f32_to_tensor(resized_image.data, init_img, false); |
|
free(resized_image.data); |
|
resized_image.data = NULL; |
|
} else { |
|
sd_image_to_tensor(init_image.data, init_img); |
|
} |
|
if (augmentation_level > 0.f) { |
|
struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, init_img); |
|
ggml_tensor_set_f32_randn(noise, rng); |
|
|
|
ggml_tensor_scale(noise, augmentation_level); |
|
ggml_tensor_add(init_img, noise); |
|
} |
|
ggml_tensor* moments = encode_first_stage(work_ctx, init_img); |
|
c_concat = get_first_stage_encoding(work_ctx, moments); |
|
} |
|
} |
|
|
|
|
|
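        // SVD vector conditioning: sinusoidal embeddings of (fps - 1, motion_bucket_id, augmentation_level),
        // each out_dim wide, filling the model's adm_in_channels.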
struct ggml_tensor* y = NULL; |
|
{ |
|
y = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, diffusion_model->get_adm_in_channels()); |
|
int out_dim = 256; |
|
int fps_id = fps - 1; |
|
std::vector<float> timesteps = {(float)fps_id, (float)motion_bucket_id, augmentation_level}; |
|
set_timestep_embedding(timesteps, y, out_dim); |
|
} |
|
int64_t t1 = ggml_time_ms(); |
|
LOG_DEBUG("computing svd condition graph completed, taking %d ms", (int)(t1 - t0)); |
|
return {c_crossattn, y, c_concat}; |
|
} |
|
|
|
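    // Runs the k-diffusion sampling loop over `sigmas`, starting from init_latent combined with scaled
    // noise, and returns the denoised latent (allocated in work_ctx).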
ggml_tensor* sample(ggml_context* work_ctx, |
|
ggml_tensor* init_latent, |
|
ggml_tensor* noise, |
|
SDCondition cond, |
|
SDCondition uncond, |
|
ggml_tensor* control_hint, |
|
float control_strength, |
|
float min_cfg, |
|
float cfg_scale, |
|
float guidance, |
|
sample_method_t method, |
|
const std::vector<float>& sigmas, |
|
int start_merge_step, |
|
SDCondition id_cond, |
|
std::vector<int> skip_layers = {}, |
|
float slg_scale = 0, |
|
float skip_layer_start = 0.01, |
|
float skip_layer_end = 0.2) { |
|
size_t steps = sigmas.size() - 1; |
|
|
|
|
|
struct ggml_tensor* x = ggml_dup_tensor(work_ctx, init_latent); |
|
copy_ggml_tensor(x, init_latent); |
|
x = denoiser->noise_scaling(sigmas[0], noise, x); |
|
|
|
struct ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, noise); |
|
|
|
bool has_unconditioned = cfg_scale != 1.0 && uncond.c_crossattn != NULL; |
|
bool has_skiplayer = slg_scale != 0.0 && skip_layers.size() > 0; |
|
|
|
|
|
struct ggml_tensor* out_cond = ggml_dup_tensor(work_ctx, x); |
|
struct ggml_tensor* out_uncond = NULL; |
|
struct ggml_tensor* out_skip = NULL; |
|
|
|
if (has_unconditioned) { |
|
out_uncond = ggml_dup_tensor(work_ctx, x); |
|
} |
|
if (has_skiplayer) { |
|
if (sd_version_is_dit(version)) { |
|
out_skip = ggml_dup_tensor(work_ctx, x); |
|
} else { |
|
has_skiplayer = false; |
|
LOG_WARN("SLG is incompatible with %s models", model_version_to_str[version]); |
|
} |
|
} |
|
struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x); |
|
|
|
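        // One denoising step: scale the input by c_in, evaluate the diffusion model for the conditional
        // (and, when enabled, unconditional and skip-layer) branches, then recombine with c_skip/c_out.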
auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* { |
|
if (step == 1) { |
|
pretty_progress(0, (int)steps, 0); |
|
} |
|
int64_t t0 = ggml_time_us(); |
|
|
|
std::vector<float> scaling = denoiser->get_scalings(sigma); |
|
GGML_ASSERT(scaling.size() == 3); |
|
float c_skip = scaling[0]; |
|
float c_out = scaling[1]; |
|
float c_in = scaling[2]; |
|
|
|
float t = denoiser->sigma_to_t(sigma); |
|
std::vector<float> timesteps_vec(x->ne[3], t); |
|
auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec); |
|
std::vector<float> guidance_vec(x->ne[3], guidance); |
|
auto guidance_tensor = vector_to_ggml_tensor(work_ctx, guidance_vec); |
|
|
|
copy_ggml_tensor(noised_input, input); |
|
|
|
ggml_tensor_scale(noised_input, c_in); |
|
|
|
std::vector<struct ggml_tensor*> controls; |
|
|
|
if (control_hint != NULL) { |
|
control_net->compute(n_threads, noised_input, control_hint, timesteps, cond.c_crossattn, cond.c_vector); |
|
controls = control_net->controls; |
|
|
|
|
|
} |
|
|
|
if (start_merge_step == -1 || step <= start_merge_step) { |
|
|
|
diffusion_model->compute(n_threads, |
|
noised_input, |
|
timesteps, |
|
cond.c_crossattn, |
|
cond.c_concat, |
|
cond.c_vector, |
|
guidance_tensor, |
|
-1, |
|
controls, |
|
control_strength, |
|
&out_cond); |
|
} else { |
|
diffusion_model->compute(n_threads, |
|
noised_input, |
|
timesteps, |
|
id_cond.c_crossattn, |
|
cond.c_concat, |
|
id_cond.c_vector, |
|
guidance_tensor, |
|
-1, |
|
controls, |
|
control_strength, |
|
&out_cond); |
|
} |
|
|
|
float* negative_data = NULL; |
|
if (has_unconditioned) { |
|
|
|
if (control_hint != NULL) { |
|
control_net->compute(n_threads, noised_input, control_hint, timesteps, uncond.c_crossattn, uncond.c_vector); |
|
controls = control_net->controls; |
|
} |
|
diffusion_model->compute(n_threads, |
|
noised_input, |
|
timesteps, |
|
uncond.c_crossattn, |
|
uncond.c_concat, |
|
uncond.c_vector, |
|
guidance_tensor, |
|
-1, |
|
controls, |
|
control_strength, |
|
&out_uncond); |
|
negative_data = (float*)out_uncond->data; |
|
} |
|
|
|
int step_count = sigmas.size(); |
|
bool is_skiplayer_step = has_skiplayer && step > (int)(skip_layer_start * step_count) && step < (int)(skip_layer_end * step_count); |
|
float* skip_layer_data = NULL; |
|
if (is_skiplayer_step) { |
|
LOG_DEBUG("Skipping layers at step %d\n", step); |
|
|
|
diffusion_model->compute(n_threads, |
|
noised_input, |
|
timesteps, |
|
cond.c_crossattn, |
|
cond.c_concat, |
|
cond.c_vector, |
|
guidance_tensor, |
|
-1, |
|
controls, |
|
control_strength, |
|
&out_skip, |
|
NULL, |
|
skip_layers); |
|
skip_layer_data = (float*)out_skip->data; |
|
} |
|
float* vec_denoised = (float*)denoised->data; |
|
float* vec_input = (float*)input->data; |
|
float* positive_data = (float*)out_cond->data; |
|
int ne_elements = (int)ggml_nelements(denoised); |
|
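            // Classifier-free guidance: result = uncond + cfg_scale * (cond - uncond);
            // skip-layer guidance additionally adds (cond - skip) * slg_scale.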
for (int i = 0; i < ne_elements; i++) { |
|
float latent_result = positive_data[i]; |
|
if (has_unconditioned) { |
|
|
|
int64_t ne3 = out_cond->ne[3]; |
|
                    if (min_cfg != cfg_scale && ne3 != 1) {
                        // Per-frame guidance for SVD: interpolate the CFG scale between min_cfg and
                        // cfg_scale along the frame axis (ne[3]).
                        int64_t i3  = i / (out_cond->ne[0] * out_cond->ne[1] * out_cond->ne[2]);
                        float scale = min_cfg + (cfg_scale - min_cfg) * (i3 * 1.0f / ne3);
                        latent_result = negative_data[i] + scale * (positive_data[i] - negative_data[i]);
                    } else {
                        latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]);
                    }
|
} |
|
if (is_skiplayer_step) { |
|
latent_result = latent_result + (positive_data[i] - skip_layer_data[i]) * slg_scale; |
|
} |
|
|
|
|
|
vec_denoised[i] = latent_result * c_out + vec_input[i] * c_skip; |
|
} |
|
int64_t t1 = ggml_time_us(); |
|
if (step > 0) { |
|
pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f); |
|
|
|
} |
|
return denoised; |
|
}; |
|
|
|
sample_k_diffusion(method, denoise, work_ctx, x, sigmas, rng); |
|
|
|
x = denoiser->inverse_noise_scaling(sigmas[sigmas.size() - 1], x); |
|
|
|
if (control_net) { |
|
control_net->free_control_ctx(); |
|
control_net->free_compute_buffer(); |
|
} |
|
diffusion_model->free_compute_buffer(); |
|
return x; |
|
} |
|
|
|
|
|
ggml_tensor* get_first_stage_encoding(ggml_context* work_ctx, ggml_tensor* moments) { |
|
|
|
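        // Sample from the VAE's diagonal Gaussian posterior: `moments` stacks [mean, logvar] along the
        // channel dim; latent = (mean + exp(0.5 * logvar) * noise) * scale_factor.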
ggml_tensor* latent = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], moments->ne[1], moments->ne[2] / 2, moments->ne[3]); |
|
struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, latent); |
|
ggml_tensor_set_f32_randn(noise, rng); |
|
|
|
{ |
|
float mean = 0; |
|
float logvar = 0; |
|
float value = 0; |
|
float std_ = 0; |
|
for (int i = 0; i < latent->ne[3]; i++) { |
|
for (int j = 0; j < latent->ne[2]; j++) { |
|
for (int k = 0; k < latent->ne[1]; k++) { |
|
for (int l = 0; l < latent->ne[0]; l++) { |
|
mean = ggml_tensor_get_f32(moments, l, k, j, i); |
|
logvar = ggml_tensor_get_f32(moments, l, k, j + (int)latent->ne[2], i); |
|
logvar = std::max(-30.0f, std::min(logvar, 20.0f)); |
|
std_ = std::exp(0.5f * logvar); |
|
value = mean + std_ * ggml_tensor_get_f32(noise, l, k, j, i); |
|
value = value * scale_factor; |
|
|
|
ggml_tensor_set_f32(latent, value, l, k, j, i); |
|
} |
|
} |
|
} |
|
} |
|
} |
|
return latent; |
|
} |
|
|
|
ggml_tensor* compute_first_stage(ggml_context* work_ctx, ggml_tensor* x, bool decode) { |
|
int64_t W = x->ne[0]; |
|
int64_t H = x->ne[1]; |
|
int64_t C = 8; |
|
if (use_tiny_autoencoder) { |
|
C = 4; |
|
} else { |
|
if (sd_version_is_sd3(version)) { |
|
C = 32; |
|
} else if (sd_version_is_flux(version)) { |
|
C = 32; |
|
} |
|
} |
|
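        // The autoencoder works at an 8x spatial factor: decoding maps latents to (8W, 8H, 3) images,
        // encoding maps images down to (W/8, H/8, C) latents/moments.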
ggml_tensor* result = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, |
|
decode ? (W * 8) : (W / 8), |
|
decode ? (H * 8) : (H / 8), |
|
decode ? 3 : C, |
|
x->ne[3]); |
|
int64_t t0 = ggml_time_ms(); |
|
if (!use_tiny_autoencoder) { |
|
if (decode) { |
|
ggml_tensor_scale(x, 1.0f / scale_factor); |
|
} else { |
|
ggml_tensor_scale_input(x); |
|
} |
|
if (vae_tiling && decode) { |
|
|
|
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { |
|
first_stage_model->compute(n_threads, in, decode, &out); |
|
}; |
|
sd_tiling(x, result, 8, 32, 0.5f, on_tiling); |
|
} else { |
|
first_stage_model->compute(n_threads, x, decode, &result); |
|
} |
|
first_stage_model->free_compute_buffer(); |
|
if (decode) { |
|
ggml_tensor_scale_output(result); |
|
} |
|
} else { |
|
|
|
if (false && vae_tiling && decode) { |
|
|
|
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { |
|
tae_first_stage->compute(n_threads, in, decode, &out); |
|
}; |
|
sd_tiling(x, result, 8, 64, 0.5f, on_tiling); |
|
} else { |
|
tae_first_stage->compute(n_threads, x, decode, &result); |
|
} |
|
tae_first_stage->free_compute_buffer(); |
|
} |
|
|
|
int64_t t1 = ggml_time_ms(); |
|
LOG_DEBUG("computing vae [mode: %s] graph completed, taking %.2fs", decode ? "DECODE" : "ENCODE", (t1 - t0) * 1.0f / 1000); |
|
if (decode) { |
|
ggml_tensor_clamp(result, 0.0f, 1.0f); |
|
} |
|
return result; |
|
} |
|
|
|
ggml_tensor* encode_first_stage(ggml_context* work_ctx, ggml_tensor* x) { |
|
return compute_first_stage(work_ctx, x, false); |
|
} |
|
|
|
ggml_tensor* decode_first_stage(ggml_context* work_ctx, ggml_tensor* x) { |
|
return compute_first_stage(work_ctx, x, true); |
|
} |
|
}; |
|
|
|
|
|
|
|
struct sd_ctx_t { |
|
StableDiffusionGGML* sd = NULL; |
|
}; |
|
|
|
void set_sd_vae_tiling(sd_ctx_t* ctx, bool tiling) {
    ctx->sd->vae_tiling = tiling;
}

int get_loaded_sd_version(sd_ctx_t* ctx) {
    return ctx->sd->version;
}
|
|
|
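// Public C API entry point: constructs the pipeline, loads every requested weight file, and returns an
// opaque handle. Returns NULL (and cleans up) if loading fails.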
sd_ctx_t* new_sd_ctx(const char* model_path_c_str, |
|
const char* clip_l_path_c_str, |
|
const char* clip_g_path_c_str, |
|
const char* t5xxl_path_c_str, |
|
const char* diffusion_model_path_c_str, |
|
const char* vae_path_c_str, |
|
const char* taesd_path_c_str, |
|
const char* control_net_path_c_str, |
|
const char* lora_model_dir_c_str, |
|
const char* embed_dir_c_str, |
|
const char* id_embed_dir_c_str, |
|
bool vae_decode_only, |
|
bool vae_tiling, |
|
bool free_params_immediately, |
|
int n_threads, |
|
enum sd_type_t wtype, |
|
enum rng_type_t rng_type, |
|
enum schedule_t s, |
|
bool keep_clip_on_cpu, |
|
bool keep_control_net_cpu, |
|
bool keep_vae_on_cpu, |
|
bool diffusion_flash_attn) { |
|
sd_ctx_t* sd_ctx = (sd_ctx_t*)malloc(sizeof(sd_ctx_t)); |
|
if (sd_ctx == NULL) { |
|
return NULL; |
|
} |
|
std::string model_path(model_path_c_str); |
|
std::string clip_l_path(clip_l_path_c_str); |
|
std::string clip_g_path(clip_g_path_c_str); |
|
std::string t5xxl_path(t5xxl_path_c_str); |
|
std::string diffusion_model_path(diffusion_model_path_c_str); |
|
std::string vae_path(vae_path_c_str); |
|
std::string taesd_path(taesd_path_c_str); |
|
std::string control_net_path(control_net_path_c_str); |
|
std::string embd_path(embed_dir_c_str); |
|
std::string id_embd_path(id_embed_dir_c_str); |
|
std::string lora_model_dir(lora_model_dir_c_str); |
|
|
|
sd_ctx->sd = new StableDiffusionGGML(n_threads, |
|
vae_decode_only, |
|
free_params_immediately, |
|
lora_model_dir, |
|
rng_type); |
|
if (sd_ctx->sd == NULL) { |
|
return NULL; |
|
} |
|
|
|
if (!sd_ctx->sd->load_from_file(model_path, |
|
clip_l_path, |
|
clip_g_path, |
|
t5xxl_path_c_str, |
|
diffusion_model_path, |
|
vae_path, |
|
control_net_path, |
|
embd_path, |
|
id_embd_path, |
|
taesd_path, |
|
vae_tiling, |
|
(ggml_type)wtype, |
|
s, |
|
keep_clip_on_cpu, |
|
keep_control_net_cpu, |
|
keep_vae_on_cpu, |
|
diffusion_flash_attn)) { |
|
delete sd_ctx->sd; |
|
sd_ctx->sd = NULL; |
|
free(sd_ctx); |
|
return NULL; |
|
} |
|
return sd_ctx; |
|
} |
|
|
|
void free_sd_ctx(sd_ctx_t* sd_ctx) { |
|
if (sd_ctx->sd != NULL) { |
|
delete sd_ctx->sd; |
|
sd_ctx->sd = NULL; |
|
} |
|
free(sd_ctx); |
|
} |
|
|
|
sd_image_t* generate_image(sd_ctx_t* sd_ctx, |
|
struct ggml_context* work_ctx, |
|
ggml_tensor* init_latent, |
|
std::string prompt, |
|
std::string negative_prompt, |
|
int clip_skip, |
|
float cfg_scale, |
|
float guidance, |
|
int width, |
|
int height, |
|
enum sample_method_t sample_method, |
|
const std::vector<float>& sigmas, |
|
int64_t seed, |
|
int batch_count, |
|
const sd_image_t* control_cond, |
|
float control_strength, |
|
float style_ratio, |
|
bool normalize_input, |
|
std::string input_id_images_path, |
|
std::vector<int> skip_layers = {}, |
|
float slg_scale = 0, |
|
float skip_layer_start = 0.01, |
|
float skip_layer_end = 0.2) { |
|
if (seed < 0) { |
|
|
|
|
|
|
|
srand((int)time(NULL)); |
|
seed = rand(); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
int sample_steps = sigmas.size() - 1; |
|
|
|
|
|
auto result_pair = extract_and_remove_lora(prompt); |
|
std::unordered_map<std::string, float> lora_f2m = result_pair.first; |
|
|
|
for (auto& kv : lora_f2m) { |
|
LOG_DEBUG("lora %s:%.2f", kv.first.c_str(), kv.second); |
|
} |
|
|
|
prompt = result_pair.second; |
|
LOG_DEBUG("prompt after extract and remove lora: \"%s\"", prompt.c_str()); |
|
|
|
int64_t t0 = ggml_time_ms(); |
|
|
|
    if (pending_apply_lora_fname != "" && pending_apply_lora_power > 0) {
        printf("\nApplying LoRA now...\n");
        sd_ctx->sd->apply_lora_from_file(pending_apply_lora_fname, pending_apply_lora_power);
        pending_apply_lora_fname = "";
    }
|
int64_t t1 = ggml_time_ms(); |
|
LOG_INFO("apply_loras completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); |
|
|
|
|
|
std::string prompt_text_only; |
|
ggml_tensor* init_img = NULL; |
|
SDCondition id_cond; |
|
std::vector<bool> class_tokens_mask; |
|
if (sd_ctx->sd->stacked_id) { |
|
if (!sd_ctx->sd->pmid_lora->applied) { |
|
t0 = ggml_time_ms(); |
|
sd_ctx->sd->pmid_lora->apply(sd_ctx->sd->tensors, sd_ctx->sd->n_threads); |
|
t1 = ggml_time_ms(); |
|
sd_ctx->sd->pmid_lora->applied = true; |
|
LOG_INFO("pmid_lora apply completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); |
|
if (sd_ctx->sd->free_params_immediately) { |
|
sd_ctx->sd->pmid_lora->free_params_buffer(); |
|
} |
|
} |
|
|
|
std::vector<sd_image_t*> input_id_images; |
|
bool pmv2 = sd_ctx->sd->pmid_model->get_version() == PM_VERSION_2; |
|
if (sd_ctx->sd->pmid_model && input_id_images_path.size() > 0) { |
|
std::vector<std::string> img_files = get_files_from_dir(input_id_images_path); |
|
for (std::string img_file : img_files) { |
|
int c = 0; |
|
int width, height; |
|
if (ends_with(img_file, "safetensors")) { |
|
continue; |
|
} |
|
uint8_t* input_image_buffer = stbi_load(img_file.c_str(), &width, &height, &c, 3); |
|
if (input_image_buffer == NULL) { |
|
LOG_ERROR("PhotoMaker load image from '%s' failed", img_file.c_str()); |
|
continue; |
|
} else { |
|
LOG_INFO("PhotoMaker loaded image from '%s'", img_file.c_str()); |
|
} |
|
sd_image_t* input_image = NULL; |
|
input_image = new sd_image_t{(uint32_t)width, |
|
(uint32_t)height, |
|
3, |
|
input_image_buffer}; |
|
input_image = preprocess_id_image(input_image); |
|
if (input_image == NULL) { |
|
LOG_ERROR("preprocess input id image from '%s' failed", img_file.c_str()); |
|
continue; |
|
} |
|
input_id_images.push_back(input_image); |
|
} |
|
} |
|
if (input_id_images.size() > 0) { |
|
sd_ctx->sd->pmid_model->style_strength = style_ratio; |
|
int32_t w = input_id_images[0]->width; |
|
int32_t h = input_id_images[0]->height; |
|
int32_t channels = input_id_images[0]->channel; |
|
int32_t num_input_images = (int32_t)input_id_images.size(); |
|
init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, w, h, channels, num_input_images); |
|
|
|
float mean[] = {0.48145466f, 0.4578275f, 0.40821073f}; |
|
float std[] = {0.26862954f, 0.26130258f, 0.27577711f}; |
|
for (int i = 0; i < num_input_images; i++) { |
|
sd_image_t* init_image = input_id_images[i]; |
|
if (normalize_input) |
|
sd_mul_images_to_tensor(init_image->data, init_img, i, mean, std); |
|
else |
|
sd_mul_images_to_tensor(init_image->data, init_img, i, NULL, NULL); |
|
} |
|
t0 = ggml_time_ms(); |
|
auto cond_tup = sd_ctx->sd->cond_stage_model->get_learned_condition_with_trigger(work_ctx, |
|
sd_ctx->sd->n_threads, prompt, |
|
clip_skip, |
|
width, |
|
height, |
|
num_input_images, |
|
sd_ctx->sd->diffusion_model->get_adm_in_channels()); |
|
id_cond = std::get<0>(cond_tup); |
|
class_tokens_mask = std::get<1>(cond_tup); |
|
struct ggml_tensor* id_embeds = NULL; |
|
if (pmv2) { |
|
|
|
id_embeds = load_tensor_from_file(work_ctx, path_join(input_id_images_path, "id_embeds.bin")); |
|
|
|
} |
|
id_cond.c_crossattn = sd_ctx->sd->id_encoder(work_ctx, init_img, id_cond.c_crossattn, id_embeds, class_tokens_mask); |
|
t1 = ggml_time_ms(); |
|
LOG_INFO("Photomaker ID Stacking, taking %" PRId64 " ms", t1 - t0); |
|
if (sd_ctx->sd->free_params_immediately) { |
|
sd_ctx->sd->pmid_model->free_params_buffer(); |
|
} |
|
|
|
prompt_text_only = sd_ctx->sd->cond_stage_model->remove_trigger_from_prompt(work_ctx, prompt); |
|
|
|
prompt = prompt_text_only; |
|
|
|
|
|
|
|
|
|
} else { |
|
LOG_WARN("Provided PhotoMaker model file, but NO input ID images"); |
|
LOG_WARN("Turn off PhotoMaker"); |
|
sd_ctx->sd->stacked_id = false; |
|
} |
|
for (sd_image_t* img : input_id_images) { |
|
free(img->data); |
|
} |
|
input_id_images.clear(); |
|
} |
|
|
|
|
|
t0 = ggml_time_ms(); |
|
SDCondition cond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx, |
|
sd_ctx->sd->n_threads, |
|
prompt, |
|
clip_skip, |
|
width, |
|
height, |
|
sd_ctx->sd->diffusion_model->get_adm_in_channels()); |
|
|
|
SDCondition uncond; |
|
if (cfg_scale != 1.0) { |
|
bool force_zero_embeddings = false; |
|
if (sd_ctx->sd->version == VERSION_SDXL && negative_prompt.size() == 0) { |
|
force_zero_embeddings = true; |
|
} |
|
uncond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx, |
|
sd_ctx->sd->n_threads, |
|
negative_prompt, |
|
clip_skip, |
|
width, |
|
height, |
|
sd_ctx->sd->diffusion_model->get_adm_in_channels(), |
|
force_zero_embeddings); |
|
} |
|
t1 = ggml_time_ms(); |
|
LOG_INFO("get_learned_condition completed, taking %d ms", t1 - t0); |
|
|
|
if (sd_ctx->sd->free_params_immediately) { |
|
sd_ctx->sd->cond_stage_model->free_params_buffer(); |
|
} |
|
|
|
|
|
struct ggml_tensor* image_hint = NULL; |
|
if (control_cond != NULL) { |
|
image_hint = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); |
|
sd_image_to_tensor(control_cond->data, image_hint); |
|
} |
|
|
|
|
|
std::vector<struct ggml_tensor*> final_latents; |
|
int C = 4; |
|
if (sd_version_is_sd3(sd_ctx->sd->version)) { |
|
C = 16; |
|
} else if (sd_version_is_flux(sd_ctx->sd->version)) { |
|
C = 16; |
|
} |
|
int W = width / 8; |
|
int H = height / 8; |
|
LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]); |
|
for (int b = 0; b < batch_count; b++) { |
|
int64_t sampling_start = ggml_time_ms(); |
|
int64_t cur_seed = seed + b; |
|
LOG_INFO("generating image: %i/%i - seed %" PRId64, b + 1, batch_count, cur_seed); |
|
|
|
sd_ctx->sd->rng->manual_seed(cur_seed); |
|
struct ggml_tensor* x_t = init_latent; |
|
struct ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1); |
|
ggml_tensor_set_f32_randn(noise, sd_ctx->sd->rng); |
|
|
|
int start_merge_step = -1; |
|
if (sd_ctx->sd->stacked_id) { |
|
start_merge_step = int(sd_ctx->sd->pmid_model->style_strength / 100.f * sample_steps); |
|
|
|
|
|
LOG_INFO("PHOTOMAKER: start_merge_step: %d", start_merge_step); |
|
} |
|
|
|
struct ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx, |
|
x_t, |
|
noise, |
|
cond, |
|
uncond, |
|
image_hint, |
|
control_strength, |
|
cfg_scale, |
|
cfg_scale, |
|
guidance, |
|
sample_method, |
|
sigmas, |
|
start_merge_step, |
|
id_cond, |
|
skip_layers, |
|
slg_scale, |
|
skip_layer_start, |
|
skip_layer_end); |
|
|
|
|
|
int64_t sampling_end = ggml_time_ms(); |
|
LOG_INFO("sampling completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000); |
|
final_latents.push_back(x_0); |
|
} |
|
|
|
if (sd_ctx->sd->free_params_immediately) { |
|
sd_ctx->sd->diffusion_model->free_params_buffer(); |
|
} |
|
int64_t t3 = ggml_time_ms(); |
|
LOG_INFO("generating %d latent images completed, taking %.2fs", final_latents.size(), (t3 - t1) * 1.0f / 1000); |
|
|
|
|
|
LOG_INFO("decoding %zu latents", final_latents.size()); |
|
std::vector<struct ggml_tensor*> decoded_images; |
|
for (size_t i = 0; i < final_latents.size(); i++) { |
|
t1 = ggml_time_ms(); |
|
struct ggml_tensor* img = sd_ctx->sd->decode_first_stage(work_ctx, final_latents[i] ); |
|
|
|
if (img != NULL) { |
|
decoded_images.push_back(img); |
|
} |
|
int64_t t2 = ggml_time_ms(); |
|
LOG_INFO("latent %d decoded, taking %.2fs", i + 1, (t2 - t1) * 1.0f / 1000); |
|
} |
|
|
|
int64_t t4 = ggml_time_ms(); |
|
LOG_INFO("decode_first_stage completed, taking %.2fs", (t4 - t3) * 1.0f / 1000); |
|
if (sd_ctx->sd->free_params_immediately && !sd_ctx->sd->use_tiny_autoencoder) { |
|
sd_ctx->sd->first_stage_model->free_params_buffer(); |
|
} |
|
sd_image_t* result_images = (sd_image_t*)calloc(batch_count, sizeof(sd_image_t)); |
|
if (result_images == NULL) { |
|
ggml_free(work_ctx); |
|
return NULL; |
|
} |
|
|
|
for (size_t i = 0; i < decoded_images.size(); i++) { |
|
result_images[i].width = width; |
|
result_images[i].height = height; |
|
result_images[i].channel = 3; |
|
result_images[i].data = sd_tensor_to_image(decoded_images[i]); |
|
} |
|
ggml_free(work_ctx); |
|
|
|
return result_images; |
|
} |
|
|
|
sd_image_t* txt2img(sd_ctx_t* sd_ctx, |
|
const char* prompt_c_str, |
|
const char* negative_prompt_c_str, |
|
int clip_skip, |
|
float cfg_scale, |
|
float guidance, |
|
int width, |
|
int height, |
|
enum sample_method_t sample_method, |
|
int sample_steps, |
|
int64_t seed, |
|
int batch_count, |
|
const sd_image_t* control_cond, |
|
float control_strength, |
|
float style_ratio, |
|
bool normalize_input, |
|
const char* input_id_images_path_c_str, |
|
int* skip_layers = NULL, |
|
size_t skip_layers_count = 0, |
|
float slg_scale = 0, |
|
float skip_layer_start = 0.01, |
|
float skip_layer_end = 0.2) { |
|
std::vector<int> skip_layers_vec(skip_layers, skip_layers + skip_layers_count); |
|
LOG_DEBUG("txt2img %dx%d", width, height); |
|
if (sd_ctx == NULL) { |
|
return NULL; |
|
} |
|
|
|
struct ggml_init_params params; |
|
params.mem_size = static_cast<size_t>(10 * 1024 * 1024); |
|
if (sd_version_is_sd3(sd_ctx->sd->version)) { |
|
params.mem_size *= 3; |
|
} |
|
if (sd_version_is_flux(sd_ctx->sd->version)) { |
|
params.mem_size *= 4; |
|
} |
|
if (sd_ctx->sd->stacked_id) { |
|
params.mem_size += static_cast<size_t>(10 * 1024 * 1024); |
|
} |
|
params.mem_size += width * height * 3 * sizeof(float); |
|
params.mem_size *= batch_count; |
|
params.mem_buffer = NULL; |
|
params.no_alloc = false; |
|
|
|
|
|
struct ggml_context* work_ctx = ggml_init(params); |
|
if (!work_ctx) { |
|
LOG_ERROR("ggml_init() failed"); |
|
return NULL; |
|
} |
|
|
|
size_t t0 = ggml_time_ms(); |
|
|
|
std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps); |
|
|
|
int C = 4; |
|
if (sd_version_is_sd3(sd_ctx->sd->version)) { |
|
C = 16; |
|
} else if (sd_version_is_flux(sd_ctx->sd->version)) { |
|
C = 16; |
|
} |
|
int W = width / 8; |
|
int H = height / 8; |
|
ggml_tensor* init_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1); |
|
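    // Fill the empty latent with a per-family constant (SD3: 0.0609, Flux: 0.1159, others: 0);
    // the sampler adds the actual starting noise per batch item.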
if (sd_version_is_sd3(sd_ctx->sd->version)) { |
|
ggml_set_f32(init_latent, 0.0609f); |
|
} else if (sd_version_is_flux(sd_ctx->sd->version)) { |
|
ggml_set_f32(init_latent, 0.1159f); |
|
} else { |
|
ggml_set_f32(init_latent, 0.f); |
|
} |
|
|
|
sd_image_t* result_images = generate_image(sd_ctx, |
|
work_ctx, |
|
init_latent, |
|
prompt_c_str, |
|
negative_prompt_c_str, |
|
clip_skip, |
|
cfg_scale, |
|
guidance, |
|
width, |
|
height, |
|
sample_method, |
|
sigmas, |
|
seed, |
|
batch_count, |
|
control_cond, |
|
control_strength, |
|
style_ratio, |
|
normalize_input, |
|
input_id_images_path_c_str, |
|
skip_layers_vec, |
|
slg_scale, |
|
skip_layer_start, |
|
skip_layer_end); |
|
|
|
size_t t1 = ggml_time_ms(); |
|
|
|
LOG_INFO("txt2img completed in %.2fs", (t1 - t0) * 1.0f / 1000); |
|
|
|
return result_images; |
|
} |
|
|
|
sd_image_t* img2img(sd_ctx_t* sd_ctx, |
|
sd_image_t init_image, |
|
const char* prompt_c_str, |
|
const char* negative_prompt_c_str, |
|
int clip_skip, |
|
float cfg_scale, |
|
float guidance, |
|
int width, |
|
int height, |
|
sample_method_t sample_method, |
|
int sample_steps, |
|
float strength, |
|
int64_t seed, |
|
int batch_count, |
|
const sd_image_t* control_cond, |
|
float control_strength, |
|
float style_ratio, |
|
bool normalize_input, |
|
const char* input_id_images_path_c_str, |
|
int* skip_layers = NULL, |
|
size_t skip_layers_count = 0, |
|
float slg_scale = 0, |
|
float skip_layer_start = 0.01, |
|
float skip_layer_end = 0.2) { |
|
std::vector<int> skip_layers_vec(skip_layers, skip_layers + skip_layers_count); |
|
LOG_DEBUG("img2img %dx%d", width, height); |
|
if (sd_ctx == NULL) { |
|
return NULL; |
|
} |
|
|
|
struct ggml_init_params params; |
|
params.mem_size = static_cast<size_t>(10 * 1024 * 1024); |
|
if (sd_version_is_sd3(sd_ctx->sd->version)) { |
|
params.mem_size *= 2; |
|
} |
|
if (sd_version_is_flux(sd_ctx->sd->version)) { |
|
params.mem_size *= 3; |
|
} |
|
if (sd_ctx->sd->stacked_id) { |
|
params.mem_size += static_cast<size_t>(10 * 1024 * 1024); |
|
} |
|
params.mem_size += width * height * 3 * sizeof(float) * 2; |
|
params.mem_size *= batch_count; |
|
params.mem_buffer = NULL; |
|
params.no_alloc = false; |
|
|
|
|
|
struct ggml_context* work_ctx = ggml_init(params); |
|
if (!work_ctx) { |
|
LOG_ERROR("ggml_init() failed"); |
|
return NULL; |
|
} |
|
|
|
size_t t0 = ggml_time_ms(); |
|
|
|
if (seed < 0) { |
|
srand((int)time(NULL)); |
|
seed = rand(); |
|
} |
|
sd_ctx->sd->rng->manual_seed(seed); |
|
|
|
ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); |
|
sd_image_to_tensor(init_image.data, init_img); |
|
ggml_tensor* init_latent = NULL; |
|
if (!sd_ctx->sd->use_tiny_autoencoder) { |
|
ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img); |
|
init_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); |
|
} else { |
|
init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img); |
|
} |
|
|
|
size_t t1 = ggml_time_ms(); |
|
LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); |
|
|
|
std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps); |
|
    size_t t_enc = static_cast<size_t>(sample_steps * strength);
    if (t_enc == (size_t)sample_steps) {
        t_enc--;  // strength == 1.0: clamp so the sigma slice below stays within bounds
    }
    LOG_INFO("target t_enc is %zu steps", t_enc);
|
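    // Keep only the tail of the sigma schedule: img2img starts sampling from a partially noised latent,
    // and `strength` controls how many of the sample_steps are actually run.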
std::vector<float> sigma_sched; |
|
sigma_sched.assign(sigmas.begin() + sample_steps - t_enc - 1, sigmas.end()); |
|
|
|
sd_image_t* result_images = generate_image(sd_ctx, |
|
work_ctx, |
|
init_latent, |
|
prompt_c_str, |
|
negative_prompt_c_str, |
|
clip_skip, |
|
cfg_scale, |
|
guidance, |
|
width, |
|
height, |
|
sample_method, |
|
sigma_sched, |
|
seed, |
|
batch_count, |
|
control_cond, |
|
control_strength, |
|
style_ratio, |
|
normalize_input, |
|
input_id_images_path_c_str, |
|
skip_layers_vec, |
|
slg_scale, |
|
skip_layer_start, |
|
skip_layer_end); |
|
|
|
size_t t2 = ggml_time_ms(); |
|
|
|
LOG_INFO("img2img completed in %.2fs", (t1 - t0) * 1.0f / 1000); |
|
|
|
return result_images; |
|
} |
|
|
|
SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx, |
|
sd_image_t init_image, |
|
int width, |
|
int height, |
|
int video_frames, |
|
int motion_bucket_id, |
|
int fps, |
|
float augmentation_level, |
|
float min_cfg, |
|
float cfg_scale, |
|
enum sample_method_t sample_method, |
|
int sample_steps, |
|
float strength, |
|
int64_t seed) { |
|
if (sd_ctx == NULL) { |
|
return NULL; |
|
} |
|
|
|
LOG_INFO("img2vid %dx%d", width, height); |
|
|
|
std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps); |
|
|
|
struct ggml_init_params params; |
|
params.mem_size = static_cast<size_t>(10 * 1024) * 1024; |
|
params.mem_size += width * height * 3 * sizeof(float) * video_frames; |
|
params.mem_buffer = NULL; |
|
params.no_alloc = false; |
|
|
|
|
|
|
|
struct ggml_context* work_ctx = ggml_init(params); |
|
if (!work_ctx) { |
|
LOG_ERROR("ggml_init() failed"); |
|
return NULL; |
|
} |
|
|
|
if (seed < 0) { |
|
seed = (int)time(NULL); |
|
} |
|
|
|
sd_ctx->sd->rng->manual_seed(seed); |
|
|
|
int64_t t0 = ggml_time_ms(); |
|
|
|
SDCondition cond = sd_ctx->sd->get_svd_condition(work_ctx, |
|
init_image, |
|
width, |
|
height, |
|
fps, |
|
motion_bucket_id, |
|
augmentation_level); |
|
|
|
auto uc_crossattn = ggml_dup_tensor(work_ctx, cond.c_crossattn); |
|
ggml_set_f32(uc_crossattn, 0.f); |
|
|
|
auto uc_concat = ggml_dup_tensor(work_ctx, cond.c_concat); |
|
ggml_set_f32(uc_concat, 0.f); |
|
|
|
auto uc_vector = ggml_dup_tensor(work_ctx, cond.c_vector); |
|
|
|
SDCondition uncond = SDCondition(uc_crossattn, uc_vector, uc_concat); |
|
|
|
int64_t t1 = ggml_time_ms(); |
|
LOG_INFO("get_learned_condition completed, taking %d ms", t1 - t0); |
|
if (sd_ctx->sd->free_params_immediately) { |
|
sd_ctx->sd->clip_vision->free_params_buffer(); |
|
} |
|
|
|
sd_ctx->sd->rng->manual_seed(seed); |
|
int C = 4; |
|
int W = width / 8; |
|
int H = height / 8; |
|
struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, video_frames); |
|
ggml_set_f32(x_t, 0.f); |
|
|
|
struct ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, video_frames); |
|
ggml_tensor_set_f32_randn(noise, sd_ctx->sd->rng); |
|
|
|
LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]); |
|
struct ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx, |
|
x_t, |
|
noise, |
|
cond, |
|
uncond, |
|
{}, |
|
0.f, |
|
min_cfg, |
|
cfg_scale, |
|
0.f, |
|
sample_method, |
|
sigmas, |
|
-1, |
|
SDCondition(NULL, NULL, NULL)); |
|
|
|
int64_t t2 = ggml_time_ms(); |
|
LOG_INFO("sampling completed, taking %.2fs", (t2 - t1) * 1.0f / 1000); |
|
if (sd_ctx->sd->free_params_immediately) { |
|
sd_ctx->sd->diffusion_model->free_params_buffer(); |
|
} |
|
|
|
struct ggml_tensor* img = sd_ctx->sd->decode_first_stage(work_ctx, x_0); |
|
if (sd_ctx->sd->free_params_immediately) { |
|
sd_ctx->sd->first_stage_model->free_params_buffer(); |
|
} |
|
if (img == NULL) { |
|
ggml_free(work_ctx); |
|
return NULL; |
|
} |
|
|
|
sd_image_t* result_images = (sd_image_t*)calloc(video_frames, sizeof(sd_image_t)); |
|
if (result_images == NULL) { |
|
ggml_free(work_ctx); |
|
return NULL; |
|
} |
|
|
|
    for (int i = 0; i < video_frames; i++) {
|
auto img_i = ggml_view_3d(work_ctx, img, img->ne[0], img->ne[1], img->ne[2], img->nb[1], img->nb[2], img->nb[3] * i); |
|
|
|
result_images[i].width = width; |
|
result_images[i].height = height; |
|
result_images[i].channel = 3; |
|
result_images[i].data = sd_tensor_to_image(img_i); |
|
} |
|
ggml_free(work_ctx); |
|
|
|
int64_t t3 = ggml_time_ms(); |
|
|
|
LOG_INFO("img2vid completed in %.2fs", (t3 - t0) * 1.0f / 1000); |
|
|
|
return result_images; |
|
} |
|
|