---
license: mit
tags:
- pytorch
- stable-diffusion
- text2Image
- stabilityai/stable-diffusion-2-1
datasets:
- xchuan/text2image-fupo
language:
- en
base_model:
- stabilityai/stable-diffusion-2-1
pipeline_tag: text-to-image
---

# This LoRA was trained on stabilityai/stable-diffusion-2-1

## Training code

```python
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from datasets import load_dataset

dataset = load_dataset("xchuan/text2image-fupo", split="train")

from transformers import CLIPTokenizer
from huggingface_hub import login
# ========== LoRA libraries ==========
from peft import LoraConfig, PeftModel


login(token="<replace with your own Hugging Face token>", add_to_git_credential=True)

weight_dtype = torch.bfloat16
train_batch_size = 4
snr_gamma = 5  # SNR parameter: weighting coefficient for the signal-to-noise-weighted loss

# Set a random seed for reproducibility
seed = 1126
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

# Optimizer parameters
unet_learning_rate = 1e-6  # learning rate for the UNet LoRA parameters
text_encoder_learning_rate = 1e-4  # learning rate for the text encoder (unused: text-encoder training is disabled below)

# Learning-rate scheduler parameters
lr_scheduler_name = "cosine_with_restarts"  # cosine annealing with restarts: decay the LR and periodically restart it
lr_warmup_steps = 100  # ramp the learning rate up to its maximum over the first 100 steps
max_train_steps = 500  # total number of training steps
num_cycles = 1  # number of cosine cycles; with 1 the LR decays once without restarting

pretrained_model_name_or_path = "stabilityai/stable-diffusion-2-1"

# LoRA configuration
unet_lora_config = LoraConfig(
    r=32,  # LoRA rank: the dimension of the low-rank update matrices
    lora_alpha=16,  # scaling factor controlling how strongly the LoRA weights affect the model
    init_lora_weights="gaussian",
    target_modules=["to_k", "to_q", "to_v", "to_out.0"],  # the attention projections in the SD UNet
    lora_dropout=0  # LoRA dropout probability; 0 disables dropout
)
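# Note: PEFT scales the LoRA update by lora_alpha / r, so this configuration
# applies the low-rank delta at an effective scale of 16 / 32 = 0.5.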

from torchvision import transforms
from torch.utils.data import DataLoader

resolution = 512


train_transform = transforms.Compose([
    transforms.Resize(resolution, interpolation=transforms.InterpolationMode.BILINEAR),  # resize the shorter side
    transforms.CenterCrop(resolution),  # center-crop to resolution x resolution
    transforms.RandomHorizontalFlip(),  # random horizontal flip for augmentation
    transforms.ToTensor(),  # convert the PIL image to a [0, 1] tensor
    # Note: the reference SD training scripts additionally apply
    # transforms.Normalize([0.5], [0.5]) to map pixels into [-1, 1],
    # the range the VAE was trained on; this script omits that step.
])

# Load the tokenizer once, outside the collate function, rather than on every example
tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_name_or_path, subfolder="tokenizer")

def collate_fn(examples):
    pixel_values = []
    input_ids = []

    for example in examples:
        image_tensor = train_transform(example["image"].convert("RGB"))  # force 3 channels
        if not isinstance(image_tensor, torch.Tensor):
            print(f"Expected Tensor, got {type(image_tensor)} instead.")
            continue
        pixel_values.append(image_tensor)

        # Prepend the "fupo" trigger word and tokenize, padding to the model's max length (77 tokens for CLIP)
        input_text = "fupo:" + example["text"]
        encode_text = tokenizer(input_text, return_tensors="pt", padding="max_length", truncation=True)
        input_ids.append(encode_text["input_ids"].squeeze(0))

    # If no valid images were found, return empty tensors
    if len(pixel_values) == 0:
        return {"pixel_values": torch.empty(0), "input_ids": torch.empty(0)}

    pixel_values = torch.stack(pixel_values, dim=0).float()
    input_ids = torch.stack(input_ids, dim=0)
    return {"pixel_values": pixel_values, "input_ids": input_ids}


train_dataloader = DataLoader(dataset, shuffle=True, collate_fn=collate_fn, batch_size=train_batch_size)


def prepare_lora_model(unet_lora_config, pretrained_model_name_or_path, model_path=None, resume=False, merge_lora=False):
    """
    (1) Goal:
        - Load the full Stable Diffusion stack (tokenizer, noise scheduler, UNet, VAE,
          text encoder), attach the LoRA layers, and optionally merge the LoRA weights.

    (2) Parameters:
        - unet_lora_config: LoraConfig, the LoRA configuration object
        - pretrained_model_name_or_path: str, model name or path on Hugging Face
        - model_path: str, path to a previously saved checkpoint
        - resume: bool, whether to resume from a previous training run
        - merge_lora: bool, whether to merge the LoRA weights for inference

    (3) Returns:
        - tokenizer: CLIPTokenizer
        - noise_scheduler: DDPMScheduler
        - unet: UNet2DConditionModel
        - vae: AutoencoderKL
        - text_encoder: CLIPTextModel
    """
    # Load the noise scheduler, which controls how noise is added to and removed from the latents
    noise_scheduler = DDPMScheduler.from_pretrained(pretrained_model_name_or_path, subfolder="scheduler")

    # Load the tokenizer, which converts text captions into tokens
    tokenizer = CLIPTokenizer.from_pretrained(
        pretrained_model_name_or_path,
        subfolder="tokenizer"
    )

    # Load the CLIP text encoder, which turns text captions into feature vectors
    text_encoder = CLIPTextModel.from_pretrained(
        pretrained_model_name_or_path,
        torch_dtype=weight_dtype,
        subfolder="text_encoder"
    )

    # Load the VAE, which maps images to and from the latent space
    vae = AutoencoderKL.from_pretrained(
        pretrained_model_name_or_path,
        subfolder="vae"
    )

    # Load the UNet, which performs the denoising prediction
    unet = UNet2DConditionModel.from_pretrained(
        pretrained_model_name_or_path,
        torch_dtype=weight_dtype,
        subfolder="unet"
    )

    # Freeze the base model weights: VAE, text encoder, and UNet
    vae.requires_grad_(False)
    text_encoder.requires_grad_(False)
    unet.requires_grad_(False)
    
    # If resuming, load the weights from the previous training run
    if resume:
        if model_path is None or not os.path.exists(model_path):
            raise ValueError("A valid model_path must be provided when resume=True")
        # Load the LoRA adapter with PEFT's from_pretrained
        # text_encoder = PeftModel.from_pretrained(text_encoder, os.path.join(model_path, "text_encoder"))
        unet = PeftModel.from_pretrained(unet, os.path.join(model_path, "unet"))

        # Make only the LoRA parameters of the target modules trainable again
        # (the "lora" check keeps the frozen base weights, whose names also contain
        # the module names, from being unfrozen)
        target_modules = ["to_k", "to_q", "to_v", "to_out.0"]
        for name, param in unet.named_parameters():
            if "lora" in name and any(target_module in name for target_module in target_modules):
                param.requires_grad = True

        print(f"✅ Restored model weights from {model_path}")

    else:
        # Apply the LoRA configuration to the UNet
        unet.add_adapter(unet_lora_config)

        # Count and print the trainable parameters
        print("📊 UNet trainable parameters:")
        trainable_params = 0
        for name, param in unet.named_parameters():
            if param.requires_grad:
                trainable_params += param.numel()  # number of elements in this parameter tensor
                # print(f"trainable: {name}, shape: {param.shape}")

        print(f"Total trainable LoRA parameters: {trainable_params}")
    
    if merge_lora:
        # Merge the LoRA weights into the base model; only do this for inference
        # text_encoder = text_encoder.merge_and_unload()
        unet = unet.merge_and_unload()

        # Switch to evaluation mode
        text_encoder.eval()
        unet.eval()

    # Move the models to the device and cast them to the working dtype
    unet.to(device, dtype=weight_dtype)
    vae.to(device, dtype=weight_dtype)
    text_encoder.to(device, dtype=weight_dtype)

    return tokenizer, noise_scheduler, unet, vae, text_encoder

def prepare_optimizer(unet, text_encoder, unet_learning_rate=5e-4, text_encoder_learning_rate=1e-4):
    # Collect the trainable LoRA parameters from the UNet
    # (the text_encoder arguments are unused because text-encoder training is disabled above)
    unet_lora_layers = [p for p in unet.parameters() if p.requires_grad]

    # Group the trainable parameters and set their learning rate
    trainable_params = [
        {"params": unet_lora_layers, "lr": unet_learning_rate},
    ]

    # Use the AdamW optimizer
    optimizer = torch.optim.AdamW(trainable_params)

    return optimizer

import os
from diffusers.optimization import get_scheduler
from diffusers.training_utils import compute_snr
from diffusers import DDPMScheduler, AutoencoderKL, UNet2DConditionModel, StableDiffusionPipeline
from transformers import CLIPTextModel

project_name = "fupo"
dataset_name = "fupo"
# Root and main directories
root_dir = "./"  # current directory
main_dir = os.path.join(root_dir, "SD-2-1")  # main directory
# Project directory and checkpoint path
project_dir = os.path.join(main_dir, project_name)
model_path = os.path.join(project_dir, "logs", "checkpoint-last")

# Prepare the models
tokenizer, noise_scheduler, unet, vae, text_encoder = prepare_lora_model(
    unet_lora_config,
    pretrained_model_name_or_path,
    model_path,
    resume=False,
    merge_lora=False
)

# Prepare the optimizer
optimizer = prepare_optimizer(
    unet, 
    text_encoder, 
    unet_learning_rate=unet_learning_rate, 
    text_encoder_learning_rate=text_encoder_learning_rate
)

# Set up the learning-rate scheduler
lr_scheduler = get_scheduler(
    lr_scheduler_name,
    optimizer=optimizer,
    num_warmup_steps=lr_warmup_steps,
    num_training_steps=max_train_steps,
    num_cycles=num_cycles
)

print("✅ 模型和优化器准备完成!可以开始训练。")

import math
from tqdm.auto import tqdm
import torch.nn.functional as F
from peft.utils import get_peft_model_state_dict
from diffusers.utils import convert_state_dict_to_diffusers

accumulation_steps = 4  # number of gradient-accumulation steps
max_norm = 0.5  # maximum gradient norm for clipping
output_folder = os.path.join(project_dir, "logs")
# Disable tokenizer parallelism to avoid fork warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Initialization
global_step = 0
best_loss = float("inf")  # track the lowest loss seen so far

# Progress bar for training
progress_bar = tqdm(
    range(max_train_steps),
    desc="Training steps",
)

# Training loop
for epoch in range(math.ceil(max_train_steps / len(train_dataloader))):
    # Calling train() each epoch matters if you add evaluation (which switches the model to eval mode) during training
    unet.train()

    for step, batch in enumerate(train_dataloader):
        if global_step >= max_train_steps:
            break
        
        # Encode the images into latent representations
        latents = vae.encode(batch["pixel_values"].to(device, dtype=weight_dtype)).latent_dist.sample()
        latents = latents * vae.config.scaling_factor  # apply the VAE's latent scaling factor

        # Add noise to the latents to produce the noisy training inputs
        noise = torch.randn_like(latents)  # random noise with the same shape as the latents
        timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (latents.shape[0],), device=device).long()
        noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
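        # add_noise implements the DDPM forward process:
        #   x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * noise
        # where alpha_bar_t is the cumulative product of the schedule's alphas at timestep t.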

        # Get the text embeddings
        encoder_hidden_states = text_encoder(batch["input_ids"].to(device), return_dict=False)[0]
        assert encoder_hidden_states is not None, "Encoder hidden states should not be None"

        # Compute the training target
        if noise_scheduler.config.prediction_type == "epsilon":
            target = noise  # the model predicts the added noise
        elif noise_scheduler.config.prediction_type == "v_prediction":
            target = noise_scheduler.get_velocity(latents, noise, timesteps)  # the model predicts the velocity (SD 2.1 ships with v_prediction)

        # UNet forward pass; detect_anomaly is a debugging aid that slows training
        # and can be removed once the run is stable
        with torch.autograd.detect_anomaly():
            model_pred = unet(noisy_latents, timesteps, encoder_hidden_states, return_dict=False)[0]
        assert model_pred is not None, "Model prediction should not be None"

        # Compute the loss
        if not snr_gamma:
            loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
        else:
            # Compute the signal-to-noise ratio (SNR) and use it to weight the MSE loss
            snr = compute_snr(noise_scheduler, timesteps)
            mse_loss_weights = torch.stack([snr, snr_gamma * torch.ones_like(timesteps)], dim=1).min(dim=1)[0]
            if noise_scheduler.config.prediction_type == "epsilon":
                mse_loss_weights = mse_loss_weights / snr
            elif noise_scheduler.config.prediction_type == "v_prediction":
                mse_loss_weights = mse_loss_weights / (snr + 1)

            # Weighted MSE loss: reduce per-sample, then apply the SNR weights
            loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
            loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights
            loss = loss.mean()
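        # This is the min-SNR-gamma weighting strategy: each timestep's loss is weighted by
        #   min(SNR_t, gamma) / SNR_t        for epsilon prediction, or
        #   min(SNR_t, gamma) / (SNR_t + 1)  for v-prediction,
        # which caps the influence of low-noise (high-SNR) timesteps.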

        # Backward pass with gradient accumulation; scale the loss so the accumulated
        # gradients average (rather than sum) over the accumulation window
        (loss / accumulation_steps).backward()
        if (global_step + 1) % accumulation_steps == 0:
            # Clip once per optimizer step, after the gradients are fully accumulated
            torch.nn.utils.clip_grad_norm_(unet.parameters(), max_norm)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
        progress_bar.update(1)
        global_step += 1

        if global_step % 100 == 0:
            # Save a checkpoint whenever the current loss is the lowest seen so far
            if loss.item() < best_loss:
                best_loss = loss.item()
                save_path = os.path.join(output_folder, "best_checkpoint")
                os.makedirs(save_path, exist_ok=True)

                # Convert the PEFT state dict to diffusers format and save the LoRA weights
                unet_lora_state_dict = convert_state_dict_to_diffusers(get_peft_model_state_dict(unet))
                StableDiffusionPipeline.save_lora_weights(
                    save_directory=save_path,
                    unet_lora_layers=unet_lora_state_dict,
                    safe_serialization=True,
                )
                # text_encoder.save_pretrained(os.path.join(save_path, "text_encoder"))
                print(f"💾 Saved best model so far to {save_path}, current loss: {best_loss}")

# Save the final model to checkpoint-last
save_path = os.path.join(output_folder, "checkpoint-last")
os.makedirs(save_path, exist_ok=True)
unet_lora_state_dict = convert_state_dict_to_diffusers(get_peft_model_state_dict(unet))
StableDiffusionPipeline.save_lora_weights(
    save_directory=save_path,
    unet_lora_layers=unet_lora_state_dict,
    safe_serialization=True,
)
print(f"💾 Saved final model to {save_path}")

```
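
## Inference example

A minimal sketch of loading the trained LoRA for generation, assuming the `checkpoint-last` directory produced by the training script above (adjust `lora_path` to wherever your weights live). `load_lora_weights` is the standard diffusers entry point for LoRA files saved with `save_lora_weights`.

```python
import torch
from diffusers import StableDiffusionPipeline

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the base model the LoRA was trained on
pipe = StableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1",
    torch_dtype=torch.bfloat16,
).to(device)

# Path to the LoRA weights saved by the training script (hypothetical local path)
lora_path = "./SD-2-1/fupo/logs/checkpoint-last"
pipe.load_lora_weights(lora_path)

# Captions were trained with the "fupo:" trigger prefix, so include it in prompts
image = pipe("fupo: a portrait photo", num_inference_steps=30).images[0]
image.save("fupo_sample.png")
```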