File size: 30,797 Bytes
fcc02a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
import copy
import os
import random
from collections import OrderedDict
from typing import Union

from PIL import Image
from diffusers import T2IAdapter
from torchvision.transforms import transforms
from tqdm import tqdm

from toolkit.basic import value_map
from toolkit.config_modules import SliderConfig
from toolkit.data_transfer_object.data_loader import DataLoaderBatchDTO
from toolkit.sd_device_states_presets import get_train_sd_device_state_preset
from toolkit.train_tools import get_torch_dtype, apply_snr_weight, apply_learnable_snr_gos
import gc
from toolkit import train_tools
from toolkit.prompt_utils import \
    EncodedPromptPair, ACTION_TYPES_SLIDER, \
    EncodedAnchor, concat_prompt_pairs, \
    concat_anchors, PromptEmbedsCache, encode_prompts_to_cache, build_prompt_pair_batch_from_cache, split_anchors, \
    split_prompt_pairs

import torch
from .BaseSDTrainProcess import BaseSDTrainProcess


def flush():
    """Free unreachable Python objects, then release cached CUDA memory.

    The garbage collector runs first so that tensors kept alive only by
    reference cycles are destroyed before the CUDA caching allocator is asked
    to give back its unoccupied blocks. In the opposite order the memory freed
    by the collector would stay cached until the next call.
    """
    gc.collect()
    torch.cuda.empty_cache()


# Preprocessing pipeline for adapter images: a single ToTensor conversion.
adapter_transforms = transforms.Compose([transforms.ToTensor()])


class TrainSliderProcess(BaseSDTrainProcess):
    def __init__(self, process_id: int, job, config: OrderedDict):
        """Initialise slider-training state on top of the base SD train process.

        Reads the ``device`` and ``slider`` config entries, prepares empty
        prompt containers, caps the number of slider targets at the configured
        step count, and builds two device-state presets: one with nothing
        trainable and one with the LoRA enabled.
        """
        super().__init__(process_id, job, config)

        # step / prompt bookkeeping
        self.prompt_txt_list = None
        self.step_num = 0
        self.start_step = 0

        # `device` config entry, with the job's device passed as the fallback
        self.device = self.get_conf('device', self.job.device)
        self.device_torch = torch.device(self.device)

        self.slider_config = SliderConfig(**self.get_conf('slider', {}))
        self.prompt_cache = PromptEmbedsCache()
        self.prompt_pairs: list[EncodedPromptPair] = []
        self.anchor_pairs: list[EncodedAnchor] = []
        # keep track of prompt chunk size
        self.prompt_chunk_size = 1

        # There can be more targets than steps (this can happen because of
        # permutations on shuffling); keep only the first `steps` targets.
        max_steps = self.train_config.steps
        if len(self.slider_config.targets) > max_steps:
            self.slider_config.targets = self.slider_config.targets[:max_steps]

        # Flags that are identical in both presets: the text encoder, adapter
        # and embedding are never trained by this process.
        untrained_parts = dict(
            train_text_encoder=False,
            train_adapter=False,
            train_embedding=False,
        )

        # preset with nothing trainable
        self.eval_slider_device_state = get_train_sd_device_state_preset(
            self.device_torch,
            train_unet=False,
            cached_latents=self.is_latents_cached,
            train_lora=False,
            **untrained_parts,
        )

        # preset with the LoRA (and optionally the unet) trainable
        self.train_slider_device_state = get_train_sd_device_state_preset(
            self.device_torch,
            train_unet=self.train_config.train_unet,
            cached_latents=self.is_latents_cached,
            train_lora=True,
            **untrained_parts,
        )

    def before_model_load(self):
        """No-op: this trainer does nothing in the before-model-load hook."""
        return None

    def hook_before_train_loop(self):
        """Encode all slider prompts once and build the prompt pairs for training.

        In order, this:

        1. Loads neutral prompts from ``slider_config.prompt_file`` when one is
           set (blank lines dropped). If no ``prompt_tensors`` file is
           configured, the list is shuffled and trimmed to
           ``train_config.steps`` entries.
        2. Builds every prompt string needed for each (neutral, target)
           combination, de-duplicates them and passes them to
           ``encode_prompts_to_cache``.
        3. Builds the prompt pairs from the cache and stores them, moved to the
           CPU, in ``self.prompt_pairs`` (concatenated into one entry per
           target when ``slider_config.batch_full_slide`` is set).
        4. Encodes the configured anchors and, if there are any, stores them in
           ``self.anchor_pairs``.
        5. Moves the text encoder(s) to the CPU and, when a data loader exists,
           puts the VAE in eval mode on the training device.
        """
        # read line by line from file
        if self.slider_config.prompt_file:
            self.print(f"Loading prompt file from {self.slider_config.prompt_file}")
            with open(self.slider_config.prompt_file, 'r', encoding='utf-8') as f:
                # strip surrounding whitespace and drop empty lines
                self.prompt_txt_list = [line.strip() for line in f if line.strip()]

            self.print(f"Found {len(self.prompt_txt_list)} prompts.")

            if not self.slider_config.prompt_tensors:
                print(f"Prompt tensors not found. Building prompt tensors for {self.train_config.steps} steps.")
                # shuffle, then trim the list to our max steps
                random.shuffle(self.prompt_txt_list)
                self.prompt_txt_list = self.prompt_txt_list[:self.train_config.steps]

        cache = PromptEmbedsCache()
        print("Building prompt cache")

        # get encoded latents for our prompts
        with torch.no_grad():
            # List of neutrals. Comes from the prompt file; without a file, or
            # with a file that held no usable lines, fall back to one empty
            # neutral so prompt pairs are still built for every target.
            neutral_list = self.prompt_txt_list or [""]

            # build the prompts to cache
            prompts_to_cache = []
            for neutral in neutral_list:
                for target in self.slider_config.targets:
                    prompts_to_cache.extend([
                        f"{target.target_class}",  # target_class
                        f"{target.target_class} {neutral}",  # target_class with neutral
                        f"{target.positive}",  # positive_target
                        f"{target.positive} {neutral}",  # positive_target with neutral
                        f"{target.negative}",  # negative_target
                        f"{target.negative} {neutral}",  # negative_target with neutral
                        f"{neutral}",  # neutral
                        f"{target.positive} {target.negative}",  # both targets
                        f"{target.negative} {target.positive}",  # both targets reverse
                    ])

            # remove duplicates while keeping first-seen order
            prompts_to_cache = list(dict.fromkeys(prompts_to_cache))

            # TODO: trimming the prompts to max steps here can break if we have
            # more targets than steps; it should be fixed by reducing
            # permutations, but could still happen with low steps.
            # prompts_to_cache = prompts_to_cache[:self.train_config.steps]

            # encode them
            cache = encode_prompts_to_cache(
                prompt_list=prompts_to_cache,
                sd=self.sd,
                cache=cache,
                prompt_tensor_file=self.slider_config.prompt_tensors
            )

            # With batch_full_slide the pairs of one target are concatenated,
            # which allows the entire 4 part process to run in one shot (for
            # slider); otherwise they are handled one at a time.
            batch_full_slide = self.slider_config.batch_full_slide
            self.prompt_chunk_size = 4 if batch_full_slide else 1

            prompt_pairs = []
            for neutral in tqdm(neutral_list, desc="Building Prompt Pairs", leave=False):
                for target in self.slider_config.targets:
                    prompt_pair_batch = build_prompt_pair_batch_from_cache(
                        cache=cache,
                        target=target,
                        neutral=neutral,
                    )
                    if batch_full_slide:
                        prompt_pairs.append(concat_prompt_pairs(prompt_pair_batch).to('cpu'))
                    else:
                        prompt_pairs.extend(x.to('cpu') for x in prompt_pair_batch)

            # setup anchors
            anchor_pairs = []
            for anchor in self.slider_config.anchors:
                # make sure both anchor prompts are in the cache
                for prompt in (anchor.prompt, anchor.neg_prompt):
                    if cache[prompt] is None:
                        cache[prompt] = self.sd.encode_prompt(prompt)

                anchor_batch = []
                # we get the prompt pair multiplier from first prompt pair
                # since they are all the same. We need to match their network polarity
                prompt_pair_multipliers = prompt_pairs[0].multiplier_list
                for prompt_multiplier in prompt_pair_multipliers:
                    # match the network multiplier polarity
                    anchor_scalar = 1.0 if prompt_multiplier > 0 else -1.0
                    anchor_batch.append(
                        EncodedAnchor(
                            prompt=cache[anchor.prompt],
                            neg_prompt=cache[anchor.neg_prompt],
                            multiplier=anchor.multiplier * anchor_scalar
                        )
                    )

                anchor_pairs.append(concat_anchors(anchor_batch).to('cpu'))

            # only replace the stored anchors when some were configured
            if anchor_pairs:
                self.anchor_pairs = anchor_pairs

        # move to cpu to save vram
        # We don't need text encoder anymore, but keep it on cpu for sampling
        if isinstance(self.sd.text_encoder, list):
            for encoder in self.sd.text_encoder:
                encoder.to("cpu")
        else:
            self.sd.text_encoder.to("cpu")
        self.prompt_cache = cache
        self.prompt_pairs = prompt_pairs
        flush()
        if self.data_loader is not None:
            # we will have images, prep the vae
            self.sd.vae.eval()
            self.sd.vae.to(self.device_torch)
        # end hook_before_train_loop

    def before_dataset_load(self):
        """Load the frozen depth T2I adapter when the slider config asks for it.

        Does nothing unless ``slider_config.use_adapter`` equals ``'depth'``.
        Otherwise the adapter matching the base model (SDXL or not, per
        ``model_config.is_xl``) is loaded with the training dtype, moved to the
        training device, put in eval mode and excluded from gradient
        computation, and stored on ``self.t2i_adapter``.
        """
        if self.slider_config.use_adapter != 'depth':
            return

        print("Loading T2I Adapter for depth")
        # called before LoRA network is loaded but after model is loaded
        # attach the adapter here so it is there before we load the network
        if self.model_config.is_xl:
            adapter_path = 'TencentARC/t2i-adapter-depth-midas-sdxl-1.0'
        else:
            adapter_path = 'TencentARC/t2iadapter_depth_sd15v2'

        print(f"Loading T2I Adapter from {adapter_path}")

        # dont name this adapter since we are not training it
        # NOTE(review): this call used to pass `varient="fp16"`. That is not
        # the spelling of the loader's `variant` parameter, so it presumably
        # never selected the fp16 weight files. It is dropped here rather than
        # corrected; to load the fp16 files, pass `variant="fp16"` after
        # confirming both repos publish that variant.
        self.t2i_adapter = T2IAdapter.from_pretrained(
            adapter_path, torch_dtype=get_torch_dtype(self.train_config.dtype)
        ).to(self.device_torch)
        self.t2i_adapter.eval()
        self.t2i_adapter.requires_grad_(False)
        flush()

    @torch.no_grad()
    def get_adapter_images(self, batch: Union[None, 'DataLoaderBatchDTO']):
        """Load the adapter image that belongs to every file item of ``batch``.

        For each ``batch.file_items`` entry, an image with the same base name is
        looked up in ``slider_config.adapter_img_dir`` (extensions tried in
        order: .jpg, .jpeg, .png, .webp). Every image is resized so that it
        covers the crop size of the first batch item, center-cropped to exactly
        that size and converted with ``adapter_transforms``.

        Args:
            batch: the batch whose file items need adapter images. It is
                dereferenced unconditionally, so it must not be ``None``.

        Returns:
            The stacked image tensors, moved to ``self.device_torch`` with the
            training dtype.

        Raises:
            FileNotFoundError: if a batch file has no adapter image. Skipping it
                would leave the result with fewer entries than the batch.
        """
        img_ext_list = ['.jpg', '.jpeg', '.png', '.webp']
        adapter_folder_path = self.slider_config.adapter_img_dir

        adapter_images = []
        for file_item in batch.file_items:
            base_name = os.path.basename(file_item.path)
            # Strip only the real extension so dotted names ("shot.01.png")
            # keep their full stem. The text before the first dot is still
            # tried afterwards because that is what this lookup used before.
            stems = [os.path.splitext(base_name)[0]]
            legacy_stem = base_name.split('.')[0]
            if legacy_stem not in stems:
                stems.append(legacy_stem)

            candidates = (
                os.path.join(adapter_folder_path, stem + ext)
                for stem in stems
                for ext in img_ext_list
            )
            found_path = next((path for path in candidates if os.path.exists(path)), None)
            if found_path is None:
                raise FileNotFoundError(
                    f"No adapter image found for '{file_item.path}' in '{adapter_folder_path}'"
                )
            adapter_images.append(found_path)

        width, height = batch.file_items[0].crop_width, batch.file_items[0].crop_height
        crop_fn = transforms.CenterCrop((height, width))

        adapter_tensors = []
        for adapter_image in adapter_images:
            # the context manager closes the file once the resized copy exists
            with Image.open(adapter_image) as img:
                # Scale so the image covers the target in both dimensions, then
                # crop the excess. Aspect ratios are compared by integer
                # cross-multiplication; the max() guards against float
                # truncation leaving the scaled side one pixel short.
                if img.width * height > img.height * width:
                    # image is relatively wider than the target: match heights
                    new_height = height
                    new_width = max(width, int(img.width * (height / img.height)))
                else:
                    # image is relatively taller (or same aspect): match widths
                    new_width = width
                    new_height = max(height, int(img.height * (width / img.width)))
                resized = img.resize((new_width, new_height))

            # crop the center to match batch, then convert to a tensor
            adapter_tensors.append(adapter_transforms(crop_fn(resized)))

        # stack them
        return torch.stack(adapter_tensors).to(
            self.device_torch, dtype=get_torch_dtype(self.train_config.dtype)
        )

    def hook_train_loop(self, batch: Union['DataLoaderBatchDTO', None]):
        """Run a single slider training step and return its loss dict.

        Outline of what the body does:

        1. With the eval device state and gradients disabled, a random prompt
           pair and a random entry of ``slider_config.resolutions`` are chosen.
        2. Starting latents and a timestep come either from ``batch`` (via
           ``process_general_training_batch``) or, when ``batch`` is None, from
           fresh noise that is partially denoised with the target-class prompt
           while the network is asserted to be inactive.
        3. Still without gradients, one ``predict_noise`` call over the
           positive-target, empty and negative-target prompt embeddings yields
           the fixed reference predictions.
        4. With the train device state and the network active, the target-class
           prediction is compared (MSE) against
           ``neutral + sign * (positive_latents - unconditional_latents)``,
           where sign is -1 for ERASE_NEGATIVE and +1 for ENHANCE_NEGATIVE.
           Optional mask and SNR weighting are applied before ``backward()``.
        5. The optimizer and LR scheduler are stepped, the prompt pair is moved
           back to the CPU and the network multiplier is reset to 1.0.

        Args:
            batch: a ``DataLoaderBatchDTO``, a list whose first element is used,
                or ``None`` to train on generated latents.

        Returns:
            OrderedDict with ``'loss'``: the mean of the per-chunk loss values.
        """
        if isinstance(batch, list):
            batch = batch[0]
        # set to eval mode
        self.sd.set_device_state(self.eval_slider_device_state)
        with torch.no_grad():
            dtype = get_torch_dtype(self.train_config.dtype)

            # get a random pair
            prompt_pair: EncodedPromptPair = self.prompt_pairs[
                torch.randint(0, len(self.prompt_pairs), (1,)).item()
            ]
            # move to device and dtype
            prompt_pair.to(self.device_torch, dtype=dtype)

            # get a random resolution
            height, width = self.slider_config.resolutions[
                torch.randint(0, len(self.slider_config.resolutions), (1,)).item()
            ]
            if self.train_config.gradient_checkpointing:
                # may get disabled elsewhere
                self.sd.unet.enable_gradient_checkpointing()

        noise_scheduler = self.sd.noise_scheduler
        optimizer = self.optimizer
        lr_scheduler = self.lr_scheduler

        # NOTE(review): loss_function is never used below; the loss is computed
        # with torch.nn.functional.mse_loss instead.
        loss_function = torch.nn.MSELoss()

        # extra keyword arguments for get_noise_pred; only filled in when
        # adapter images are used (see below)
        pred_kwargs = {}

        def get_noise_pred(neg, pos, gs, cts, dn):
            # neg/pos: negative and positive prompt embeds, gs: guidance scale,
            # cts: timestep, dn: latents to predict noise for
            down_kwargs = copy.deepcopy(pred_kwargs)
            if 'down_block_additional_residuals' in down_kwargs:
                dbr_batch_size = down_kwargs['down_block_additional_residuals'][0].shape[0]
                if dbr_batch_size != dn.shape[0]:
                    # repeat the residuals until they cover twice the latent
                    # batch (presumably because of the cfg doubling - confirm)
                    amount_to_add = int(dn.shape[0] * 2 / dbr_batch_size)
                    down_kwargs['down_block_additional_residuals'] = [
                        torch.cat([sample.clone()] * amount_to_add) for sample in
                        down_kwargs['down_block_additional_residuals']
                    ]
            return self.sd.predict_noise(
                latents=dn,
                text_embeddings=train_tools.concat_prompt_embeddings(
                    neg,  # negative prompt
                    pos,  # positive prompt
                    self.train_config.batch_size,
                ),
                timestep=cts,
                guidance_scale=gs,
                **down_kwargs
            )

        with torch.no_grad():
            adapter_images = None
            self.sd.unet.eval()

            # for a complete slider, the batch size is 4 to begin with now
            true_batch_size = prompt_pair.target_class.text_embeds.shape[0] * self.train_config.batch_size
            from_batch = False
            if batch is not None:
                # training from a batch of images, not generating ourselves
                from_batch = True
                noisy_latents, noise, timesteps, conditioned_prompts, imgs = self.process_general_training_batch(batch)
                if self.slider_config.adapter_img_dir is not None:
                    adapter_images = self.get_adapter_images(batch)
                    adapter_strength_min = 0.9
                    adapter_strength_max = 1.0

                    def rand_strength(sample):
                        # scale the sample by a random factor that value_map
                        # takes from [0, 1] to [adapter_strength_min, adapter_strength_max]
                        adapter_conditioning_scale = torch.rand(
                            (1,), device=self.device_torch, dtype=dtype
                        )

                        adapter_conditioning_scale = value_map(
                            adapter_conditioning_scale,
                            0.0,
                            1.0,
                            adapter_strength_min,
                            adapter_strength_max
                        )
                        return sample.to(self.device_torch, dtype=dtype).detach() * adapter_conditioning_scale

                    down_block_additional_residuals = self.t2i_adapter(adapter_images)
                    down_block_additional_residuals = [
                        rand_strength(sample) for sample in down_block_additional_residuals
                    ]
                    pred_kwargs['down_block_additional_residuals'] = down_block_additional_residuals

                # denoised_latents = torch.cat([noisy_latents] * self.prompt_chunk_size, dim=0)
                denoised_latents = noisy_latents
                current_timestep = timesteps
            else:
                # no image batch: generate noise and partially denoise it ourselves
                if self.train_config.noise_scheduler == 'flowmatch':
                    linear_timesteps = any([
                        self.train_config.linear_timesteps,
                        self.train_config.linear_timesteps2,
                        self.train_config.timestep_type == 'linear',
                    ])

                    timestep_type = 'linear' if linear_timesteps else None
                    if timestep_type is None:
                        timestep_type = self.train_config.timestep_type

                    # make fake latents
                    l = torch.randn(
                        true_batch_size, 16, height, width
                    ).to(self.device_torch, dtype=dtype)

                    self.sd.noise_scheduler.set_train_timesteps(
                        self.train_config.max_denoising_steps,
                        device=self.device_torch,
                        timestep_type=timestep_type,
                        latents=l
                    )
                else:
                    self.sd.noise_scheduler.set_timesteps(
                        self.train_config.max_denoising_steps, device=self.device_torch
                    )

                # get a random number of steps
                # NOTE(review): the randint range [1, max_denoising_steps - 1) is
                # empty when max_denoising_steps <= 2.
                timesteps_to = torch.randint(
                    1, self.train_config.max_denoising_steps - 1, (1,)
                ).item()

                # get noise
                noise = self.sd.get_latent_noise(
                    pixel_height=height,
                    pixel_width=width,
                    batch_size=true_batch_size,
                    noise_offset=self.train_config.noise_offset,
                ).to(self.device_torch, dtype=dtype)

                # get latents
                latents = noise * self.sd.noise_scheduler.init_noise_sigma
                latents = latents.to(self.device_torch, dtype=dtype)

                assert not self.network.is_active
                self.sd.unet.eval()
                # pass the multiplier list to the network
                # double up since we are doing cfg
                self.network.multiplier = prompt_pair.multiplier_list + prompt_pair.multiplier_list
                denoised_latents = self.sd.diffuse_some_steps(
                    latents,  # pass simple noise latents
                    prompt_pair.target_class,
                    start_timesteps=0,
                    total_timesteps=timesteps_to,
                    guidance_scale=3,
                    bypass_guidance_embedding=False
                )
                # switch the scheduler to 1000 timesteps for the training prediction
                if hasattr(self.sd.noise_scheduler, 'set_train_timesteps'):
                    noise_scheduler.set_train_timesteps(1000, device=self.device_torch)
                else:
                    noise_scheduler.set_timesteps(1000)

                # map the number of denoising steps taken onto the 1000-step schedule
                current_timestep_index = int(timesteps_to * 1000 / self.train_config.max_denoising_steps)
                current_timestep = noise_scheduler.timesteps[current_timestep_index]

            # split the latents into our prompt pair chunks
            # denoised_latent_chunks = torch.chunk(denoised_latents, self.prompt_chunk_size, dim=0)
            # denoised_latent_chunks = [x.detach() for x in denoised_latent_chunks]
            denoised_latent_chunks = [denoised_latents]

            # flush()  # 4.2GB to 3GB on 512x512
            # default mask keeps every latent fully weighted
            mask_multiplier = torch.ones((denoised_latents.shape[0], 1, 1, 1), device=self.device_torch, dtype=dtype)
            has_mask = False
            if batch and batch.mask_tensor is not None:
                with self.timer('get_mask_multiplier'):
                    # upsampling not supported for bfloat16
                    mask_multiplier = batch.mask_tensor.to(self.device_torch, dtype=torch.float16).detach()
                    # scale down to the size of the latents, mask multiplier shape(bs, 1, width, height), noisy_latents shape(bs, channels, width, height)
                    mask_multiplier = torch.nn.functional.interpolate(
                        mask_multiplier, size=(noisy_latents.shape[2], noisy_latents.shape[3])
                    )
                    # expand to match latents
                    mask_multiplier = mask_multiplier.expand(-1, noisy_latents.shape[1], -1, -1)
                    mask_multiplier = mask_multiplier.to(self.device_torch, dtype=dtype).detach()
                    has_mask = True

            if has_mask:
                # prediction used as the target for the area outside the mask
                unmasked_target = get_noise_pred(
                    prompt_pair.positive_target,  # negative prompt
                    prompt_pair.target_class,  # positive prompt
                    1,
                    current_timestep,
                    denoised_latents
                )
                unmasked_target = unmasked_target.detach()
                unmasked_target.requires_grad = False
            else:
                unmasked_target = None

            # 4.20 GB RAM for 512x512
            # positive_latents = get_noise_pred(
            #     prompt_pair.positive_target,  # negative prompt
            #     prompt_pair.negative_target,  # positive prompt
            #     1,
            #     current_timestep,
            #     denoised_latents
            # )
            # positive_latents = positive_latents.detach()
            # positive_latents.requires_grad = False

            # neutral_latents = get_noise_pred(
            #     prompt_pair.positive_target,  # negative prompt
            #     prompt_pair.empty_prompt,  # positive prompt (normally neutral
            #     1,
            #     current_timestep,
            #     denoised_latents
            # )
            # neutral_latents = neutral_latents.detach()
            # neutral_latents.requires_grad = False

            # unconditional_latents = get_noise_pred(
            #     prompt_pair.positive_target,  # negative prompt
            #     prompt_pair.positive_target,  # positive prompt
            #     1,
            #     current_timestep,
            #     denoised_latents
            # )
            # unconditional_latents = unconditional_latents.detach()
            # unconditional_latents.requires_grad = False

            # we just need positive target, negative target, and empty prompt to calculate all
            # since we are in no grad, we can easily do it in a single step
            # order of the combined embeddings: positive_target, empty_prompt, negative_target
            embeddings = train_tools.concat_prompt_embeddings(
                prompt_pair.positive_target,
                prompt_pair.empty_prompt,
                1
            )
            embeddings = train_tools.concat_prompt_embeddings(
                embeddings,
                prompt_pair.negative_target,
                1
            )
            # NOTE(review): on the generated-latents path current_timestep is a
            # single element indexed out of noise_scheduler.timesteps; confirm it
            # has a dimension for torch.cat to concatenate along.
            all_pred = self.sd.predict_noise(
                latents=torch.cat([denoised_latents] * 3, dim=0),
                text_embeddings=embeddings,
                timestep=torch.cat([current_timestep] * 3, dim=0),
            )
            all_pred = all_pred.detach()
            all_pred.requires_grad = False
            positive_pred, neutral_pred, unconditional_pred = torch.chunk(all_pred, 3, dim=0)

            # doing them backward here as it was originally for erasing
            # (the third chunk becomes positive_latents, the first becomes
            # unconditional_latents)
            positive_latents = unconditional_pred
            neutral_latents = neutral_pred
            unconditional_latents = positive_pred


            denoised_latents = denoised_latents.detach()

        self.sd.set_device_state(self.train_slider_device_state)
        self.sd.unet.train()
        # start accumulating gradients
        self.optimizer.zero_grad(set_to_none=True)

        # never assigned anything else in this method, so the anchor-loss
        # branches further down are currently inactive
        anchor_loss_float = None

        with torch.no_grad():
            if self.slider_config.low_ram:
                # split everything into prompt_chunk_size pieces that are
                # processed one after another
                prompt_pair_chunks = split_prompt_pairs(prompt_pair.detach(), self.prompt_chunk_size)
                # NOTE(review): denoised_latent_chunks stays the single-element
                # list built above while the other tensors are chunked; the
                # assert after this block fails if split_prompt_pairs returns
                # more than one chunk - confirm.
                denoised_latent_chunks = denoised_latent_chunks  # just to have it in one place
                positive_latents_chunks = torch.chunk(positive_latents.detach(), self.prompt_chunk_size, dim=0)
                neutral_latents_chunks = torch.chunk(neutral_latents.detach(), self.prompt_chunk_size, dim=0)
                unconditional_latents_chunks = torch.chunk(
                    unconditional_latents.detach(),
                    self.prompt_chunk_size,
                    dim=0
                )
                mask_multiplier_chunks = torch.chunk(mask_multiplier, self.prompt_chunk_size, dim=0)
                if unmasked_target is not None:
                    unmasked_target_chunks = torch.chunk(unmasked_target, self.prompt_chunk_size, dim=0)
                else:
                    unmasked_target_chunks = [None for _ in range(self.prompt_chunk_size)]
            else:
                # run through in one instance
                prompt_pair_chunks = [prompt_pair.detach()]
                denoised_latent_chunks = [torch.cat(denoised_latent_chunks, dim=0).detach()]
                positive_latents_chunks = [positive_latents.detach()]
                neutral_latents_chunks = [neutral_latents.detach()]
                unconditional_latents_chunks = [unconditional_latents.detach()]
                mask_multiplier_chunks = [mask_multiplier]
                unmasked_target_chunks = [unmasked_target]

            # flush()
        assert len(prompt_pair_chunks) == len(denoised_latent_chunks)
        # 3.28 GB RAM for 512x512
        with self.network:
            assert self.network.is_active
            loss_list = []
            for prompt_pair_chunk, \
                    denoised_latent_chunk, \
                    positive_latents_chunk, \
                    neutral_latents_chunk, \
                    unconditional_latents_chunk, \
                    mask_multiplier_chunk, \
                    unmasked_target_chunk \
                    in zip(
                prompt_pair_chunks,
                denoised_latent_chunks,
                positive_latents_chunks,
                neutral_latents_chunks,
                unconditional_latents_chunks,
                mask_multiplier_chunks,
                unmasked_target_chunks
            ):
                self.network.multiplier = prompt_pair_chunk.multiplier_list

                # prediction made with the network active; this is the only
                # prediction in the step computed with gradients enabled
                target_latents = self.sd.predict_noise(
                    latents=denoised_latent_chunk.detach(),
                    text_embeddings=prompt_pair_chunk.target_class,
                    timestep=current_timestep,
                )

                guidance_scale = 1.0

                offset = guidance_scale * (positive_latents_chunk - unconditional_latents_chunk)

                # make offset multiplier based on actions
                # NOTE(review): actions other than the two handled here add no
                # entry, which would make the view() below fail - confirm no
                # other action types can occur.
                offset_multiplier_list = []
                for action in prompt_pair_chunk.action_list:
                    if action == ACTION_TYPES_SLIDER.ERASE_NEGATIVE:
                        offset_multiplier_list += [-1.0]
                    elif action == ACTION_TYPES_SLIDER.ENHANCE_NEGATIVE:
                        offset_multiplier_list += [1.0]

                offset_multiplier = torch.tensor(offset_multiplier_list).to(offset.device, dtype=offset.dtype)
                # make offset multiplier match rank of offset
                offset_multiplier = offset_multiplier.view(offset.shape[0], 1, 1, 1)
                offset *= offset_multiplier

                # offset_neutral aliases neutral_latents_chunk, so the in-place
                # add below also changes that chunk
                offset_neutral = neutral_latents_chunk
                # offsets are already adjusted on a per-batch basis
                offset_neutral += offset
                offset_neutral = offset_neutral.detach().requires_grad_(False)

                # 16.15 GB RAM for 512x512 -> 4.20GB RAM for 512x512 with new grad_checkpointing
                loss = torch.nn.functional.mse_loss(target_latents.float(), offset_neutral.float(), reduction="none")

                # do inverted mask to preserve non masked
                if has_mask and unmasked_target_chunk is not None:
                    loss = loss * mask_multiplier_chunk
                    # match the mask unmasked_target_chunk
                    mask_target_loss = torch.nn.functional.mse_loss(
                        target_latents.float(),
                        unmasked_target_chunk.float(),
                        reduction="none"
                    )
                    mask_target_loss = mask_target_loss * (1.0 - mask_multiplier_chunk)
                    loss += mask_target_loss

                # reduce to one loss value per batch element
                loss = loss.mean([1, 2, 3])

                # NOTE(review): with learnable_snr_gos and an image batch this
                # calls apply_snr_weight (not apply_learnable_snr_gos), and the
                # min_snr_gamma block below can then weight the loss a second
                # time - confirm this is intended.
                if self.train_config.learnable_snr_gos:
                    if from_batch:
                        # match batch size
                        loss = apply_snr_weight(loss, timesteps, self.sd.noise_scheduler,
                                                self.train_config.min_snr_gamma)
                    else:
                        # match batch size
                        timesteps_index_list = [current_timestep_index for _ in range(target_latents.shape[0])]
                        # add snr_gamma
                        loss = apply_learnable_snr_gos(loss, timesteps_index_list, self.snr_gos)
                if self.train_config.min_snr_gamma is not None and self.train_config.min_snr_gamma > 0.000001:
                    if from_batch:
                        # match batch size
                        loss = apply_snr_weight(loss, timesteps, self.sd.noise_scheduler,
                                                self.train_config.min_snr_gamma)
                    else:
                        # match batch size
                        timesteps_index_list = [current_timestep_index for _ in range(target_latents.shape[0])]
                        # add min_snr_gamma
                        loss = apply_snr_weight(loss, timesteps_index_list, noise_scheduler,
                                                self.train_config.min_snr_gamma)


                loss = loss.mean() * prompt_pair_chunk.weight

                # gradients accumulate across chunks; the optimizer steps once
                # after the loop
                loss.backward()
                loss_list.append(loss.item())
                del target_latents
                del offset_neutral
                del loss
                # flush()

        optimizer.step()
        lr_scheduler.step()

        # report the mean of the per-chunk losses
        loss_float = sum(loss_list) / len(loss_list)
        if anchor_loss_float is not None:
            loss_float += anchor_loss_float

        del (
            positive_latents,
            neutral_latents,
            unconditional_latents,
            # latents
        )
        # move back to cpu
        prompt_pair.to("cpu")
        # flush()

        # reset network
        self.network.multiplier = 1.0

        loss_dict = OrderedDict(
            {'loss': loss_float},
        )
        if anchor_loss_float is not None:
            loss_dict['sl_l'] = loss_float
            loss_dict['an_l'] = anchor_loss_float

        return loss_dict
        # end hook_train_loop