import random
import unittest

import numpy as np
import torch
from transformers import (
    AutoTokenizer,
    CLIPTextConfig,
    CLIPTextModel,
    CLIPTokenizer,
    T5EncoderModel,
)

from diffusers import (
    AutoencoderKL,
    FlowMatchEulerDiscreteScheduler,
    FluxControlNetInpaintPipeline,
    FluxControlNetModel,
    FluxTransformer2DModel,
)
from diffusers.utils.testing_utils import (
    enable_full_determinism,
    floats_tensor,
)

from ..test_pipelines_common import PipelineTesterMixin


enable_full_determinism()


class FluxControlNetInpaintPipelineTests(unittest.TestCase, PipelineTesterMixin):
    pipeline_class = FluxControlNetInpaintPipeline
    params = frozenset(
        [
            "prompt",
            "height",
            "width",
            "guidance_scale",
            "prompt_embeds",
            "pooled_prompt_embeds",
            "image",
            "mask_image",
            "control_image",
            "strength",
            "num_inference_steps",
            "controlnet_conditioning_scale",
        ]
    )
    batch_params = frozenset(["prompt", "image", "mask_image", "control_image"])
    test_xformers_attention = False

    def get_dummy_components(self):
        torch.manual_seed(0)
        transformer = FluxTransformer2DModel(
            patch_size=1,
            in_channels=8,
            num_layers=1,
            num_single_layers=1,
            attention_head_dim=16,
            num_attention_heads=2,
            joint_attention_dim=32,
            pooled_projection_dim=32,
            axes_dims_rope=[4, 4, 8],
        )

        clip_text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
            hidden_act="gelu",
            projection_dim=32,
        )

        torch.manual_seed(0)
        text_encoder = CLIPTextModel(clip_text_encoder_config)

        torch.manual_seed(0)
        text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")

        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
        tokenizer_2 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")

        torch.manual_seed(0)
        vae = AutoencoderKL(
            sample_size=32,
            in_channels=3,
            out_channels=3,
            block_out_channels=(4,),
            layers_per_block=1,
            latent_channels=2,
            norm_num_groups=1,
            use_quant_conv=False,
            use_post_quant_conv=False,
            shift_factor=0.0609,
            scaling_factor=1.5035,
        )

        torch.manual_seed(0)
        controlnet = FluxControlNetModel(
            patch_size=1,
            in_channels=8,
            num_layers=1,
            num_single_layers=1,
            attention_head_dim=16,
            num_attention_heads=2,
            joint_attention_dim=32,
            pooled_projection_dim=32,
            axes_dims_rope=[4, 4, 8],
        )

        scheduler = FlowMatchEulerDiscreteScheduler()

        return {
            "scheduler": scheduler,
            "text_encoder": text_encoder,
            "text_encoder_2": text_encoder_2,
            "tokenizer": tokenizer,
            "tokenizer_2": tokenizer_2,
            "transformer": transformer,
            "vae": vae,
            "controlnet": controlnet,
        }

    def get_dummy_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)

        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
        mask_image = torch.ones((1, 1, 32, 32)).to(device)
        control_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)

        inputs = {
            "prompt": "A painting of a squirrel eating a burger",
            "image": image,
            "mask_image": mask_image,
            "control_image": control_image,
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 5.0,
            "height": 32,
            "width": 32,
            "max_sequence_length": 48,
            "strength": 0.8,
            "output_type": "np",
        }
        return inputs

    def test_flux_controlnet_inpaint_with_num_images_per_prompt(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        pipe = pipe.to(device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        inputs["num_images_per_prompt"] = 2
        output = pipe(**inputs)
        images = output.images

        assert images.shape == (2, 32, 32, 3)

    def test_flux_controlnet_inpaint_with_controlnet_conditioning_scale(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        pipe = pipe.to(device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output_default = pipe(**inputs)
        image_default = output_default.images

        inputs["controlnet_conditioning_scale"] = 0.5
        output_scaled = pipe(**inputs)
        image_scaled = output_scaled.images

        # Ensure that changing the controlnet_conditioning_scale produces a different output
        assert not np.allclose(image_default, image_scaled, atol=0.01)

    def test_attention_slicing_forward_pass(self):
        super().test_attention_slicing_forward_pass(expected_max_diff=3e-3)

    def test_inference_batch_single_identical(self):
        super().test_inference_batch_single_identical(expected_max_diff=3e-3)