Spaces:

nvidia
/

Cosmos-Predict2

Running on Zero

File size: 6,381 Bytes

22a452a

import gc
import unittest

import numpy as np
import torch
from transformers import AutoTokenizer

from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, OmniGenPipeline, OmniGenTransformer2DModel
from diffusers.utils.testing_utils import (
    Expectations,
    backend_empty_cache,
    numpy_cosine_similarity_distance,
    require_torch_accelerator,
    slow,
    torch_device,
)

from ..test_pipelines_common import PipelineTesterMixin


class OmniGenPipelineFastTests(unittest.TestCase, PipelineTesterMixin):
    """Fast checks for ``OmniGenPipeline`` assembled from tiny, seeded components."""

    pipeline_class = OmniGenPipeline
    params = frozenset(["prompt", "guidance_scale"])
    batch_params = frozenset(["prompt"])

    test_layerwise_casting = True

    def get_dummy_components(self):
        """Return the keyword arguments used to construct the pipeline under test.

        The result maps ``"transformer"``, ``"vae"``, ``"scheduler"`` and
        ``"tokenizer"`` to freshly built objects. The global torch seed is reset
        to 0 immediately before the transformer and again before the VAE.
        """
        # Each factor list is created separately so the two entries never alias.
        rope_scaling = {name: list(range(1, 3)) for name in ("long_factor", "short_factor")}
        transformer_config = {
            "hidden_size": 16,
            "num_attention_heads": 4,
            "num_key_value_heads": 4,
            "intermediate_size": 32,
            "num_layers": 1,
            "in_channels": 4,
            "time_step_dim": 4,
            "rope_scaling": rope_scaling,
        }
        vae_config = {
            "sample_size": 32,
            "in_channels": 3,
            "out_channels": 3,
            "block_out_channels": (4,) * 4,
            "layers_per_block": 1,
            "latent_channels": 4,
            "norm_num_groups": 1,
            "up_block_types": ["UpDecoderBlock2D"] * 4,
        }

        torch.manual_seed(0)
        tiny_transformer = OmniGenTransformer2DModel(**transformer_config)

        torch.manual_seed(0)
        tiny_vae = AutoencoderKL(**vae_config)

        flow_scheduler = FlowMatchEulerDiscreteScheduler(invert_sigmas=True, num_train_timesteps=1)
        llama_tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")

        return {
            "transformer": tiny_transformer,
            "vae": tiny_vae,
            "scheduler": flow_scheduler,
            "tokenizer": llama_tokenizer,
        }

    def get_dummy_inputs(self, device, seed=0):
        """Return call arguments for a single-step, 16x16 generation.

        Args:
            device: Target device; only inspected to detect an ``"mps"`` prefix.
            seed: Seed applied to the generator placed in the returned dict.
        """
        # On mps the global generator is seeded and reused; elsewhere a dedicated
        # CPU generator is created. NOTE(review): presumably a workaround for
        # generator support on mps - confirm.
        on_mps = str(device).startswith("mps")
        rng = torch.manual_seed(seed) if on_mps else torch.Generator(device="cpu").manual_seed(seed)

        return dict(
            prompt="A painting of a squirrel eating a burger",
            generator=rng,
            num_inference_steps=1,
            guidance_scale=3.0,
            output_type="np",
            height=16,
            width=16,
        )

    def test_inference(self):
        """Run the dummy pipeline once and check the first image's shape."""
        components = self.get_dummy_components()
        pipeline = self.pipeline_class(**components)
        pipeline = pipeline.to(torch_device)

        call_kwargs = self.get_dummy_inputs(torch_device)
        output = pipeline(**call_kwargs)
        first_image = output.images[0]

        self.assertEqual(first_image.shape, (16, 16, 3))


@slow
@require_torch_accelerator
class OmniGenPipelineSlowTests(unittest.TestCase):
    """Slow end-to-end check of ``OmniGenPipeline`` loaded from ``repo_id``."""

    pipeline_class = OmniGenPipeline
    repo_id = "shitao/OmniGen-v1-diffusers"

    def setUp(self):
        """Collect garbage and call ``backend_empty_cache`` before each test."""
        super().setUp()
        gc.collect()
        backend_empty_cache(torch_device)

    def tearDown(self):
        """Collect garbage and call ``backend_empty_cache`` after each test."""
        super().tearDown()
        gc.collect()
        backend_empty_cache(torch_device)

    def get_inputs(self, device, seed=0):
        """Return call arguments for a two-step generation of a fixed prompt.

        Args:
            device: Target device; only inspected to detect an ``"mps"`` prefix.
            seed: Seed applied to the generator placed in the returned dict.
        """
        # On mps the global generator is seeded and reused; elsewhere a dedicated
        # CPU generator is created. NOTE(review): presumably a workaround for
        # generator support on mps - confirm.
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device="cpu").manual_seed(seed)

        return {
            "prompt": "A photo of a cat",
            "num_inference_steps": 2,
            "guidance_scale": 2.5,
            "output_type": "np",
            "generator": generator,
        }

    def test_omnigen_inference(self):
        """Generate an image and compare a small slice with stored reference values."""
        pipe = self.pipeline_class.from_pretrained(self.repo_id, torch_dtype=torch.bfloat16)
        pipe.enable_model_cpu_offload()

        inputs = self.get_inputs(torch_device)

        image = pipe(**inputs).images[0]
        # Index 0 on the first axis, then at most 10 entries on each of the next
        # two axes. The references below are 10x3, so the last bound of 10 is
        # expected to keep all 3 entries of the final axis.
        image_slice = image[0, :10, :10]

        # NOTE(review): keys look like (device type, major version) pairs and
        # get_expectation() presumably selects the entry matching the current
        # environment - confirm against the Expectations helper.
        expected_slices = Expectations(
            {
                ("xpu", 3): np.array(
                    [
                        [0.05859375, 0.05859375, 0.04492188],
                        [0.04882812, 0.04101562, 0.03320312],
                        [0.04882812, 0.04296875, 0.03125],
                        [0.04296875, 0.0390625, 0.03320312],
                        [0.04296875, 0.03710938, 0.03125],
                        [0.04492188, 0.0390625, 0.03320312],
                        [0.04296875, 0.03710938, 0.03125],
                        [0.04101562, 0.03710938, 0.02734375],
                        [0.04101562, 0.03515625, 0.02734375],
                        [0.04101562, 0.03515625, 0.02929688],
                    ],
                    dtype=np.float32,
                ),
                ("cuda", 7): np.array(
                    [
                        [0.1783447, 0.16772744, 0.14339337],
                        [0.17066911, 0.15521264, 0.13757327],
                        [0.17072496, 0.15531206, 0.13524258],
                        [0.16746324, 0.1564025, 0.13794944],
                        [0.16490817, 0.15258026, 0.13697758],
                        [0.16971767, 0.15826806, 0.13928896],
                        [0.16782972, 0.15547255, 0.13783783],
                        [0.16464645, 0.15281534, 0.13522372],
                        [0.16535294, 0.15301755, 0.13526791],
                        [0.16365296, 0.15092957, 0.13443318],
                    ],
                    dtype=np.float32,
                ),
                ("cuda", 8): np.array(
                    [
                        [0.0546875, 0.05664062, 0.04296875],
                        [0.046875, 0.04101562, 0.03320312],
                        [0.05078125, 0.04296875, 0.03125],
                        [0.04296875, 0.04101562, 0.03320312],
                        [0.0390625, 0.03710938, 0.02929688],
                        [0.04296875, 0.03710938, 0.03125],
                        [0.0390625, 0.03710938, 0.02929688],
                        [0.0390625, 0.03710938, 0.02734375],
                        [0.0390625, 0.03320312, 0.02734375],
                        [0.0390625, 0.03320312, 0.02734375],
                    ],
                    dtype=np.float32,
                ),
            }
        )
        expected_slice = expected_slices.get_expectation()

        cosine_distance = numpy_cosine_similarity_distance(expected_slice.flatten(), image_slice.flatten())

        # Use a TestCase assertion rather than a bare ``assert``: it is not
        # stripped when Python runs with -O and it reports the offending value.
        self.assertLess(
            cosine_distance,
            1e-4,
            f"Cosine similarity distance {cosine_distance} between expected and generated slice is not below 1e-4",
        )