Spaces:

DDCM
/

DDCM-Compressed-Image-Generation

Running on Zero

File size: 15,249 Bytes

b273838

import gradio as gr
from functools import partial
import torch
import spaces

import DDCM_blind_face_image_restoration
import latent_DDCM_CCFG
import latent_DDCM_compression
from latent_models import load_model
import os
# import transformers
# transformers.utils.move_cache()


# Normalise the SPACES_ZERO_GPU flag: the literal "true" is rewritten to "1".
if os.environ.get("SPACES_ZERO_GPU") == "true":
    os.environ["SPACES_ZERO_GPU"] = "1"


# One pre-loaded model per supported image size, keyed by the same strings the
# "Image size" radio buttons emit. Each model is loaded on the CPU in float16
# with compilation disabled; only the first element of load_model's return
# value is kept.
# NOTE(review): the positional 1000 is presumably the number of diffusion
# timesteps -- confirm against latent_models.load_model.
avail_models = {
    resolution: load_model(repo_id, 1000, float16=True, device=torch.device("cpu"), compile=False)[0]
    for resolution, repo_id in (
        ('512x512', 'stabilityai/stable-diffusion-2-1-base'),
        ('768x768', 'stabilityai/stable-diffusion-2-1'),
    )
}

# Compression entry point with the model table already bound as a keyword.
compression_func = partial(latent_DDCM_compression.main, avail_models=avail_models)


def get_t_and_k_from_file_name(file_name):
    """Parse T, K and the model type out of a bit-stream file name.

    The name is expected to contain the fields ``T<int>``, ``K<int>`` and
    ``M<model type>``, each terminated by a ``-`` (for example
    ``T1000-K256-M512x512-...``). Only the final path component is inspected,
    so upper-case ``T``/``K``/``M`` characters in parent directories (such as
    a temp dir like ``/var/folders/.../T/``) cannot corrupt the result.

    Args:
        file_name: Path or bare name of the bit-stream file.

    Returns:
        A tuple ``(T, K, model_type)`` where ``T`` and ``K`` are ints and
        ``model_type`` is the raw string following ``M``.

    Raises:
        IndexError: If one of the ``T``/``K``/``M`` markers is missing.
        ValueError: If the text after ``T`` or ``K`` is not an integer.
    """
    # Strip directories first: the markers are plain letters and would
    # otherwise match anywhere in the path.
    base_name = os.path.basename(file_name)

    def field(tag):
        # Text between the first occurrence of `tag` and the next '-'.
        return base_name.split(tag)[1].split('-')[0]

    T = int(field('T'))
    K = int(field('K'))
    model_type = field('M')
    return T, K, model_type


def ccfg(text_input, T, K, ccfg_scale, model_type, compressed_file_in=None):
    """Run compressed classifier-free-guidance generation with the shared models.

    The requested ``ccfg_scale`` is capped at ``K`` before being handed to
    ``latent_DDCM_CCFG.main``; all other arguments are forwarded unchanged,
    together with the module-level ``avail_models`` table.

    Returns:
        Whatever ``latent_DDCM_CCFG.main`` returns.
    """
    effective_scale = min(ccfg_scale, K)
    return latent_DDCM_CCFG.main(
        text_input,
        T,
        K,
        effective_scale,
        model_type,
        compressed_file_in,
        avail_models=avail_models,
    )


@spaces.GPU
def decompress_given_bitstream(bitstream, method):
    """Decode an uploaded bit-stream with the pipeline that produced it.

    T, K and the model type are recovered from the uploaded file's name, and
    the file is then passed to the pipeline selected by ``method``.

    Args:
        bitstream: Uploaded file object exposing a ``.name`` path, or ``None``
            when nothing was uploaded.
        method: One of ``'compression'``, ``'blind'`` or ``'ccfg'``.

    Returns:
        The return value of the selected pipeline.

    Raises:
        gr.Error: If no bit-stream file was provided.
        NotImplementedError: If ``method`` is not one of the known names.
    """
    if bitstream is None:
        # gr.Error is an exception type: merely constructing it does nothing,
        # it has to be raised to stop the handler and reach the UI.
        raise gr.Error("Please provide a bit-stream file when performing decompression")
    file_name = bitstream.name
    T, K, model_type = get_t_and_k_from_file_name(file_name)
    if method == 'compression':
        # No input image: the bit-stream is the only source.
        return compression_func(None, T, K, model_type, bitstream)
    elif method == 'blind':
        # 'NIQE', 1 and True fill the metric / coefficient / aligned slots.
        # NOTE(review): presumably ignored when decoding -- confirm in
        # DDCM_blind_face_image_restoration.inference.
        return DDCM_blind_face_image_restoration.inference(None, T, K, 'NIQE', 1, True, bitstream)
    elif method == 'ccfg':
        # ccfg() forwards min(-1, K), i.e. -1, as the scale.
        # NOTE(review): presumably a "not needed for decoding" sentinel -- confirm.
        return ccfg(None, T, K, -1, model_type, bitstream)
    else:
        raise NotImplementedError()


def validate_K(K):
    """Warn in the UI when the codebook size K is not a power of two.

    The value comes from a numeric input and may therefore arrive as a float
    (or ``None`` for an empty field). Integral floats such as ``2048.0`` are
    accepted; anything that is not a positive whole power of two triggers the
    warning instead of raising ``TypeError`` from the bitwise test.

    Args:
        K: Codebook size as delivered by the UI.

    Returns:
        None. The only effect is an optional ``gr.Warning``.
    """
    is_whole = isinstance(K, int) or (isinstance(K, float) and K.is_integer())
    if is_whole:
        k = int(K)
        # A positive integer is a power of two iff it has a single set bit.
        if k >= 1 and (k & (k - 1)) == 0:
            return
    gr.Warning("For efficient bit usage, K should be a power of 2.")


# Decompression callbacks, one per tab: each entry is
# decompress_given_bitstream with its `method` keyword pre-bound, so the UI
# only has to supply the uploaded bit-stream.
method_to_func = {
    method_name: partial(decompress_given_bitstream, method=method_name)
    for method_name in ('compression', 'blind', 'ccfg')
}

# Page heading, rendered as raw HTML at the top of the demo.
title = "<div style='text-align: center; font-size: 36px; font-weight: bold;'>Compressed Image Generation with Denoising Diffusion Codebook Models</div>"
# Introductory HTML shown under the title: authors, affiliation, links and a
# short description of DDCM. The affiliation heading opens with <h4> and is
# closed with the matching </h4>.
intro = """
<h3 style="margin-bottom: 10px; text-align: center;">
    <a href="https://ohayonguy.github.io/">Guy Ohayon*</a>&nbsp;,&nbsp;
    <a href="https://hilamanor.github.io/">Hila Manor*</a>&nbsp;,&nbsp;
    <a href="https://tomer.net.technion.ac.il/">Tomer Michaeli</a>&nbsp;,&nbsp;
    <a href="https://elad.cs.technion.ac.il/">Michael Elad</a>
</h3>
<p style="font-size: 12px; text-align: center; margin-bottom: 10px;">
    * Equal contribution
</p>
<h4 style="margin-bottom: 10px; text-align: center;">
    Technion - Israel Institute of Technology
</h4>
<h3 style="margin-bottom: 10px; text-align: center;">
    <a href="https://www.arxiv.org/abs/2502.01189/">[Paper]</a>&nbsp;|&nbsp;
    <a href="https://ddcm-2025.github.io/">[Project Page]</a>&nbsp;|&nbsp;
    <a href="https://github.com/DDCM-2025/ddcm-compressed-image-generation/">[Code]</a>
</h3>
</br></br>
Denoising Diffusion Codebook Models (DDCM) is a novel (and simple) generative approach based on any Denoising Diffusion Model (DDM), that is able to produce high-quality image samples along with their losslessly compressed bit-stream representations.
DDCM can easily be utilized for perceptual image compression, as well as for solving a variety of compressed conditional generation tasks such as text-conditional image generation and image restoration, where each generated sample is accompanied by a compressed bit-stream.
</br></br>
The tabs below correspond to demos of different practical applications. Open each tab to see the application's specific instructions.
</br></br>
<b>Note: The demos below rely on relatively old pre-trained diffusion models such as Stable Diffusion 2.1, simply for the purpose of demonstrating the capabilities of DDCM. Feel free to implement our DDCM-based methods using newer diffusion models to further improve performance.</b>
"""

# Footer text (citation, license, contact), rendered with gr.Markdown at the
# bottom of the page. Raw string, so the backslash-free BibTeX block is passed
# through verbatim.
# NOTE(review): both contact addresses read "[email protected]", which looks
# like an e-mail-obfuscation placeholder rather than real addresses -- confirm
# and restore the intended values.
article = r"""
If you find our work useful, please ⭐ our <a href='https://github.com/DDCM-2025/ddcm-compressed-image-generation' target='_blank'>GitHub repository</a>. Thanks!

📝 **Citation**
```bibtex
@article{ohayon2025compressedimagegenerationdenoising,
      title={Compressed Image Generation with Denoising Diffusion Codebook Models}, 
      author={Guy Ohayon and Hila Manor and Tomer Michaeli and Michael Elad},
      year={2025},
      eprint={2502.01189},
      journal={arXiv},
      primaryClass={eess.IV},
      url={https://arxiv.org/abs/2502.01189}, 
}
```

📋 **License**
This project is released under the <a rel="license" href="https://github.com/DDCM-2025/ddcm-compressed-image-generation/blob/master/LICENSE">MIT license</a>.

📧 **Contact**
If you have any questions, please feel free to contact us at <b>[email protected]</b> (Guy Ohayon) and <b>[email protected]</b> (Hila Manor).
"""

# Extra CSS handed to gr.Blocks: enlarges and bolds the tab header buttons.
custom_css = """
    .tabs button {
        font-size: 21px !important;
        font-weight: bold !important;
    }
"""

# Build the demo UI: a title/intro header, three application tabs and a footer.
# NOTE: names such as input_image, T, K, bitstream, decompress and
# decompressed_image are rebound several times below. Each event registration
# receives the component objects bound at that point, so statement order
# matters -- do not reorder component creation and .click() wiring.
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:
    gr.HTML(title)
    gr.HTML(intro)
    # gr.Markdown("# Compressed Image Generation with Denoising Diffusion Codebook Models")

    # ---- Tab 1: image compression ----
    with gr.Tab("Image Compression"):
        gr.Markdown(
            "- To change the bit rate, modify the number of diffusion timesteps (T) and/or the codebook sizes (K).")
        gr.Markdown("- The input image will be center-cropped and resized to the specified size (512x512 or 768x768).")
        # gr.Markdown("#### Notes:")
        # gr.Markdown('* Since our methods relies on Stable Diffusion, we resize the input image to 512512 pixels')

        with gr.Row():
            # Left column: input image plus T / K / image-size controls.
            with gr.Column(scale=2):
                input_image = gr.Image(label="Input image", scale=2, image_mode='RGB', type='pil')
                with gr.Group():
                    with gr.Row():
                        T = gr.Number(label="Diffusion timesteps (T)", minimum=50, maximum=1000, value=1000, scale=2)
                        K = gr.Number(label="Size of each codebook (K)", minimum=2, maximum=8192, value=2048, scale=3)
                    with gr.Row():
                        model_type = gr.Radio(["768x768", "512x512"], label="Image size", value="512x512")
                compress = gr.Button("Compress image")

            # Right column: decoded image and the downloadable bit-stream.
            with gr.Column(scale=3):
                decompressed_image = gr.Image(label="Decompressed image", scale=2)
                compressed_file_out = gr.File(label="Compressed bit-stream (output)", scale=0)

        # First run validate_K on K (it may emit a warning), then run compression.
        compress.click(validate_K, inputs=[K]).then(compression_func, inputs=[input_image, T, K, model_type],
                                                    outputs=[decompressed_image, compressed_file_out])

        # Preset inputs; each row is [image path, T, K, image size].
        gr.Examples([
            ["examples/compression/1.jpg", 1000, 256, '512x512'],
            ["examples/compression/2.jpg", 1000, 256, '512x512'],
            ["examples/compression/4.jpg", 1000, 256, '512x512'],
            ["examples/compression/7.jpg", 1000, 256, '512x512'],
            ["examples/compression/8.jpg", 1000, 256, '512x512'],
            ["examples/compression/13.jpg", 1000, 256, '512x512'],
            ["examples/compression/15.jpg", 1000, 256, '512x512'],
            ["examples/compression/17.jpg", 1000, 256, '512x512'],
            ["examples/compression/18.jpg", 1000, 256, '512x512'],
            ["examples/compression/19.jpg", 1000, 256, '512x512'],
            ["examples/compression/21.jpg", 1000, 256, '512x512'],
            ["examples/compression/22.jpg", 1000, 256, '512x512'],
            ["examples/compression/23.jpg", 1000, 256, '512x512'],
        ],
            inputs=[input_image, T, K, model_type],
            outputs=[decompressed_image, compressed_file_out],
            fn=compression_func,
            cache_examples='lazy')

        # Decompression sub-section. `decompressed_image` is rebound to a new
        # component here, so the button below writes to this second image and
        # not to the one wired to the compress button above.
        gr.Markdown("### Decompress a previously generated bit-stream")
        with gr.Row():
            with gr.Column(scale=2):
                bitstream = gr.File(label="Compressed bit-stream (input)", scale=0)
                decompress = gr.Button("Decompress image")

            with gr.Column(scale=3):
                decompressed_image = gr.Image(label="Decompressed image (from uploaded bit-stream)", scale=2)

        decompress.click(method_to_func['compression'], inputs=bitstream, outputs=decompressed_image)

    # ---- Tab 2: blind face image restoration ----
    with gr.Tab("Real-World Face Image Restoration"):
        gr.Markdown(  # "Restore any degraded face image. "
            "Please mark if your input face image is already aligned. "
            "If not, we will try to automatically detect, crop and align the faces, and raise an error if no faces are found. Expect better results if your input image is already aligned.")

        with gr.Row():
            # Left column: input image (passed as a file path), alignment flag,
            # T / K and the perceptual-quality settings.
            with gr.Column(scale=2):
                with gr.Group():
                    input_image = gr.Image(label="Input image", scale=2, type='filepath')
                    aligned = gr.Checkbox(label='Input face image is aligned')
                with gr.Group():
                    with gr.Row():
                        T = gr.Number(label="Diffusion timesteps (T)", minimum=50, maximum=1000, value=1000)
                        K = gr.Number(label="Size of each codebook (K)", minimum=2, maximum=8192, value=2048)
                    iqa_metric = gr.Radio(['NIQE', 'TOPIQ', 'CLIP-IQA'], label='Perceptual quality measure to optimize',
                                          value='NIQE')
                    iqa_coef = gr.Number(
                        label="Perception-distortion tradeoff coefficient (λ)",
                        info="Higher -> better perceptual quality",
                        # label="Coefficient controlling the perception-distortion tradeoff (higher means better perceptual quality)",
                        minimum=0, maximum=1, value=1)
                restore = gr.Button("Restore and compress")

            # Right column: gallery of restored faces and their bit-stream files
            # (the file output accepts multiple files).
            with gr.Column(scale=3):
                decompressed_image = gr.Gallery(label="Restored faces gallery", type="numpy", show_label=True,
                                                format="png")
                compressed_file_out = gr.File(label="Compressed bit-stream (output)", scale=0, file_count='multiple')

        # validate_K first, then the restoration pipeline.
        restore.click(validate_K, inputs=[K]).then(DDCM_blind_face_image_restoration.inference,
                                                   inputs=[input_image, T, K, iqa_metric, iqa_coef, aligned],
                                                   outputs=[decompressed_image, compressed_file_out])
        # Preset inputs; each row is [image path, T, K, metric, λ, aligned].
        gr.Examples([
            ["examples/bfr/00000055.png", 1000, 4096, 'TOPIQ', 0.1, True],
            ["examples/bfr/00000085.png", 1000, 4096, 'TOPIQ', 0.1, True],
            ["examples/bfr/00000113.png", 1000, 4096, 'TOPIQ', 0.1, True],
            ["examples/bfr/00000137.png", 1000, 4096, 'TOPIQ', 0.1, True],
            ["examples/bfr/wider/0034.jpg", 1000, 4096, 'NIQE', 1, True],
            ["examples/bfr/webphoto/00042_00.jpg", 1000, 4096, 'TOPIQ', 0.1, True],
            ["examples/bfr/lfw/Ana_Palacio_0001_00.jpg", 1000, 4096, 'TOPIQ', 0.1, True],
            ["examples/bfr/01.png", 1000, 4096, 'NIQE', 0.1, False],
            ["examples/bfr/03.jpg", 1000, 4096, 'TOPIQ', 0.1, False],
        ],
            inputs=[input_image, T, K, iqa_metric, iqa_coef, aligned],
            outputs=[decompressed_image, compressed_file_out],
            fn=DDCM_blind_face_image_restoration.inference,
            cache_examples='lazy')

        # Decompression sub-section; `decompressed_image` is rebound to a
        # single gr.Image that receives the decompress button's output.
        # NOTE(review): the restore path above outputs to a Gallery plus a File,
        # while this path has a single Image output -- confirm that
        # inference(..., bitstream) returns a matching single value.
        gr.Markdown("### Decompress a previously generated bit-stream")
        with gr.Row():
            with gr.Column(scale=2):
                bitstream = gr.File(label="Compressed bit-stream (input)", scale=0)
                decompress = gr.Button("Decompress image")

            with gr.Column(scale=3):
                decompressed_image = gr.Image(label="Decompressed image (from uploaded bit-stream)", scale=2)

        decompress.click(method_to_func['blind'], inputs=bitstream, outputs=decompressed_image)

    # ---- Tab 3: compressed text-to-image generation ----
    with gr.Tab("Compressed Text-to-Image Generation"):
        gr.Markdown(
            "This application demonstrates the capabilities of our new *compressed* classifier-free guidance method, which *does not require the input condition for decompression*."
            "  \n"  # newline
            "Each image is generated along with its compressed bit-stream representation, and the input condition is implicitly encoded in the bit-stream.")
        # gr.Markdown("### Generate an image and its compressed bit-stream given an input text prompt")
        # gr.Markdown("#### Notes:")
        # gr.Markdown("* The size of the generated image is 512x512")

        with gr.Row():
            # Left column: prompt, T / K, the sub-sampled codebook size (passed
            # to ccfg as ccfg_scale) and the image size.
            with gr.Column(scale=2):
                with gr.Group():
                    text_input = gr.Textbox(label="Input text prompt", scale=1, value="An image of a dog")
                    with gr.Row():
                        T = gr.Number(label="Diffusion timesteps (T)", minimum=50, maximum=1000, value=1000, scale=1)
                        K = gr.Number(label="Size of each codebook (K)", minimum=2, maximum=256, value=128, scale=1)
                    K_tilde = gr.Number(label=r"Sub-sampled codebooks' sizes (K̃)", scale=1,
                                        info="Behaves like a guidance scale", minimum=2, maximum=256, value=32)
                    model_type = gr.Radio(["768x768", "512x512"], label="Image size", value="512x512")
                button = gr.Button("Generate and compress")

            # Right column: generated image and its bit-stream.
            with gr.Column(scale=3):
                decompressed_image = gr.Image(label="Generated image", scale=2)
                compressed_file_out = gr.File(label="Compressed bit-stream (output)", scale=0)

        # validate_K first, then generation through the ccfg wrapper.
        button.click(validate_K, inputs=[K]).then(ccfg, inputs=[text_input, T, K, K_tilde, model_type],
                                                  outputs=[decompressed_image, compressed_file_out])

        # Preset inputs; each row is [prompt, T, K, K̃, image size].
        gr.Examples([
            ["An image of a dog", 1000, 64, 4, '512x512'],
            ["Rainbow over the mountains", 1000, 64, 4, '512x512'],
            ["A cat playing soccer", 1000, 64, 4, '512x512'],
        ],
            inputs=[text_input, T, K, K_tilde, model_type],
            outputs=[decompressed_image, compressed_file_out],
            fn=ccfg,
            cache_examples='lazy')
        # Decompression sub-section; `button` and `decompressed_image` are
        # rebound, so the click below belongs to the new "Decompress" button.
        gr.Markdown("### Decompress a previously generated bit-stream")
        with gr.Row():
            with gr.Column(scale=2):
                bitstream = gr.File(label="Compressed bit-stream (input)", scale=0)
                button = gr.Button("Decompress")
            with gr.Column(scale=3):
                decompressed_image = gr.Image(label="Decompressed image (from uploaded bit-stream)", scale=2)
        button.click(method_to_func['ccfg'], inputs=bitstream, outputs=decompressed_image)

    # Citation / license / contact footer, shown below all tabs.
    gr.Markdown(article)

# Enable the request queue, then start the app.
demo.queue()
demo.launch(state_session_capacity=500)