Spaces:

sjtu-deepvision
/

Dereflection-Any-Image

Running on Zero

File size: 3,382 Bytes

311419e
 
1cedc13
651dfe7
1cedc13
651dfe7
 
 
1cedc13
311419e
 
1cedc13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
311419e
651dfe7
311419e
 
1cedc13
311419e
 
1cedc13
311419e
 
 
 
1cedc13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
651dfe7
 
1cedc13
 
 
 
651dfe7
 
1cedc13
651dfe7
1cedc13
651dfe7
1cedc13
 
c360cac
651dfe7
1cedc13

import os
import numpy as np
import torch
from PIL import Image
import gradio as gr
from DAI.pipeline_all import DAIPipeline
from DAI.controlnetvae import ControlNetVAEModel
from DAI.decoder import CustomAutoencoderKL
from diffusers import AutoencoderKL, UNet2DConditionModel
from transformers import CLIPTextModel, AutoTokenizer

# Initialize device and model paths
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# One dtype for every model component loaded below; change it here only.
weight_dtype = torch.float32
# Repo providing the controlnet, unet and vae_2 subfolders.
pretrained_model_name_or_path = "sjtu-deepvision/dereflection-any-image-v0"
# Repo providing the vae, text_encoder and tokenizer subfolders.
pretrained_model_name_or_path2 = "stabilityai/stable-diffusion-2-1"

# Load the model components
controlnet = ControlNetVAEModel.from_pretrained(
    pretrained_model_name_or_path,
    subfolder="controlnet",
    torch_dtype=weight_dtype,
).to(device)
unet = UNet2DConditionModel.from_pretrained(
    pretrained_model_name_or_path,
    subfolder="unet",
    torch_dtype=weight_dtype,
).to(device)
# vae_2 is not a pipeline component; it is handed to the pipeline per call.
vae_2 = CustomAutoencoderKL.from_pretrained(
    pretrained_model_name_or_path,
    subfolder="vae_2",
    torch_dtype=weight_dtype,
).to(device)
# vae and text_encoder now receive weight_dtype as well, so that all
# components stay in the same dtype if weight_dtype is ever changed.
vae = AutoencoderKL.from_pretrained(
    pretrained_model_name_or_path2,
    subfolder="vae",
    torch_dtype=weight_dtype,
).to(device)
text_encoder = CLIPTextModel.from_pretrained(
    pretrained_model_name_or_path2,
    subfolder="text_encoder",
    torch_dtype=weight_dtype,
).to(device)
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path2,
    subfolder="tokenizer",
    use_fast=False,
)

# Create the pipeline
pipe = DAIPipeline(
    vae=vae,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    unet=unet,
    controlnet=controlnet,
    safety_checker=None,
    scheduler=None,
    feature_extractor=None,
    t_start=0,
).to(device)

# Function to process the image
def process_image(input_image):
    """Run the dereflection pipeline on a single image.

    Args:
        input_image: Image array as delivered by the ``gr.Image`` input
            (configured with ``type="numpy"``), or ``None`` when no image
            has been supplied.

    Returns:
        A ``PIL.Image.Image`` holding the processed result, or ``None`` when
        ``input_image`` is ``None``.
    """
    # Without an image there is nothing to process; returning None avoids a
    # crash inside Image.fromarray and leaves the output component empty.
    if input_image is None:
        return None

    # Convert Gradio input to PIL Image
    input_image = Image.fromarray(input_image)

    # Process the image
    pipe_out = pipe(
        image=input_image,
        prompt="remove glass reflection",
        vae_2=vae_2,
        processing_resolution=None,
    )

    # Clamp the prediction to [-1, 1] and rescale it to [0, 1].
    processed_frame = (pipe_out.prediction.clip(-1, 1) + 1) / 2
    # Take the first entry along the leading axis (presumably the batch
    # dimension -- confirm against DAIPipeline) and convert to 8-bit.
    processed_frame = (processed_frame[0] * 255).astype(np.uint8)
    processed_frame = Image.fromarray(processed_frame)

    return processed_frame

# Gradio interface
def create_gradio_interface():
    """Assemble the Gradio Blocks UI for the dereflection demo.

    The layout is a title, a row with the input image and submit button on
    the left and the processed image on the right, followed by a gallery of
    example images.

    Returns:
        The constructed ``gr.Blocks`` application (not yet launched).
    """
    # Bundled samples: files/image/1.png through files/image/8.png.
    samples_dir = os.path.join("files", "image")
    sample_paths = [os.path.join(samples_dir, f"{n}.png") for n in range(1, 9)]

    demo = gr.Blocks()
    with demo:
        gr.Markdown("# Dereflection Any Image")

        with gr.Row():
            with gr.Column():
                source_view = gr.Image(label="Input Image", type="numpy")
                run_button = gr.Button("Remove Reflection", variant="primary")
            with gr.Column():
                result_view = gr.Image(label="Processed Image")

        # Example gallery. Caching is disabled, so example outputs are not
        # precomputed.
        gr.Examples(
            examples=sample_paths,
            inputs=source_view,
            outputs=result_view,
            fn=process_image,
            cache_examples=False,
            label="Example Images",
        )

        # Run the pipeline when the button is pressed.
        run_button.click(
            fn=process_image,
            inputs=source_view,
            outputs=result_view,
        )

    return demo

# Main function to launch the Gradio app
def main():
    """Build the demo UI and serve it on 0.0.0.0, port 7860."""
    app = create_gradio_interface()
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
    )


if __name__ == "__main__":
    main()