BioMike committed
Commit 5a9c9b2 · verified · 1 parent: f71074a

Upload 9 files

app.py ADDED
@@ -0,0 +1,20 @@
+ import gradio as gr
+ from vae import vae
+ from morphing import morphing
+ from model import ConvVAE
+
+ model = ConvVAE.from_pretrained(
+     model_id="BioMike/classical_portrait_vae",
+     cache_dir="./model_cache",
+     map_location="cpu",
+     strict=True
+ ).eval()
+
+
+ demo = gr.TabbedInterface([vae, morphing],
+                           ["Image to Portrait", "Image to Image (Morphing)"],
+                           title="Classical Portraits VAE",
+                           theme=gr.themes.Base())
+
+ demo.queue()
+ demo.launch(debug=True, share=True)
example_images/image1.jpg ADDED
example_images/image2.png ADDED
example_images/image3.jpg ADDED
example_images/image4.jpg ADDED
model.py ADDED
@@ -0,0 +1,148 @@
+ import json
+ import torch
+ import torch.nn as nn
+ import os
+ from pathlib import Path
+ from typing import Optional, Union, Dict
+ from huggingface_hub import snapshot_download
+ import warnings
+
+ class ConvVAE(nn.Module):
+     def __init__(self, latent_size):
+         super(ConvVAE, self).__init__()
+
+         # Encoder
+         self.encoder = nn.Sequential(
+             nn.Conv2d(3, 64, 3, stride=2, padding=1),    # (batch, 64, 64, 64)
+             nn.BatchNorm2d(64),
+             nn.ReLU(),
+             nn.Conv2d(64, 128, 3, stride=2, padding=1),  # (batch, 128, 32, 32)
+             nn.BatchNorm2d(128),
+             nn.ReLU(),
+             nn.Conv2d(128, 256, 3, stride=2, padding=1), # (batch, 256, 16, 16)
+             nn.BatchNorm2d(256),
+             nn.ReLU(),
+             nn.Conv2d(256, 512, 3, stride=2, padding=1), # (batch, 512, 8, 8)
+             nn.BatchNorm2d(512),
+             nn.ReLU()
+         )
+
+         self.fc_mu = nn.Linear(512 * 8 * 8, latent_size)
+         self.fc_logvar = nn.Linear(512 * 8 * 8, latent_size)
+
+         self.fc2 = nn.Linear(latent_size, 512 * 8 * 8)
+
+         self.decoder = nn.Sequential(
+             nn.ConvTranspose2d(512, 256, 4, stride=2, padding=1), # (batch, 256, 16, 16)
+             nn.BatchNorm2d(256),
+             nn.ReLU(),
+             nn.ConvTranspose2d(256, 128, 4, stride=2, padding=1), # (batch, 128, 32, 32)
+             nn.BatchNorm2d(128),
+             nn.ReLU(),
+             nn.ConvTranspose2d(128, 64, 4, stride=2, padding=1),  # (batch, 64, 64, 64)
+             nn.BatchNorm2d(64),
+             nn.ReLU(),
+             nn.ConvTranspose2d(64, 3, 4, stride=2, padding=1),    # (batch, 3, 128, 128)
+             nn.Tanh()
+         )
+
+     def forward(self, x):
+         mu, logvar = self.encode(x)
+         z = self.reparameterize(mu, logvar)
+         decoded = self.decode(z)
+         return decoded, mu, logvar
+
+     def encode(self, x):
+         x = self.encoder(x)
+         x = x.view(x.size(0), -1)
+         mu = self.fc_mu(x)
+         logvar = self.fc_logvar(x)
+         return mu, logvar
+
+     def reparameterize(self, mu, logvar):
+         std = torch.exp(0.5 * logvar)
+         eps = torch.randn_like(std)
+         return mu + eps * std
+
+     def decode(self, z):
+         x = self.fc2(z)
+         x = x.view(-1, 512, 8, 8)
+         decoded = self.decoder(x)
+         return decoded
+
+     @classmethod
+     def from_pretrained(
+         cls,
+         model_id: str,
+         revision: Optional[str] = None,
+         cache_dir: Optional[Union[str, Path]] = None,
+         force_download: bool = False,
+         proxies: Optional[Dict] = None,
+         resume_download: bool = False,
+         local_files_only: bool = False,
+         token: Union[str, bool, None] = None,
+         map_location: str = "cpu",
+         strict: bool = False,
+         **model_kwargs,
+     ):
+         """
+         Load a pretrained model from a given model ID.
+
+         Args:
+             model_id (str): Identifier of the model to load.
+             revision (Optional[str]): Specific model revision to use.
+             cache_dir (Optional[Union[str, Path]]): Directory to store downloaded models.
+             force_download (bool): Force re-download even if the model exists.
+             proxies (Optional[Dict]): Proxy configuration for downloads.
+             resume_download (bool): Resume interrupted downloads.
+             local_files_only (bool): Use only local files, don't download.
+             token (Union[str, bool, None]): Token for API authentication.
+             map_location (str): Device to map model to. Defaults to "cpu".
+             strict (bool): Enforce strict state_dict loading.
+             **model_kwargs: Additional keyword arguments for model initialization.
+
+         Returns:
+             An instance of the model loaded from the pretrained weights.
+         """
+         model_dir = Path(model_id)
+         if not model_dir.exists():
+             model_dir = Path(
+                 snapshot_download(
+                     repo_id=model_id,
+                     revision=revision,
+                     cache_dir=cache_dir,
+                     force_download=force_download,
+                     proxies=proxies,
+                     resume_download=resume_download,
+                     token=token,
+                     local_files_only=local_files_only,
+                 )
+             )
+
+         config_file = model_dir / "config.json"
+         with open(config_file, 'r') as f:
+             config = json.load(f)
+
+         latent_size = config.get('latent_size')
+         if latent_size is None:
+             raise ValueError("The configuration file is missing the 'latent_size' key.")
+
+         model = cls(latent_size, **model_kwargs)
+
+         model_file = model_dir / "model_conv_vae_256_epoch_304.pth"
+         if not model_file.exists():
+             raise FileNotFoundError(f"The model checkpoint '{model_file}' does not exist.")
+
+         state_dict = torch.load(model_file, map_location=map_location)
+
+         new_state_dict = {}  # strip the '_orig_mod.' prefix that torch.compile adds to checkpoint keys
+         for k, v in state_dict.items():
+             if k.startswith('_orig_mod.'):
+                 new_state_dict[k[len('_orig_mod.'):]] = v
+             else:
+                 new_state_dict[k] = v
+
+         model.load_state_dict(new_state_dict, strict=strict)
+         model.to(map_location)
+
+         return model
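
For reference, a minimal usage sketch of `ConvVAE.from_pretrained` (the local path and the latent_size value below are illustrative; a local directory must hold `config.json` with a `latent_size` key plus the `.pth` checkpoint):

from model import ConvVAE

# Resolve from the Hub: downloads config.json and the checkpoint on first use
model = ConvVAE.from_pretrained("BioMike/classical_portrait_vae", map_location="cpu").eval()

# Or resolve from a local directory, e.g. one containing
#   config.json                       -> {"latent_size": 256}   (value illustrative)
#   model_conv_vae_256_epoch_304.pth  -> the weights
model = ConvVAE.from_pretrained("./my_local_checkpoint", strict=True).eval()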
morphing.py ADDED
@@ -0,0 +1,93 @@
+ import torch
+ from torchvision import transforms
+ from PIL import Image, ImageFilter
+ import gradio as gr
+ import numpy as np
+ import uuid
+
+ from model import ConvVAE
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ # Load the pretrained VAE used by the handlers below
+ # (same checkpoint as app.py).
+ model = ConvVAE.from_pretrained(
+     model_id="BioMike/classical_portrait_vae",
+     cache_dir="./model_cache",
+     map_location=str(device),
+     strict=True
+ ).eval()
+
+ transform = transforms.Compose([
+     transforms.Resize((128, 128)),
+     transforms.ToTensor(),
+     transforms.Normalize((0.5,), (0.5,))
+ ])
+
+ resize_transform = transforms.Resize((512, 512))
+
+ def load_image(image):
+     image = Image.fromarray(image).convert('RGB')
+     image = transform(image)
+     return image.unsqueeze(0).to(device)
+
+ def interpolate_vectors(v1, v2, num_steps):
+     return [v1 * (1 - alpha) + v2 * alpha for alpha in np.linspace(0, 1, num_steps)]
+
+ def infer_and_interpolate(image1, image2, num_interpolations=24):
+     image1 = load_image(image1)
+     image2 = load_image(image2)
+
+     with torch.no_grad():
+         mu1, logvar1 = model.encode(image1)
+         mu2, logvar2 = model.encode(image2)
+         interpolated_vectors = interpolate_vectors(mu1, mu2, num_interpolations)
+         decoded_images = [model.decode(vec).squeeze(0) for vec in interpolated_vectors]
+
+     return decoded_images
+
+ def create_gif(decoded_images, duration=200, apply_blur=False):
+     # Play the frames forward then backward so the GIF loops seamlessly
+     reversed_images = decoded_images[::-1]
+     all_images = decoded_images + reversed_images
+
+     pil_images = []
+     for img in all_images:
+         img = (img - img.min()) / (img.max() - img.min())
+         img = (img * 255).byte()
+         pil_img = transforms.ToPILImage()(img.cpu()).convert("RGB")
+         pil_img = resize_transform(pil_img)
+         if apply_blur:
+             pil_img = pil_img.filter(ImageFilter.GaussianBlur(radius=1))
+         pil_images.append(pil_img)
+
+     gif_filename = f"/tmp/morphing_{uuid.uuid4().hex}.gif"
+     pil_images[0].save(gif_filename, save_all=True, append_images=pil_images[1:], duration=duration, loop=0)
+
+     return gif_filename
+
+ def create_morphing_gif(image1, image2, num_interpolations=24, duration=200):
+     decoded_images = infer_and_interpolate(image1, image2, num_interpolations)
+     gif_path = create_gif(decoded_images, duration)
+
+     return gif_path
+
+ examples = [
+     ["example_images/image1.jpg", "example_images/image2.png", 24, 200],
+     ["example_images/image3.jpg", "example_images/image4.jpg", 30, 150],
+ ]
+
+ with gr.Blocks() as morphing:
+     with gr.Column():
+         with gr.Column():
+             num_interpolations = gr.Slider(minimum=2, maximum=50, value=24, step=1, label="Number of interpolations")
+             duration = gr.Slider(minimum=100, maximum=1000, value=200, step=50, label="Duration per frame (ms)")
+             generate_button = gr.Button("Generate Morphing GIF")
+             output_gif = gr.Image(label="Morphing GIF")
+         with gr.Row():
+             image1 = gr.Image(label="Upload first image", type="numpy")
+             image2 = gr.Image(label="Upload second image", type="numpy")
+
+     generate_button.click(fn=create_morphing_gif, inputs=[image1, image2, num_interpolations, duration], outputs=output_gif)
+
+     gr.Examples(examples=examples, inputs=[image1, image2, num_interpolations, duration])
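
The morphing pipeline also runs outside the Gradio UI; a small sketch driving `create_morphing_gif` directly (importing `morphing` builds the Blocks UI and loads the model as a side effect):

import numpy as np
from PIL import Image
from morphing import create_morphing_gif

# Gradio's type="numpy" images are HxWx3 uint8 arrays, so mimic that here
img_a = np.array(Image.open("example_images/image1.jpg").convert("RGB"))
img_b = np.array(Image.open("example_images/image4.jpg").convert("RGB"))

gif_path = create_morphing_gif(img_a, img_b, num_interpolations=30, duration=150)
print(gif_path)  # a file under /tmp, e.g. /tmp/morphing_<hex>.gif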
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ torch
+ torchvision
+ Pillow  # installs the `PIL` import used by the app
+ gradio
+ huggingface_hub
+ # uuid and pathlib come from the standard library; the PyPI packages of the same name are obsolete shims
vae.py ADDED
@@ -0,0 +1,69 @@
+ import torch
+ from torchvision import transforms
+ from PIL import Image
+ import gradio as gr
+ import numpy as np
+
+ from model import ConvVAE
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ # Load the pretrained VAE used by the handlers below
+ # (same checkpoint as app.py).
+ model = ConvVAE.from_pretrained(
+     model_id="BioMike/classical_portrait_vae",
+     cache_dir="./model_cache",
+     map_location=str(device),
+     strict=True
+ ).eval()
+
+ transform1 = transforms.Compose([
+     transforms.Resize((128, 128)),  # Resize the image to 128x128 for the model
+     transforms.ToTensor(),
+     transforms.Normalize((0.5,), (0.5,))
+ ])
+
+ transform2 = transforms.Compose([
+     transforms.Resize((512, 512))  # Resize the image to 512x512 for display
+ ])
+
+ def load_image(image):
+     image = Image.fromarray(image).convert('RGB')
+     image = transform1(image)
+     return image.unsqueeze(0).to(device)
+
+ def infer_image(image, noise_level):
+     image = load_image(image)
+     with torch.no_grad():
+         mu, logvar = model.encode(image)
+         std = torch.exp(0.5 * logvar)
+         eps = torch.randn_like(std) * noise_level
+         z = mu + eps * std
+         decoded_image = model.decode(z)
+
+     # Undo the (0.5, 0.5) normalization, then clamp to [0, 1]
+     decoded_image = decoded_image.squeeze().permute(1, 2, 0).cpu().numpy().astype(np.float32) * 0.5 + 0.5
+     decoded_image = np.clip(decoded_image, 0, 1)
+
+     decoded_image = Image.fromarray((decoded_image * 255).astype(np.uint8))
+     decoded_image = transform2(decoded_image)
+     return np.array(decoded_image)
+
+ examples = [
+     ["example_images/image1.jpg", 0.1],
+     ["example_images/image2.png", 0.5],
+     ["example_images/image3.jpg", 1.0],
+ ]
+
+ with gr.Blocks() as vae:
+     noise_slider = gr.Slider(0, 10, value=0.01, step=0.01, label="Noise Level")
+     with gr.Row():
+         with gr.Column():
+             input_image = gr.Image(label="Upload an image", type="numpy")
+         with gr.Column():
+             output_image = gr.Image(label="Reconstructed Image")
+
+     input_image.change(fn=infer_image, inputs=[input_image, noise_slider], outputs=output_image)
+     noise_slider.change(fn=infer_image, inputs=[input_image, noise_slider], outputs=output_image)
+
+     gr.Examples(examples=examples, inputs=[input_image, noise_slider])
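
As a sanity check on the reparameterization, `noise_level=0` zeroes `eps`, so `z = mu + eps * std` collapses to the posterior mean and the reconstruction becomes deterministic (a sketch; the example image ships with this commit):

import numpy as np
from PIL import Image
from vae import infer_image

portrait = np.array(Image.open("example_images/image2.png").convert("RGB"))

# With zero noise, repeated calls return the identical reconstruction
recon_a = infer_image(portrait, 0.0)
recon_b = infer_image(portrait, 0.0)
assert np.array_equal(recon_a, recon_b)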