Haiyu Wu committed
Commit: 918e8a0
Parent(s): ae82d2a

vec2face demo
Browse files
- app.py +247 -0
- configs/vec2face/vqgan.yaml +16 -0
- models/__init__.py +1 -0
- models/__pycache__/__init__.cpython-38.pyc +0 -0
- models/__pycache__/iresnet.cpython-38.pyc +0 -0
- models/iresnet.py +150 -0
- pixel_generator/vec2face/__pycache__/im_decoder.cpython-38.pyc +0 -0
- pixel_generator/vec2face/__pycache__/model_vec2face.cpython-38.pyc +0 -0
- pixel_generator/vec2face/im_decoder.py +209 -0
- pixel_generator/vec2face/model_vec2face.py +357 -0
- pixel_generator/vec2face/taming/models/__pycache__/vqgan.cpython-37.pyc +0 -0
- pixel_generator/vec2face/taming/models/__pycache__/vqgan.cpython-38.pyc +0 -0
- pixel_generator/vec2face/taming/models/vqgan.py +67 -0
- pixel_generator/vec2face/taming/modules/__pycache__/discriminator_loss.cpython-38.pyc +0 -0
- pixel_generator/vec2face/taming/modules/__pycache__/discriminator_loss.cpython-39.pyc +0 -0
- pixel_generator/vec2face/taming/modules/discriminator/__pycache__/model.cpython-38.pyc +0 -0
- pixel_generator/vec2face/taming/modules/discriminator/model.py +113 -0
- pixel_generator/vec2face/taming/modules/discriminator_loss.py +128 -0
- pixel_generator/vec2face/taming/modules/util.py +130 -0
- requirements.txt +10 -0
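
The functions added in `app.py` below can also be driven without the Gradio UI. A minimal sketch (not part of the commit), assuming the repository root is the working directory so the weight downloads and `sys.path` tweak in `app.py` behave as they do in the demo:

```python
# Hypothetical driver script; it only reuses functions defined in app.py below.
# image_generation() internally calls initialize_models(), which downloads the
# generator, ArcFace, MagFace and SixDRepNet checkpoints from BooBooWu/Vec2Face.
from app import image_generation

# input_image=None -> a random 512-D identity vector is sampled.
# Arguments: image, minimum quality, use_target_pose, target pose, dimensions to perturb.
images = image_generation(None, 24, False, 0, [0, 1, 2, 3])
for i, im in enumerate(images):  # six PIL images
    im.save(f"sample_{i}.png")
```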
app.py
ADDED
@@ -0,0 +1,247 @@
import sys
sys.path.append('./')
import gradio as gr
import random
import numpy as np
from PIL import Image
from huggingface_hub import hf_hub_download
from models import iresnet
from sixdrepnet.model import SixDRepNet
import pixel_generator.vec2face.model_vec2face as model_vec2face
MAX_SEED = np.iinfo(np.int32).max
import torch


def sample_nearby_vectors(base_vector, epsilons=[0.3, 0.5, 0.7], percentages=[0.4, 0.4, 0.2]):
    row, col = base_vector.shape
    norm = torch.norm(base_vector, 2, 1, True)
    diff = []
    for i, eps in enumerate(epsilons):
        diff.append(np.random.normal(0, eps, (int(row * percentages[i]), col)))
    diff = np.vstack(diff)
    np.random.shuffle(diff)
    diff = torch.tensor(diff)
    generated_samples = base_vector + diff
    generated_samples = generated_samples / torch.norm(generated_samples, 2, 1, True) * norm
    return generated_samples


def initialize_models():
    device = torch.device('cpu')
    pose_model_weights = hf_hub_download(repo_id="BooBooWu/Vec2Face", filename="weights/6DRepNet_300W_LP_AFLW2000.pth", local_dir="./")
    id_model_weights = hf_hub_download(repo_id="BooBooWu/Vec2Face", filename="weights/arcface-r100-glint360k.pth", local_dir="./")
    quality_model_weights = hf_hub_download(repo_id="BooBooWu/Vec2Face", filename="weights/magface-r100-glint360k.pth", local_dir="./")
    generator_weights = hf_hub_download(repo_id="BooBooWu/Vec2Face", filename="weights/vec2face_generator.pth", local_dir="./")
    generator = model_vec2face.__dict__["vec2face_vit_base_patch16"](mask_ratio_mu=0.15, mask_ratio_std=0.25,
                                                                     mask_ratio_min=0.1, mask_ratio_max=0.5,
                                                                     use_rep=True,
                                                                     rep_dim=512,
                                                                     rep_drop_prob=0.,
                                                                     use_class_label=False)
    generator = generator.to(device)
    checkpoint = torch.load(generator_weights, map_location='cpu')
    generator.load_state_dict(checkpoint['model_vec2face'])
    generator.eval()

    id_model = iresnet("100", fp16=True).to(device)
    id_model.load_state_dict(torch.load(id_model_weights, map_location='cpu'))
    id_model.eval()

    quality_model = iresnet("100", fp16=True).to(device)
    quality_model.load_state_dict(torch.load(quality_model_weights, map_location='cpu'))
    quality_model.eval()

    pose_model = SixDRepNet(backbone_name='RepVGG-B1g2',
                            backbone_file='',
                            deploy=True,
                            pretrained=False
                            ).to(device)
    pose_model.load_state_dict(torch.load(pose_model_weights))
    pose_model.eval()

    return generator, id_model, pose_model, quality_model


def image_generation(input_image, quality, use_target_pose, pose, dimension):
    generator, id_model, pose_model, quality_model = initialize_models()

    generated_images = []
    if input_image is None:
        feature = np.random.normal(0, 1.0, (1, 512))
    else:
        input_image = np.transpose(input_image, (2, 0, 1))
        input_image = torch.from_numpy(input_image).unsqueeze(0).float()
        input_image.div_(255).sub_(0.5).div_(0.5)
        feature = id_model(input_image).clone().detach().cpu().numpy()

    if not use_target_pose:
        features = []
        norm = np.linalg.norm(feature, 2, 1, True)
        for i in np.arange(0, 4.8, 0.8):
            updated_feature = feature
            updated_feature[0][dimension] = feature[0][dimension] + i

            updated_feature = updated_feature / np.linalg.norm(updated_feature, 2, 1, True) * norm

            features.append(updated_feature)
        features = torch.tensor(np.vstack(features)).float()
        if quality > 25:
            images, _ = generator.gen_image(features, quality_model, id_model, q_target=quality)
        else:
            _, _, images, *_ = generator(features)
    else:
        features = torch.repeat_interleave(torch.tensor(feature), 6, dim=0)
        features = sample_nearby_vectors(features, [0.7], [1]).float()
        if quality > 25 and pose > 20:
            images, _ = generator.gen_image(features, quality_model, id_model, pose_model=pose_model,
                                            q_target=quality, pose=pose, class_rep=features)
        else:
            _, _, images, *_ = generator(features)

    images = ((images.permute(0, 2, 3, 1).detach().cpu().numpy() + 1) / 2 * 255).astype(np.uint8)
    for image in images:
        generated_images.append(Image.fromarray(image))
    return generated_images


def process_input(image_input, num1, num2, num3, num4, random_seed, target_quality, use_target_pose, target_pose):
    # Ensure all dimension numbers are within [0, 512)
    num1, num2, num3, num4 = [max(0, min(int(n), 511)) for n in [num1, num2, num3, num4]]

    # Use the provided random seed
    random.seed(random_seed)
    np.random.seed(random_seed)
    if image_input is None:
        input_data = None
    else:
        # Process the uploaded image
        input_data = Image.open(image_input)
        input_data = np.array(input_data.resize((112, 112)))

    generated_images = image_generation(input_data, target_quality, use_target_pose, target_pose, [num1, num2, num3, num4])

    return generated_images


def select_image(value, images):
    # Convert the float value (0 to 4) to an integer index (0 to 5)
    index = int(value / 0.8)
    return images[index]


def toggle_inputs(use_pose):
    return [
        gr.update(visible=use_pose, interactive=use_pose),  # target_pose
        gr.update(interactive=not use_pose),  # num1
        gr.update(interactive=not use_pose),  # num2
        gr.update(interactive=not use_pose),  # num3
        gr.update(interactive=not use_pose),  # num4
    ]


def main():
    with gr.Blocks() as demo:
        title = r"""
        <h1 align="center">Vec2Face: Scaling Face Dataset Generation with Loosely Constrained Vectors</h1>
        """

        description = r"""
        <b>Official 🤗 Gradio demo</b> for <a href='https://github.com/HaiyuWu/vec2face' target='_blank'><b>Vec2Face: Scaling Face Dataset Generation with Loosely Constrained Vectors</b></a>.<br>

        How to use:<br>
        1. Upload an image with a cropped face or directly click the <b>Submit</b> button; six images will be shown on the right.
        2. You can control the image quality and pose, and modify the values in the target dimensions to change the output images.
        3. The output shows six results of dimension modification or six pose-edited images.
        4. Since the demo is CPU-based, higher quality and larger poses take longer to run.
        5. Enjoy! 😊
        """

        gr.Markdown(title)
        gr.Markdown(description)
        with gr.Row():
            with gr.Column():
                image_file = gr.Image(label="Upload an image (optional)", type="filepath")

                gr.Markdown("""
                ## Dimension Modification
                Enter the values for the dimensions you want to modify (0-511).
                """)

                with gr.Row():
                    num1 = gr.Number(label="Dimension 1", value=0, minimum=0, maximum=511, step=1)
                    num2 = gr.Number(label="Dimension 2", value=0, minimum=0, maximum=511, step=1)
                    num3 = gr.Number(label="Dimension 3", value=0, minimum=0, maximum=511, step=1)
                    num4 = gr.Number(label="Dimension 4", value=0, minimum=0, maximum=511, step=1)

                random_seed = gr.Number(label="Random Seed", value=42, minimum=0, maximum=MAX_SEED, step=1)
                target_quality = gr.Slider(label="Minimum Quality", minimum=22, maximum=35, step=1, value=24)

                with gr.Row():
                    use_target_pose = gr.Checkbox(label="Use Target Pose")
                    target_pose = gr.Slider(label="Target Pose", value=0, minimum=0, maximum=90, step=1, visible=False)

                submit = gr.Button("Submit", variant="primary")

                gr.Markdown("""
                ## Usage tips for Vec2Face
                - Directly clicking the "Submit" button will give you results from a randomly sampled vector.
                - If you want to modify more dimensions, please write your own code. Code snippets in the [Vec2Face repo](https://github.com/HaiyuWu/vec2face) might be helpful.
                - If you want to create extreme-pose images (e.g., >70), please do not set the image quality higher than 27.
                - <span style="color: red;">!</span> <span style="color: red;">!</span> <span style="color: red;">!</span> **Due to the limitations of SixDRepNet (the pose estimator), pose editing results might be corrupted/incorrect. For better performance, you can integrate other pose estimators.** <span style="color: red;">!</span> <span style="color: red;">!</span> <span style="color: red;">!</span>
                - For a better experience, we suggest running the code on a GPU machine.
                """)

            with gr.Column():
                gallery = gr.Image(label="Generated Image")
                incremental_value_slider = gr.Slider(
                    label="Result of dimension modification or results of pose images",
                    minimum=0, maximum=4, step=0.8, value=0
                )
                gr.Markdown("""
                - These values are added to the chosen dimensions (before normalization); **please ignore this slider if pose editing is on**.
                """)

        use_target_pose.change(
            fn=toggle_inputs,
            inputs=[use_target_pose],
            outputs=[target_pose, num1, num2, num3, num4]
        )

        generated_images = gr.State([])

        submit.click(
            fn=process_input,
            inputs=[image_file, num1, num2, num3, num4, random_seed, target_quality, use_target_pose, target_pose],
            outputs=[generated_images]
        ).then(
            fn=select_image,
            inputs=[incremental_value_slider, generated_images],
            outputs=[gallery]
        )

        incremental_value_slider.change(
            fn=select_image,
            inputs=[incremental_value_slider, generated_images],
            outputs=[gallery]
        )
        article = r"""
        ---
        📝 **Citation**
        <br>
        If our work is helpful for your research or applications, please cite us via:
        ```bibtex
        @article{wu2024vec2face,
          title={Vec2Face: Scaling Face Dataset Generation with Loosely Constrained Vectors},
          author={Wu, Haiyu and Singh, Jaskirat and Tian, Sicong and Zheng, Liang and Bowyer, Kevin W.},
          year={2024}
        }
        ```
        📧 **Contact**
        <br>
        If you have any questions, please feel free to open an issue or reach us directly at <b>[email protected]</b>.
        """
        gr.Markdown(article)

    demo.launch(share=True)


if __name__ == "__main__":
    main()
configs/vec2face/vqgan.yaml
ADDED
@@ -0,0 +1,16 @@
model:
  target: pixel_generator.vec2face.taming.models.vqgan.VQModel
  params:
    embed_dim: 256
    n_embed: 1024
    ddconfig:
      double_z: False
      z_channels: 256
      resolution: 112
      in_channels: 3
      out_ch: 3
      ch: 128
      ch_mult: [ 1,1,2,2,4]  # num_down = len(ch_mult)-1
      num_res_blocks: 2
      attn_resolutions: [16]
      dropout: 0.0
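
This YAML is not used to build a full VQGAN here; `model_vec2face.py` (added below) reads only the `ddconfig` block to configure the pixel decoder. A short sketch of that wiring, mirroring the lines in `MaskedGenerativeEncoderViT.__init__`:

```python
# Sketch of how the config above is consumed (see model_vec2face.py in this commit).
from omegaconf import OmegaConf
from pixel_generator.vec2face.im_decoder import Decoder

vqgan_config = OmegaConf.load('configs/vec2face/vqgan.yaml').model
# ddconfig: 112x112 output, z_channels=256, ch_mult=[1,1,2,2,4] -> the Decoder
# upsamples a 7x7 latent map back to the image resolution.
im_decoder = Decoder(**vqgan_config.params.ddconfig)
```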
models/__init__.py
ADDED
@@ -0,0 +1 @@
from .iresnet import iresnet
models/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (140 Bytes).
models/__pycache__/iresnet.cpython-38.pyc
ADDED
Binary file (4.21 kB).
models/iresnet.py
ADDED
@@ -0,0 +1,150 @@
import torch
from torch import nn
from torch.utils.checkpoint import checkpoint

using_ckpt = False


def conv3x3(in_planes, out_planes, stride=1, groups=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes,
                     out_planes,
                     kernel_size=3,
                     stride=stride,
                     padding=1,
                     groups=groups,
                     bias=False)


def conv1x1(in_planes, out_planes, stride=1):
    """1x1 convolution"""
    return nn.Conv2d(in_planes,
                     out_planes,
                     kernel_size=1,
                     stride=stride,
                     bias=False)


class IBasicBlock(nn.Module):
    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(IBasicBlock, self).__init__()
        self.bn1 = nn.BatchNorm2d(inplanes, eps=1e-05,)
        self.conv1 = conv3x3(inplanes, planes)
        self.bn2 = nn.BatchNorm2d(planes, eps=1e-05,)
        self.prelu = nn.PReLU(planes)
        self.conv2 = conv3x3(planes, planes, stride)
        self.bn3 = nn.BatchNorm2d(planes, eps=1e-05,)
        self.downsample = downsample
        self.stride = stride

    def forward_impl(self, x):
        identity = x
        out = self.bn1(x)
        out = self.conv1(out)
        out = self.bn2(out)
        out = self.prelu(out)
        out = self.conv2(out)
        out = self.bn3(out)
        if self.downsample is not None:
            identity = self.downsample(x)
        out += identity
        return out

    def forward(self, x):
        if self.training and using_ckpt:
            return checkpoint(self.forward_impl, x)
        else:
            return self.forward_impl(x)


class IResNet(nn.Module):
    def __init__(self,
                 block, layers, dropout=0.4, num_features=512, zero_init_residual=False,
                 groups=1, fp16=False):
        super(IResNet, self).__init__()
        self.extra_gflops = 0.0
        self.fp16 = fp16
        self.inplanes = 64

        self.groups = groups
        self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(self.inplanes, eps=1e-05)
        self.prelu = nn.PReLU(self.inplanes)
        self.layer1 = self._make_layer(block, 64, layers[0], stride=2)
        self.layer2 = self._make_layer(block,
                                       128,
                                       layers[1],
                                       stride=2)
        self.layer3 = self._make_layer(block,
                                       256,
                                       layers[2],
                                       stride=2)
        self.layer4 = self._make_layer(block,
                                       512,
                                       layers[3],
                                       stride=2)
        self.bn2 = nn.BatchNorm2d(512, eps=1e-05,)
        self.dropout = nn.Dropout(p=dropout, inplace=True)
        self.fc = nn.Linear(512 * 7 * 7, num_features)
        self.features = nn.BatchNorm1d(num_features, eps=1e-05)
        nn.init.constant_(self.features.weight, 1.0)
        self.features.weight.requires_grad = False

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.normal_(m.weight, 0, 0.1)
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, IBasicBlock):
                    nn.init.constant_(m.bn2.weight, 0)

    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes, stride),
                nn.BatchNorm2d(planes, eps=1e-05, ),
            )
        layers = []
        layers.append(
            block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes
        for _ in range(1, blocks):
            layers.append(
                block(self.inplanes,
                      planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        with torch.cuda.amp.autocast(self.fp16):
            x = self.conv1(x)
            x = self.bn1(x)
            x = self.prelu(x)
            x = self.layer1(x)
            x = self.layer2(x)
            x = self.layer3(x)
            x = self.layer4(x)
            x = self.bn2(x)
            x = torch.flatten(x, 1)
            x = self.dropout(x)
        x = self.fc(x.float() if self.fp16 else x)
        x = self.features(x)
        return x


def iresnet(arch, pretrained=False, **kwargs):
    layer_dict = {"18": [2, 2, 2, 2],
                  "34": [3, 4, 6, 3],
                  "50": [3, 4, 14, 3],
                  "100": [3, 13, 30, 3],
                  "152": [3, 8, 36, 3],
                  "200": [3, 13, 30, 3]}
    model = IResNet(IBasicBlock, layer_dict[arch], **kwargs)
    if pretrained:
        raise ValueError()
    return model
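
For reference, a sketch of how `app.py` uses this network to turn a face crop into the 512-D identity vector that conditions Vec2Face; the weight path matches what the demo downloads, while `face.jpg` is just a placeholder filename:

```python
# Sketch: 112x112 face crop -> 512-D ArcFace feature, using app.py's preprocessing.
import numpy as np
import torch
from PIL import Image
from models import iresnet

id_model = iresnet("100", fp16=True)
id_model.load_state_dict(torch.load("weights/arcface-r100-glint360k.pth", map_location="cpu"))
id_model.eval()

img = np.array(Image.open("face.jpg").resize((112, 112)))        # HWC uint8
x = torch.from_numpy(np.transpose(img, (2, 0, 1))).unsqueeze(0).float()
x.div_(255).sub_(0.5).div_(0.5)                                   # scale to [-1, 1]
with torch.no_grad():
    feature = id_model(x)                                         # shape (1, 512)
```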
pixel_generator/vec2face/__pycache__/im_decoder.cpython-38.pyc
ADDED
Binary file (4.74 kB).
pixel_generator/vec2face/__pycache__/model_vec2face.cpython-38.pyc
ADDED
Binary file (12.3 kB).
pixel_generator/vec2face/im_decoder.py
ADDED
@@ -0,0 +1,209 @@
import torch
import torch.nn as nn


def nonlinearity(x):
    # swish
    return x*torch.sigmoid(x)


def Normalize(in_channels):
    return torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)


class Upsample(nn.Module):
    def __init__(self, in_channels, with_conv):
        super().__init__()
        self.with_conv = with_conv
        if self.with_conv:
            self.conv = torch.nn.Conv2d(in_channels,
                                        in_channels,
                                        kernel_size=3,
                                        stride=1,
                                        padding=1)

    def forward(self, x):
        x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
        if self.with_conv:
            x = self.conv(x)
        return x


class Downsample(nn.Module):
    def __init__(self, in_channels, with_conv):
        super().__init__()
        self.with_conv = with_conv
        if self.with_conv:
            # no asymmetric padding in torch conv, must do it ourselves
            self.conv = torch.nn.Conv2d(in_channels,
                                        in_channels,
                                        kernel_size=3,
                                        stride=2,
                                        padding=0)

    def forward(self, x):
        if self.with_conv:
            pad = (0,1,0,1)
            x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
            x = self.conv(x)
        else:
            x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
        return x


class ResnetBlock(nn.Module):
    def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False,
                 dropout, temb_channels=512):
        super().__init__()
        self.in_channels = in_channels
        out_channels = in_channels if out_channels is None else out_channels
        self.out_channels = out_channels
        self.use_conv_shortcut = conv_shortcut

        self.norm1 = Normalize(in_channels)
        self.conv1 = torch.nn.Conv2d(in_channels,
                                     out_channels,
                                     kernel_size=3,
                                     stride=1,
                                     padding=1,
                                     bias=False)
        if temb_channels > 0:
            self.temb_proj = torch.nn.Linear(temb_channels,
                                             out_channels)
        self.norm2 = Normalize(out_channels)
        self.dropout = torch.nn.Dropout(dropout)
        self.conv2 = torch.nn.Conv2d(out_channels,
                                     out_channels,
                                     kernel_size=3,
                                     stride=1,
                                     padding=1,
                                     bias=False)
        if self.in_channels != self.out_channels:
            if self.use_conv_shortcut:
                self.conv_shortcut = torch.nn.Conv2d(out_channels,
                                                     out_channels,
                                                     kernel_size=3,
                                                     stride=1,
                                                     padding=1,
                                                     bias=False)
            else:
                self.nin_shortcut = torch.nn.Conv2d(out_channels,
                                                    out_channels,
                                                    kernel_size=1,
                                                    stride=1,
                                                    padding=0,
                                                    bias=False)

    def forward(self, x, temb):
        h = x
        h = self.norm1(h)
        h = nonlinearity(h)
        h = self.conv1(h)

        if temb is not None:
            h = h + self.temb_proj(nonlinearity(temb))[:,:,None,None]

        h = self.norm2(h)
        h = nonlinearity(h)
        h = self.dropout(h)
        h = self.conv2(h)

        if self.in_channels != self.out_channels:
            if self.use_conv_shortcut:
                x = self.conv_shortcut(h)
            else:
                x = self.nin_shortcut(h)

        return x+h


class Decoder(nn.Module):
    def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
                 attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
                 resolution, z_channels, give_pre_end=False, **ignorekwargs):
        super().__init__()
        self.ch = ch
        self.temb_ch = 0
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.resolution = resolution
        self.in_channels = in_channels
        self.give_pre_end = give_pre_end

        # compute in_ch_mult, block_in and curr_res at lowest res
        in_ch_mult = (1,)+tuple(ch_mult)
        block_in = ch*ch_mult[self.num_resolutions-1]
        curr_res = resolution // 2**(self.num_resolutions-1)

        # z to block_in
        self.conv_in = torch.nn.Conv2d(z_channels,
                                       block_in,
                                       kernel_size=3,
                                       stride=1,
                                       padding=1)

        # middle
        self.mid = nn.Module()
        self.mid.block_1 = ResnetBlock(in_channels=block_in,
                                       out_channels=block_in,
                                       temb_channels=self.temb_ch,
                                       dropout=dropout)
        self.mid.block_2 = ResnetBlock(in_channels=block_in,
                                       out_channels=block_in,
                                       temb_channels=self.temb_ch,
                                       dropout=dropout)

        # upsampling
        self.up = nn.ModuleList()
        for i_level in reversed(range(self.num_resolutions)):
            block = nn.ModuleList()
            block_out = ch*ch_mult[i_level]
            for i_block in range(self.num_res_blocks):
                block.append(ResnetBlock(in_channels=block_in,
                                         out_channels=block_out,
                                         temb_channels=self.temb_ch,
                                         dropout=dropout))
                block_in = block_out
            up = nn.Module()
            up.block = block
            if i_level != 0:
                up.upsample = Upsample(block_in, resamp_with_conv)
                curr_res = curr_res * 2
            self.up.insert(0, up)  # prepend to get consistent order

        # end
        self.norm_out = Normalize(block_in)
        self.conv_out = torch.nn.Conv2d(block_in,
                                        out_ch,
                                        kernel_size=3,
                                        stride=1,
                                        padding=1)

    def forward(self, z):
        self.last_z_shape = z.shape

        # timestep embedding
        temb = None

        # z to block_in
        h = self.conv_in(z)

        # middle
        h = self.mid.block_1(h, temb)
        h = self.mid.block_2(h, temb)

        # upsampling
        for i_level in reversed(range(self.num_resolutions)):
            for i_block in range(self.num_res_blocks):
                h = self.up[i_level].block[i_block](h, temb)
            if i_level != 0:
                h = self.up[i_level].upsample(h)

        # end
        if self.give_pre_end:
            return h

        h = self.norm_out(h)
        h = nonlinearity(h)
        h = self.conv_out(h)
        return h
pixel_generator/vec2face/model_vec2face.py
ADDED
@@ -0,0 +1,357 @@
from functools import partial
import torch
import torch.nn as nn
import torch.optim as optim
from timm.models.vision_transformer import PatchEmbed, DropPath, Mlp
from omegaconf import OmegaConf
import numpy as np
import scipy.stats as stats
from pixel_generator.vec2face.im_decoder import Decoder
from sixdrepnet.model import utils


class Attention(nn.Module):
    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
        self.scale = qk_scale or head_dim ** -0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)
        attn = (q.float() @ k.float().transpose(-2, -1)) * self.scale
        attn = attn - torch.max(attn, dim=-1, keepdim=True)[0]
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)
        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x, attn


class Block(nn.Module):

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)

    def forward(self, x, return_attention=False):
        with torch.cuda.amp.autocast(enabled=False):
            if return_attention:
                _, attn = self.attn(self.norm1(x))
                return attn
            else:
                y, _ = self.attn(self.norm1(x))
                x = x + self.drop_path(y)
                x = x + self.drop_path(self.mlp(self.norm2(x)))
                return x


class LabelSmoothingCrossEntropy(nn.Module):
    """ NLL loss with label smoothing.
    """

    def __init__(self, smoothing=0.1):
        super(LabelSmoothingCrossEntropy, self).__init__()
        assert smoothing < 1.0
        self.smoothing = smoothing
        self.confidence = 1. - smoothing

    def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        logprobs = torch.nn.functional.log_softmax(x, dim=-1)
        nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1))
        nll_loss = nll_loss.squeeze(1)
        smooth_loss = -logprobs.mean(dim=-1)
        loss = self.confidence * nll_loss + self.smoothing * smooth_loss
        return loss


class BertEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, hidden_size, max_position_embeddings, dropout=0.1):
        super().__init__()
        self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(hidden_size, eps=1e-6)
        self.dropout = nn.Dropout(dropout)
        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer("position_ids", torch.arange(max_position_embeddings).expand((1, -1)))

        torch.nn.init.normal_(self.position_embeddings.weight, std=.02)

    def forward(
            self, input_ids
    ):
        input_shape = input_ids.size()

        seq_length = input_shape[1]

        position_ids = self.position_ids[:, :seq_length]

        position_embeddings = self.position_embeddings(position_ids)
        embeddings = input_ids + position_embeddings

        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


class MaskedGenerativeEncoderViT(nn.Module):
    """ Masked Autoencoder with VisionTransformer backbone
    """

    def __init__(self, img_size=112, patch_size=7, in_chans=3,
                 embed_dim=1024, depth=24, num_heads=16,
                 decoder_embed_dim=512, decoder_depth=8, decoder_num_heads=16,
                 mlp_ratio=4., norm_layer=nn.LayerNorm, norm_pix_loss=False,
                 mask_ratio_min=0.5, mask_ratio_max=1.0, mask_ratio_mu=0.55, mask_ratio_std=0.25,
                 use_rep=True, rep_dim=512,
                 rep_drop_prob=0.0,
                 use_class_label=False):
        super().__init__()
        assert not (use_rep and use_class_label)

        # --------------------------------------------------------------------------
        vqgan_config = OmegaConf.load('configs/vec2face/vqgan.yaml').model
        self.token_emb = BertEmbeddings(hidden_size=embed_dim,
                                        max_position_embeddings=49 + 1,
                                        dropout=0.1)
        self.use_rep = use_rep
        self.use_class_label = use_class_label
        if self.use_rep:
            print("Use representation as condition!")
            self.latent_prior_proj_f = nn.Linear(rep_dim, embed_dim, bias=True)
        # CFG config
        self.rep_drop_prob = rep_drop_prob
        self.feature_token = nn.Linear(1, 49, bias=True)
        self.center_token = nn.Linear(embed_dim, 49, bias=True)
        self.im_decoder = Decoder(**vqgan_config.params.ddconfig)
        self.im_decoder_proj = nn.Linear(embed_dim, vqgan_config.params.ddconfig.z_channels)

        # Vec2Face variant masking ratio
        self.mask_ratio_min = mask_ratio_min
        self.mask_ratio_generator = stats.truncnorm((mask_ratio_min - mask_ratio_mu) / mask_ratio_std,
                                                    (mask_ratio_max - mask_ratio_mu) / mask_ratio_std,
                                                    loc=mask_ratio_mu, scale=mask_ratio_std)
        # --------------------------------------------------------------------------
        # Vec2Face encoder specifics
        dropout_rate = 0.1
        self.patch_embed = PatchEmbed(img_size, patch_size, in_chans, embed_dim)
        num_patches = self.patch_embed.num_patches

        self.blocks = nn.ModuleList([
            Block(embed_dim, num_heads, mlp_ratio, qkv_bias=True, qk_scale=None, norm_layer=norm_layer,
                  drop=dropout_rate, attn_drop=dropout_rate)
            for i in range(depth)])
        self.norm = norm_layer(embed_dim)

        # --------------------------------------------------------------------------
        # Vec2Face decoder specifics
        self.decoder_embed = nn.Linear(embed_dim, decoder_embed_dim, bias=True)
        self.pad_with_cls_token = True

        self.decoder_pos_embed_learned = nn.Parameter(
            torch.zeros(1, num_patches + 1, decoder_embed_dim), requires_grad=True)  # learnable pos embedding

        self.decoder_blocks = nn.ModuleList([
            Block(decoder_embed_dim, decoder_num_heads, mlp_ratio, qkv_bias=True, qk_scale=None, norm_layer=norm_layer,
                  drop=dropout_rate, attn_drop=dropout_rate)
            for i in range(decoder_depth)])

        self.decoder_norm = norm_layer(decoder_embed_dim)
        # --------------------------------------------------------------------------
        self.initialize_weights()

    def initialize_weights(self):
        w = self.patch_embed.proj.weight.data
        torch.nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
        torch.nn.init.normal_(self.decoder_pos_embed_learned, std=.02)
        torch.nn.init.xavier_uniform_(self.feature_token.weight)
        torch.nn.init.xavier_uniform_(self.center_token.weight)
        torch.nn.init.xavier_uniform_(self.latent_prior_proj_f.weight)
        torch.nn.init.xavier_uniform_(self.decoder_embed.weight)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            # we use xavier_uniform following official JAX ViT:
            torch.nn.init.xavier_uniform_(m.weight)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def forward_encoder(self, rep):
        # expand to feature map
        device = rep.device
        encode_feature = self.latent_prior_proj_f(rep)
        feature_token = self.feature_token(encode_feature.unsqueeze(-1)).permute(0, 2, 1)

        gt_indices = torch.cat((encode_feature.unsqueeze(1), feature_token), dim=1).clone().detach()

        # masked row indices
        bsz, seq_len, _ = feature_token.size()
        mask_ratio_min = self.mask_ratio_min
        mask_rate = self.mask_ratio_generator.rvs(1)[0]

        num_dropped_tokens = int(np.ceil(seq_len * mask_ratio_min))
        num_masked_tokens = int(np.ceil(seq_len * mask_rate))

        # it is possible that two elements of the noise is the same, so do a while loop to avoid it
        while True:
            noise = torch.rand(bsz, seq_len, device=rep.device)  # noise in [0, 1]
            sorted_noise, _ = torch.sort(noise, dim=1)  # ascend: small is remove, large is keep
            cutoff_drop = sorted_noise[:, num_dropped_tokens - 1:num_dropped_tokens]
            cutoff_mask = sorted_noise[:, num_masked_tokens - 1:num_masked_tokens]
            token_drop_mask = (noise <= cutoff_drop).float()
            token_all_mask = (noise <= cutoff_mask).float()
            if token_drop_mask.sum() == bsz * num_dropped_tokens and \
                    token_all_mask.sum() == bsz * num_masked_tokens:
                break
            else:
                print("Rerandom the noise!")
        token_all_mask_bool = token_all_mask.bool()
        encode_feature_expanded = encode_feature.unsqueeze(1).expand(-1, feature_token.shape[1], -1)
        feature_token[token_all_mask_bool] = encode_feature_expanded[token_all_mask_bool]

        # concatenate with image feature
        feature_token = torch.cat([encode_feature.unsqueeze(1), feature_token], dim=1)
        token_drop_mask = torch.cat([torch.zeros(feature_token.size(0), 1).to(device), token_drop_mask], dim=1)
        token_all_mask = torch.cat([torch.zeros(feature_token.size(0), 1).to(device), token_all_mask], dim=1)

        # bert embedding
        input_embeddings = self.token_emb(feature_token)

        bsz, seq_len, emb_dim = input_embeddings.shape

        # dropping
        token_keep_mask = 1 - token_drop_mask
        input_embeddings_after_drop = input_embeddings[token_keep_mask.nonzero(as_tuple=True)].reshape(bsz, -1, emb_dim)

        # apply Transformer blocks
        x = input_embeddings_after_drop
        for blk in self.blocks:
            x = blk(x)
        x = self.norm(x)
        return x, gt_indices, token_drop_mask, token_all_mask

    def forward_decoder(self, x, token_drop_mask, token_all_mask):
        # embed incomplete feature map
        x = self.decoder_embed(x)
        # fill masked positions with image feature
        mask_tokens = x[:, 0:1].repeat(1, token_all_mask.shape[1], 1)
        x_after_pad = mask_tokens.clone()
        x_after_pad[(1 - token_drop_mask).nonzero(as_tuple=True)] = x.reshape(x.shape[0] * x.shape[1], x.shape[2])
        x_after_pad = torch.where(token_all_mask.unsqueeze(-1).bool(), mask_tokens, x_after_pad)
        # add pos embed
        x = x_after_pad + self.decoder_pos_embed_learned

        # apply Transformer blocks
        for blk in self.decoder_blocks:
            x = blk(x)

        logits = self.decoder_norm(x)
        bsz, _, emb_dim = logits.shape
        # an image decoder
        decoder_proj = self.im_decoder_proj(logits[:, 1:, :].reshape(bsz, 7, 7, emb_dim)).permute(0, 3, 1, 2)
        return decoder_proj, logits

    def get_last_layer(self):
        return self.im_decoder.conv_out.weight

    def forward(self, rep):
        last_layer = self.get_last_layer()
        latent, gt_indices, token_drop_mask, token_all_mask = self.forward_encoder(rep)
        decoder_proj, logits = self.forward_decoder(latent, token_drop_mask, token_all_mask)
        image = self.im_decoder(decoder_proj)

        return gt_indices, logits, image, last_layer, token_all_mask

    def gen_image(self, rep, quality_model, fr_model, pose_model=None, age_model=None, class_rep=None,
                  num_iter=1, lr=1e-1, q_target=27, pose=60):
        rep_copy = rep.clone().detach().requires_grad_(True)
        optm = optim.Adam([rep_copy], lr=lr)

        i = 0
        while i < num_iter:
            latent, _, token_drop_mask, token_all_mask = self.forward_encoder(rep_copy)
            decoder_proj, _ = self.forward_decoder(latent, token_drop_mask, token_all_mask)
            image = self.im_decoder(decoder_proj).clip(max=1., min=-1.)
            # feature comparison
            out_feature = fr_model(image)
            if class_rep is None:
                id_loss = torch.mean(1 - torch.cosine_similarity(out_feature, rep))
            else:
                distance = 1 - torch.cosine_similarity(out_feature, class_rep)
                id_loss = torch.mean(torch.where(distance > 0.5, distance, torch.zeros_like(distance)))
            quality = quality_model(image)
            norm = torch.norm(quality, 2, 1, True)
            q_loss = torch.where(norm < q_target, q_target - norm, torch.zeros_like(norm))

            pose_loss = 0
            if pose_model is not None:
                # sixdrepnet
                bgr_img = image[:, [2, 1, 0], :, :]
                pose_info = pose_model(((bgr_img + 1) / 2))
                pose_info = utils.compute_euler_angles_from_rotation_matrices(
                    pose_info) * 180 / np.pi
                yaw_loss = torch.abs(pose - torch.abs(pose_info[:, 1].clip(min=-90, max=90)))
                pose_loss = torch.mean(yaw_loss)
            q_loss = torch.mean(q_loss)
            if pose_loss > 5 or id_loss > 0.4 or q_loss > 1:
                i -= 1
            loss = id_loss * 100 + q_loss + pose_loss
            optm.zero_grad()
            loss.backward(retain_graph=True)
            optm.step()
            i += 1

        latent, _, token_drop_mask, token_all_mask = self.forward_encoder(rep_copy)
        decoder_proj, _ = self.forward_decoder(latent, token_drop_mask, token_all_mask)
        image = self.im_decoder(decoder_proj).clip(max=1., min=-1.)

        return image, rep_copy.detach()


def vec2face_vit_base_patch16(**kwargs):
    model = MaskedGenerativeEncoderViT(
        patch_size=16, embed_dim=768, depth=12, num_heads=12,
        decoder_embed_dim=768, decoder_depth=8, decoder_num_heads=16,
        mlp_ratio=4, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs)
    return model


def vec2face_vit_large_patch16(**kwargs):
    model = MaskedGenerativeEncoderViT(
        patch_size=16, embed_dim=1024, depth=24, num_heads=16,
        decoder_embed_dim=1024, decoder_depth=8, decoder_num_heads=16,
        mlp_ratio=4, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs)
    return model


def vec2face_vit_huge_patch16(**kwargs):
    model = MaskedGenerativeEncoderViT(
        patch_size=16, embed_dim=1280, depth=32, num_heads=16,
        decoder_embed_dim=1280, decoder_depth=8, decoder_num_heads=16,
        mlp_ratio=4, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs)
    return model
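
A sketch of the generator entry points as `app.py` exercises them: the plain `forward` decodes identity vectors directly, while `gen_image` additionally optimizes the vectors toward a quality (and optionally pose) target. The checkpoint path and key mirror the demo; the random vectors are illustrative only:

```python
# Sketch: decode loosely constrained 512-D vectors into 112x112 face images.
import torch
import pixel_generator.vec2face.model_vec2face as model_vec2face

generator = model_vec2face.vec2face_vit_base_patch16(
    mask_ratio_mu=0.15, mask_ratio_std=0.25, mask_ratio_min=0.1, mask_ratio_max=0.5,
    use_rep=True, rep_dim=512, rep_drop_prob=0., use_class_label=False)
ckpt = torch.load("weights/vec2face_generator.pth", map_location="cpu")
generator.load_state_dict(ckpt["model_vec2face"])
generator.eval()

features = torch.randn(4, 512)           # stand-in identity vectors
_, _, images, *_ = generator(features)   # images in [-1, 1], NCHW
```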
pixel_generator/vec2face/taming/models/__pycache__/vqgan.cpython-37.pyc
ADDED
Binary file (2.45 kB).
pixel_generator/vec2face/taming/models/__pycache__/vqgan.cpython-38.pyc
ADDED
Binary file (2.48 kB).
pixel_generator/vec2face/taming/models/vqgan.py
ADDED
@@ -0,0 +1,67 @@
import torch
import pytorch_lightning as pl

from pixel_generator.mage.taming.modules.diffusionmodules.model import Encoder, Decoder
from pixel_generator.mage.taming.modules.vqvae.quantize import VectorQuantizer2 as VectorQuantizer


class VQModel(pl.LightningModule):
    def __init__(self,
                 ddconfig,
                 n_embed,
                 embed_dim,
                 ckpt_path=None,
                 ignore_keys=[],
                 image_key="image",
                 colorize_nlabels=None,
                 monitor=None,
                 remap=None,
                 sane_index_shape=False,  # tell vector quantizer to return indices as bhw
                 ):
        super().__init__()
        self.image_key = image_key
        self.encoder = Encoder(**ddconfig)
        self.decoder = Decoder(**ddconfig)
        self.quantize = VectorQuantizer(n_embed, embed_dim, beta=0.25,
                                        remap=remap, sane_index_shape=sane_index_shape)
        if ckpt_path is not None:
            self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
        self.image_key = image_key
        if colorize_nlabels is not None:
            assert type(colorize_nlabels)==int
            self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
        if monitor is not None:
            self.monitor = monitor

    def init_from_ckpt(self, path, ignore_keys=list()):
        sd = torch.load(path, map_location="cpu")
        if "state_dict" in sd.keys():
            sd = sd["state_dict"]
        keys = list(sd.keys())
        for k in keys:
            for ik in ignore_keys:
                if k.startswith(ik):
                    print("Deleting key {} from state_dict.".format(k))
                    del sd[k]
        print("Strict load")
        self.load_state_dict(sd, strict=True)
        print(f"Restored from {path}")

    def encode(self, x):
        h = self.encoder(x)
        quant, emb_loss, info = self.quantize(h)
        return quant, emb_loss, info

    def decode(self, quant):
        dec = self.decoder(quant)
        return dec

    def decode_code(self, code_b):
        quant_b = self.quantize.embed_code(code_b)
        dec = self.decode(quant_b)
        return dec

    def forward(self, input):
        quant, diff, _ = self.encode(input)
        dec = self.decode(quant)
        return dec, diff
pixel_generator/vec2face/taming/modules/__pycache__/discriminator_loss.cpython-38.pyc
ADDED
Binary file (4.47 kB).
pixel_generator/vec2face/taming/modules/__pycache__/discriminator_loss.cpython-39.pyc
ADDED
Binary file (4.46 kB).
pixel_generator/vec2face/taming/modules/discriminator/__pycache__/model.cpython-38.pyc
ADDED
Binary file (4.1 kB).
pixel_generator/vec2face/taming/modules/discriminator/model.py
ADDED
@@ -0,0 +1,113 @@
import torch.nn as nn
import torch

####################################ViT-VQGAN########################################
# https://github.com/lucidrains/parti-pytorch/blob/main/parti_pytorch/vit_vqgan.py#L171
#####################################################################################
def default(val, d):
    return val if exists(val) else d

def exists(val):
    return val is not None

def leaky_relu(p = 0.1):
    return nn.LeakyReLU(0.1)

class CrossEmbedLayer(nn.Module):
    def __init__(
        self,
        dim_in,
        kernel_sizes,
        dim_out = None,
        stride = 2
    ):
        super().__init__()
        assert all([*map(lambda t: (t % 2) == (stride % 2), kernel_sizes)])
        dim_out = default(dim_out, dim_in)

        kernel_sizes = sorted(kernel_sizes)
        num_scales = len(kernel_sizes)

        # calculate the dimension at each scale
        dim_scales = [int(dim_out / (2 ** i)) for i in range(1, num_scales)]
        dim_scales = [*dim_scales, dim_out - sum(dim_scales)]

        self.convs = nn.ModuleList([])
        for kernel, dim_scale in zip(kernel_sizes, dim_scales):
            self.convs.append(nn.Conv2d(dim_in, dim_scale, kernel, stride = stride, padding = (kernel - stride) // 2))

    def forward(self, x):
        fmaps = tuple(map(lambda conv: conv(x), self.convs))
        return torch.cat(fmaps, dim = 1)

class Block(nn.Module):
    def __init__(
        self,
        dim,
        dim_out,
        groups = 8
    ):
        super().__init__()
        self.groupnorm = nn.GroupNorm(groups, dim)
        self.activation = leaky_relu()
        self.project = nn.Conv2d(dim, dim_out, 3, padding = 1)

    def forward(self, x, scale_shift = None):
        x = self.groupnorm(x)
        x = self.activation(x)
        return self.project(x)

class ResnetBlock(nn.Module):
    def __init__(
        self,
        dim,
        dim_out = None,
        *,
        groups = 8
    ):
        super().__init__()
        dim_out = default(dim_out, dim)
        self.block = Block(dim, dim_out, groups = groups)
        self.res_conv = nn.Conv2d(dim, dim_out, 1) if dim != dim_out else nn.Identity()

    def forward(self, x):
        h = self.block(x)
        return h + self.res_conv(x)


class Discriminator(nn.Module):
    def __init__(
        self,
        dims,
        channels = 3,
        groups = 8,
        init_kernel_size = 5,
        cross_embed_kernel_sizes = (3, 7, 15)
    ):
        super().__init__()
        init_dim, *_, final_dim = dims
        dim_pairs = zip(dims[:-1], dims[1:])

        self.layers = nn.ModuleList([nn.Sequential(
            CrossEmbedLayer(channels, cross_embed_kernel_sizes, init_dim, stride = 1),
            leaky_relu()
        )])

        for dim_in, dim_out in dim_pairs:
            self.layers.append(nn.Sequential(
                nn.Conv2d(dim_in, dim_out, 4, stride = 2, padding = 1),
                leaky_relu(),
                nn.GroupNorm(groups, dim_out),
                ResnetBlock(dim_out, dim_out),
            ))

        self.to_logits = nn.Sequential(  # return 5 x 5, for PatchGAN-esque training
            nn.Conv2d(final_dim, final_dim, 1),
            leaky_relu(),
            nn.Conv2d(final_dim, 1, 4)
        )

    def forward(self, x):
        for net in self.layers:
            x = net(x)
        return self.to_logits(x)
pixel_generator/vec2face/taming/modules/discriminator_loss.py
ADDED
@@ -0,0 +1,128 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from models import iresnet
+from lpips.lpips import LPIPS
+from pytorch_msssim import SSIM
+
+
+def adopt_weight(weight, global_step, threshold=0, value=0.):
+    if global_step < threshold:
+        weight = value
+    return weight
+
+
+def hinge_d_loss(logits_real, logits_fake):
+    loss_real = torch.mean(F.relu(1. - logits_real))
+    loss_fake = torch.mean(F.relu(1. + logits_fake))
+    d_loss = 0.5 * (loss_real + loss_fake)
+    return d_loss
+
+
+def mse_d_loss(logits_real, logits_fake):
+    loss_real = torch.mean((logits_real - 1.) ** 2)
+    loss_fake = torch.mean(logits_fake ** 2)
+    d_loss = 0.5 * (loss_real + loss_fake)
+    return d_loss
+
+
+def vanilla_d_loss(logits_real, logits_fake):
+    d_loss = 0.5 * (
+        torch.mean(torch.nn.functional.softplus(-logits_real)) +
+        torch.mean(torch.nn.functional.softplus(logits_fake)))
+    return d_loss
+
+
+def create_fr_model(model_path, depth="100"):
+    model = iresnet(depth)
+    model.load_state_dict(torch.load(model_path))
+    # model.half()
+    return model
+
+
+def downscale(img: torch.tensor):
+    half_size = img.shape[-1] // 8
+    img = F.interpolate(img, size=(half_size, half_size), mode='bicubic', align_corners=False)
+    return img
+
+
+class VQLPIPSWithDiscriminator(nn.Module):
+    def __init__(self, disc_start=1000, disc_factor=1.0, disc_weight=1.0,
+                 disc_conditional=False, disc_loss="mse", id_loss="mse",
+                 fr_model="./models/arcface-r100-glint360k.pth"):
+        super().__init__()
+        assert disc_loss in ["hinge", "vanilla", "mse", "smooth"]
+        self.loss_name = disc_loss
+        self.perceptual_loss = LPIPS().eval()
+        self.discriminator_iter_start = disc_start
+        if disc_loss == "hinge":
+            self.disc_loss = hinge_d_loss
+        elif disc_loss == "vanilla":
+            self.disc_loss = vanilla_d_loss
+        elif disc_loss == "mse":
+            self.disc_loss = mse_d_loss
+        else:
+            raise ValueError(f"Unknown GAN loss '{disc_loss}'.")
+        print(f"VQLPIPSWithDiscriminator running with {disc_loss} loss.")
+        self.fr_model = create_fr_model(fr_model).eval()
+        if id_loss == "mse":
+            self.feature_loss = nn.MSELoss()
+        elif id_loss == "cosine":
+            self.feature_loss = nn.CosineSimilarity()
+        self.disc_factor = disc_factor
+        self.discriminator_weight = disc_weight
+        self.disc_conditional = disc_conditional
+        self.ssim_loss = SSIM(data_range=1, size_average=True, channel=3)
+
+    def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None):
+        if last_layer is not None:
+            nll_grads = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0]
+            g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0]
+        else:
+            nll_grads = torch.autograd.grad(nll_loss, self.last_layer[0], retain_graph=True)[0]
+            g_grads = torch.autograd.grad(g_loss, self.last_layer[0], retain_graph=True)[0]
+
+        d_weight = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4)
+        d_weight = torch.clamp(d_weight, 0.0, 1e4).detach()
+        d_weight = d_weight * self.discriminator_weight
+        return d_weight
+
+    def forward(self, im_features, gt_indices, logits, gt_img, image, discriminator, emb_loss,
+                epoch, last_layer=None, cond=None, mask=None):
+        rec_loss = (image - gt_img) ** 2
+
+        if epoch >= 0:
+            gen_feature = self.fr_model(image)
+            feature_loss = torch.mean(1 - torch.cosine_similarity(im_features, gen_feature))
+        else:
+            feature_loss = 0
+
+        p_loss = self.perceptual_loss(image, gt_img) * 2
+
+        with torch.cuda.amp.autocast(enabled=False):
+            ssim_loss = 1 - self.ssim_loss((image.float() + 1) / 2, (gt_img + 1) / 2)
+        logits_fake = discriminator(image)
+        logits_real_d = discriminator(gt_img.detach())
+        logits_fake_d = discriminator(image.detach())
+
+        if mask is None:
+            token_loss = (logits[:, 1:, :] - gt_indices[:, 1:, :])
+            token_loss = torch.mean(token_loss ** 2)
+        else:
+            token_loss = torch.abs((logits[:, 1:, :] - gt_indices[:, 1:, :])) * mask[:, 1:, None]
+            token_loss = token_loss.sum() / mask[:, 1:].sum()
+        # token_loss = 0
+        nll_loss = torch.mean(rec_loss + p_loss) + \
+                   ssim_loss + \
+                   token_loss + feature_loss + emb_loss
+        # generator update
+        g_loss = -torch.mean(logits_fake)
+
+        d_weight = self.calculate_adaptive_weight(nll_loss, g_loss, last_layer=last_layer)
+        disc_factor = adopt_weight(self.disc_factor, epoch, threshold=self.discriminator_iter_start)
+        ae_loss = nll_loss + d_weight * disc_factor * g_loss
+
+        # second pass for discriminator update
+        disc_factor = adopt_weight(self.disc_factor, epoch, threshold=self.discriminator_iter_start)
+        d_loss = disc_factor * self.disc_loss(logits_real_d, logits_fake_d)
+        return ae_loss, d_loss, token_loss, rec_loss, ssim_loss, p_loss, feature_loss
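For intuition about the pieces combined above: hinge_d_loss only penalizes the discriminator when real logits fall below +1 or fake logits rise above -1, and adopt_weight zeroes the adversarial term until the step/epoch counter passes disc_start. A small self-contained check with toy tensors (not the repo's training loop; the 5x5 logit maps are illustrative):

import torch
import torch.nn.functional as F

def adopt_weight(weight, global_step, threshold=0, value=0.):
    # same behaviour as the helper above: disable the GAN term before `threshold`
    return value if global_step < threshold else weight

def hinge_d_loss(logits_real, logits_fake):
    # same formula as the helper above
    return 0.5 * (torch.mean(F.relu(1. - logits_real)) + torch.mean(F.relu(1. + logits_fake)))

real = torch.full((4, 1, 5, 5), 2.0)    # confidently "real" patch logits
fake = torch.full((4, 1, 5, 5), -2.0)   # confidently "fake" patch logits
print(hinge_d_loss(real, fake))                              # tensor(0.) - both margins satisfied
print(adopt_weight(1.0, global_step=500, threshold=1000))    # 0.0 - adversarial term still off
print(adopt_weight(1.0, global_step=1500, threshold=1000))   # 1.0 - adversarial term active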
pixel_generator/vec2face/taming/modules/util.py
ADDED
@@ -0,0 +1,130 @@
+import torch
+import torch.nn as nn
+
+
+def count_params(model):
+    total_params = sum(p.numel() for p in model.parameters())
+    return total_params
+
+
+class ActNorm(nn.Module):
+    def __init__(self, num_features, logdet=False, affine=True,
+                 allow_reverse_init=False):
+        assert affine
+        super().__init__()
+        self.logdet = logdet
+        self.loc = nn.Parameter(torch.zeros(1, num_features, 1, 1))
+        self.scale = nn.Parameter(torch.ones(1, num_features, 1, 1))
+        self.allow_reverse_init = allow_reverse_init
+
+        self.register_buffer('initialized', torch.tensor(0, dtype=torch.uint8))
+
+    def initialize(self, input):
+        with torch.no_grad():
+            flatten = input.permute(1, 0, 2, 3).contiguous().view(input.shape[1], -1)
+            mean = (
+                flatten.mean(1)
+                .unsqueeze(1)
+                .unsqueeze(2)
+                .unsqueeze(3)
+                .permute(1, 0, 2, 3)
+            )
+            std = (
+                flatten.std(1)
+                .unsqueeze(1)
+                .unsqueeze(2)
+                .unsqueeze(3)
+                .permute(1, 0, 2, 3)
+            )
+
+            self.loc.data.copy_(-mean)
+            self.scale.data.copy_(1 / (std + 1e-6))
+
+    def forward(self, input, reverse=False):
+        if reverse:
+            return self.reverse(input)
+        if len(input.shape) == 2:
+            input = input[:,:,None,None]
+            squeeze = True
+        else:
+            squeeze = False
+
+        _, _, height, width = input.shape
+
+        if self.training and self.initialized.item() == 0:
+            self.initialize(input)
+            self.initialized.fill_(1)
+
+        h = self.scale * (input + self.loc)
+
+        if squeeze:
+            h = h.squeeze(-1).squeeze(-1)
+
+        if self.logdet:
+            log_abs = torch.log(torch.abs(self.scale))
+            logdet = height*width*torch.sum(log_abs)
+            logdet = logdet * torch.ones(input.shape[0]).to(input)
+            return h, logdet
+
+        return h
+
+    def reverse(self, output):
+        if self.training and self.initialized.item() == 0:
+            if not self.allow_reverse_init:
+                raise RuntimeError(
+                    "Initializing ActNorm in reverse direction is "
+                    "disabled by default. Use allow_reverse_init=True to enable."
+                )
+            else:
+                self.initialize(output)
+                self.initialized.fill_(1)
+
+        if len(output.shape) == 2:
+            output = output[:,:,None,None]
+            squeeze = True
+        else:
+            squeeze = False
+
+        h = output / self.scale - self.loc
+
+        if squeeze:
+            h = h.squeeze(-1).squeeze(-1)
+        return h
+
+
+class AbstractEncoder(nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def encode(self, *args, **kwargs):
+        raise NotImplementedError
+
+
+class Labelator(AbstractEncoder):
+    """Net2Net Interface for Class-Conditional Model"""
+    def __init__(self, n_classes, quantize_interface=True):
+        super().__init__()
+        self.n_classes = n_classes
+        self.quantize_interface = quantize_interface
+
+    def encode(self, c):
+        c = c[:,None]
+        if self.quantize_interface:
+            return c, None, [None, None, c.long()]
+        return c
+
+
+class SOSProvider(AbstractEncoder):
+    # for unconditional training
+    def __init__(self, sos_token, quantize_interface=True):
+        super().__init__()
+        self.sos_token = sos_token
+        self.quantize_interface = quantize_interface
+
+    def encode(self, x):
+        # get batch size from data and replicate sos_token
+        c = torch.ones(x.shape[0], 1)*self.sos_token
+        c = c.long().to(x.device)
+        if self.quantize_interface:
+            return c, None, [None, None, c]
+        return c
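A brief usage sketch of the ActNorm module defined above (tensor shapes are illustrative, and the import path assumes the package layout of this commit): the first forward pass in training mode performs data-dependent initialization so each channel comes out roughly zero-mean and unit-variance, and reverse=True inverts the affine map.

import torch
from pixel_generator.vec2face.taming.modules.util import ActNorm  # path assumed from this commit

norm = ActNorm(num_features=64)
x = torch.randn(8, 64, 16, 16) * 3.0 + 1.0     # deliberately shifted/scaled activations
norm.train()
y = norm(x)                                    # first call initializes loc/scale from this batch
print(y.mean().item(), y.std().item())         # roughly 0 and 1 after initialization
x_rec = norm(y, reverse=True)                  # undo the affine transform
print(torch.allclose(x, x_rec, atol=1e-4))     # True up to numerical precision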
requirements.txt
ADDED
@@ -0,0 +1,10 @@
+torch==1.12.0
+numpy==1.24.3
+torchvision==0.13.0
+imageio==2.9.0
+omegaconf==2.1.1
+scipy==1.10.1
+sixdrepnet==0.1.6
+timm==0.9.16
+gradio==4.42.0
+huggingface-hub==0.24.6
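These pins cover the CPU demo in app.py and can be installed with pip install -r requirements.txt; note that lpips and pytorch_msssim, which discriminator_loss.py imports, are not pinned here, presumably because the Gradio demo does not exercise the training loss.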