|
"""
|
|
This module implements the FaceLocator class, which is a neural network model designed to
|
|
locate and extract facial features from input images or tensors. It uses a series of
|
|
convolutional layers to progressively downsample and refine the facial feature map.
|
|
|
|
The FaceLocator class is part of a larger system that may involve facial recognition or
|
|
similar tasks where precise location and extraction of facial features are required.
|
|
|
|
Attributes:
|
|
conditioning_embedding_channels (int): The number of channels in the output embedding.
|
|
conditioning_channels (int): The number of input channels for the conditioning tensor.
|
|
block_out_channels (Tuple[int]): A tuple of integers representing the output channels
|
|
for each block in the model.
|
|
|
|
The model uses the following components:
|
|
- InflatedConv3d: A convolutional layer that inflates the input to increase the depth.
|
|
- zero_module: A utility function that may set certain parameters to zero for regularization
|
|
or other purposes.
|
|
|
|
The forward method of the FaceLocator class takes a conditioning tensor as input and
|
|
produces an embedding tensor as output, which can be used for further processing or analysis.
|
|
"""
|
|
|
|
from typing import Tuple
|
|
|
|
import torch.nn.functional as F
|
|
from diffusers.models.modeling_utils import ModelMixin
|
|
from torch import nn
|
|
|
|
from .motion_module import zero_module
|
|
from .resnet import InflatedConv3d
|
|
|
|
|
|
class FaceLocator(ModelMixin):
|
|
"""
|
|
The FaceLocator class is a neural network model designed to process and extract facial
|
|
features from an input tensor. It consists of a series of convolutional layers that
|
|
progressively downsample the input while increasing the depth of the feature map.
|
|
|
|
The model is built using InflatedConv3d layers, which are designed to inflate the
|
|
feature channels, allowing for more complex feature extraction. The final output is a
|
|
conditioning embedding that can be used for various tasks such as facial recognition or
|
|
feature-based image manipulation.
|
|
|
|
Parameters:
|
|
conditioning_embedding_channels (int): The number of channels in the output embedding.
|
|
conditioning_channels (int, optional): The number of input channels for the conditioning tensor. Default is 3.
|
|
block_out_channels (Tuple[int], optional): A tuple of integers representing the output channels
|
|
for each block in the model. The default is (16, 32, 64, 128), which defines the
|
|
progression of the network's depth.
|
|
|
|
Attributes:
|
|
conv_in (InflatedConv3d): The initial convolutional layer that starts the feature extraction process.
|
|
blocks (ModuleList[InflatedConv3d]): A list of convolutional layers that form the core of the model.
|
|
conv_out (InflatedConv3d): The final convolutional layer that produces the output embedding.
|
|
|
|
The forward method applies the convolutional layers to the input conditioning tensor and
|
|
returns the resulting embedding tensor.
|
|
"""
|
|
def __init__(
|
|
self,
|
|
conditioning_embedding_channels: int,
|
|
conditioning_channels: int = 3,
|
|
block_out_channels: Tuple[int] = (16, 32, 64, 128),
|
|
):
|
|
super().__init__()
|
|
self.conv_in = InflatedConv3d(
|
|
conditioning_channels, block_out_channels[0], kernel_size=3, padding=1
|
|
)
|
|
|
|
self.blocks = nn.ModuleList([])
|
|
|
|
for i in range(len(block_out_channels) - 1):
|
|
channel_in = block_out_channels[i]
|
|
channel_out = block_out_channels[i + 1]
|
|
self.blocks.append(
|
|
InflatedConv3d(channel_in, channel_in, kernel_size=3, padding=1)
|
|
)
|
|
self.blocks.append(
|
|
InflatedConv3d(
|
|
channel_in, channel_out, kernel_size=3, padding=1, stride=2
|
|
)
|
|
)
|
|
|
|
self.conv_out = zero_module(
|
|
InflatedConv3d(
|
|
block_out_channels[-1],
|
|
conditioning_embedding_channels,
|
|
kernel_size=3,
|
|
padding=1,
|
|
)
|
|
)
|
|
|
|
def forward(self, conditioning):
|
|
"""
|
|
Forward pass of the FaceLocator model.
|
|
|
|
Args:
|
|
conditioning (Tensor): The input conditioning tensor.
|
|
|
|
Returns:
|
|
Tensor: The output embedding tensor.
|
|
"""
|
|
embedding = self.conv_in(conditioning)
|
|
embedding = F.silu(embedding)
|
|
|
|
for block in self.blocks:
|
|
embedding = block(embedding)
|
|
embedding = F.silu(embedding)
|
|
|
|
embedding = self.conv_out(embedding)
|
|
|
|
return embedding
|
|
|