# coding=utf-8
# Copyright 2023 Xiaomi Corporation and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" CED model configuration"""
from transformers import PretrainedConfig
from transformers.utils import logging
from transformers.utils.hub import cached_file
logger = logging.get_logger(__name__)
CED_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"mispeech/ced-tiny": "https://huggingface.co/mispeech/ced-tiny/resolve/main/config.json",
}
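
# Usage sketch (an assumption for illustration, not part of this module's API): since
# `CedConfig` subclasses `PretrainedConfig`, a hosted configuration can be fetched with
# the inherited `from_pretrained` helper, e.g.
#
#     config = CedConfig.from_pretrained("mispeech/ced-tiny")
#
# which downloads the same `config.json` that `CED_PRETRAINED_CONFIG_ARCHIVE_MAP` points to.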
class CedConfig(PretrainedConfig):
    r"""
    Configuration class for the CED model.

    Args:
        name (`str`, *optional*):
            Name of a pre-defined configuration. Can be `"ced-tiny"`, `"ced-mini"`, `"ced-small"` or `"ced-base"`.
        attn_drop_rate (`float`, *optional*, defaults to 0.0):
            Dropout probability for the attention weights.
        depth (`int`, *optional*, defaults to 12):
            Number of transformer layers.
        drop_path_rate (`float`, *optional*, defaults to 0.0):
            Drop path (stochastic depth) rate, as implemented in timm.
        drop_rate (`float`, *optional*, defaults to 0.0):
            Dropout probability for the input embeddings.
        embed_dim (`int`, *optional*, defaults to 768):
            Dimensionality of the audio patch embeddings.
        eval_avg (`str`, *optional*, defaults to `"mean"`):
            Type of pooling to use at evaluation time. Can be `"mean"`, `"token"`, `"dm"` or `"logit"`.
        mlp_ratio (`float`, *optional*, defaults to 4.0):
            Ratio of the feedforward hidden size to the embedding size.
        num_heads (`int`, *optional*, defaults to 12):
            Number of attention heads.
        outputdim (`int`, *optional*, defaults to 527):
            Dimensionality of the output, i.e. the number of classes.
        patch_size (`int`, *optional*, defaults to 16):
            Size of the patches.
        patch_stride (`int`, *optional*, defaults to 16):
            Stride of the patches.
        pooling (`str`, *optional*, defaults to `"mean"`):
            Type of pooling to use for the output. Can be `"mean"`, `"token"`, `"dm"` or `"logit"`.
        qkv_bias (`bool`, *optional*, defaults to `True`):
            Whether to include bias terms in the query, key and value projections.
        target_length (`int`, *optional*, defaults to 1012):
            Number of frames in an audio chunk.

    Mel-spectrogram frontend parameters (`n_mels`, `n_fft`, `hop_size`, `win_size`, `f_min`, `f_max`,
    `center` and `pad_last`) can additionally be passed through `kwargs`; see `__init__` for their defaults.
    """

    model_type = "ced"
def __init__(
self,
name=None,
attn_drop_rate=0.0,
depth=12,
drop_path_rate=0.0,
drop_rate=0.0,
embed_dim=768,
eval_avg="mean",
mlp_ratio=4.0,
num_heads=12,
outputdim=527,
patch_size=16,
patch_stride=16,
pooling="mean",
qkv_bias=True,
target_length=1012,
**kwargs,
):
r"""
TODO: Add docstring
"""
super().__init__(**kwargs)
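        # Known presets fix the transformer width and head count; all other
        # hyperparameters keep the values passed in (or their defaults).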
if name == "ced-tiny":
embed_dim = 192
num_heads = 3
elif name == "ced-mini":
embed_dim = 256
num_heads = 4
elif name == "ced-small":
embed_dim = 384
num_heads = 6
elif name == "ced-base":
embed_dim = 768
num_heads = 12
else:
logger.info("No model name specified for CedConfig, use default settings.")
        if pooling not in ("mean", "token", "dm", "logit"):
            raise ValueError(f"Unsupported pooling type: {pooling!r}. Expected 'mean', 'token', 'dm' or 'logit'.")
self.name = name
self.attn_drop_rate = attn_drop_rate
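        # The attributes read via `kwargs.get` below configure the mel-spectrogram
        # frontend (STFT/mel filterbank and padding); override them by passing the
        # corresponding keyword arguments.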
self.center = kwargs.get("center", True)
self.depth = depth
self.drop_path_rate = drop_path_rate
self.drop_rate = drop_rate
self.embed_dim = embed_dim
self.eval_avg = eval_avg
self.f_max = kwargs.get("f_max", 8000)
self.f_min = kwargs.get("f_min", 0)
self.hop_size = kwargs.get("hop_size", 160)
self.mlp_ratio = mlp_ratio
self.n_fft = kwargs.get("n_fft", 512)
self.n_mels = kwargs.get("n_mels", 64)
self.n_mels = kwargs.get("n_mels", 64)
self.num_heads = num_heads
self.outputdim = outputdim
self.pad_last = kwargs.get("pad_last", True)
self.patch_size = patch_size
self.patch_stride = patch_stride
self.pooling = pooling
self.qkv_bias = qkv_bias
self.target_length = target_length
self.win_size = kwargs.get("win_size", 512)
self.loss = "BCE"
if self.outputdim == 527:
with open(cached_file("topel/ConvNeXt-Tiny-AT", "class_labels_indices.csv"), "r") as f:
self.id2label = {
int(line.split(",", maxsplit=3)[0]): line.split(",", maxsplit=3)[2].replace('"', "").strip("\n")
for line in f.readlines()[1:]
}
self.label2id = {v: k for k, v in self.id2label.items()}
else:
self.id2label = None
self.label2id = None
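

if __name__ == "__main__":
    # Minimal self-check sketch (an assumption for illustration; not part of the library API).
    # It shows how the `name` presets override `embed_dim`/`num_heads`, and that the config
    # round-trips through the dict form inherited from `PretrainedConfig`.
    for preset in ("ced-tiny", "ced-mini", "ced-small", "ced-base"):
        # outputdim != 527 skips the AudioSet label-map download, so this runs offline.
        config = CedConfig(name=preset, outputdim=10)
        print(f"{preset}: embed_dim={config.embed_dim}, num_heads={config.num_heads}")

    config = CedConfig(name="ced-mini", outputdim=10)
    restored = CedConfig.from_dict(config.to_dict())
    assert restored.embed_dim == 256 and restored.num_heads == 4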