Update 3 files
- ced_model/configuration_ced.py +3 -6
- ced_model/feature_extraction_ced.py +50 -8
- ced_model/modeling_ced.py +28 -45
ced_model/configuration_ced.py
CHANGED

```diff
@@ -123,15 +123,12 @@ class CedConfig(PretrainedConfig):
         self.qkv_bias = qkv_bias
         self.target_length = target_length
         self.win_size = kwargs.get("win_size", 512)
+        self.loss = "BCE"
 
         if self.outputdim == 527:
-            with open(
-                cached_file("topel/ConvNeXt-Tiny-AT", "class_labels_indices.csv"), "r"
-            ) as f:
+            with open(cached_file("topel/ConvNeXt-Tiny-AT", "class_labels_indices.csv"), "r") as f:
                 self.id2label = {
-                    int(line.split(",", maxsplit=3)[0]): line.split(",", maxsplit=3)[2]
-                    .replace('"', "")
-                    .strip("\n")
+                    int(line.split(",", maxsplit=3)[0]): line.split(",", maxsplit=3)[2].replace('"', "").strip("\n")
                     for line in f.readlines()[1:]
                 }
         self.label2id = {v: k for k, v in self.id2label.items()}
```
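The new `loss` attribute is hard-coded to `"BCE"` in `__init__` rather than exposed as a constructor argument, so selecting cross-entropy means overwriting it after construction. A minimal sketch, assuming the repo is importable as a `ced_model` package and that the default constructor arguments suffice (note the default `outputdim` may trigger the AudioSet label-file download above):

```python
from ced_model.configuration_ced import CedConfig

config = CedConfig()  # __init__ now sets config.loss = "BCE"
print(config.loss)    # -> "BCE"

# Overwrite for single-label classification; CedForAudioClassification.forward
# reads this attribute when computing the training loss.
config.loss = "CE"
```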
ced_model/feature_extraction_ced.py
CHANGED

```diff
@@ -16,7 +16,7 @@
 Feature extractor class for CED.
 """
 
-from typing import Optional, Union
+from typing import List, Optional, Union
 
 import numpy as np
 import torch
@@ -77,10 +77,14 @@ class CedFeatureExtractor(SequenceFeatureExtractor):
         self.f_max = f_max
         self.hop_size = hop_size
 
+        self.model_input_names = ["input_values"]
+
     def __call__(
         self,
-        x: Union[np.ndarray, torch.Tensor],
+        x: Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]],
         sampling_rate: Optional[int] = None,
+        max_length: Optional[int] = 16000,
+        truncation: bool = True,
         return_tensors="pt",
     ) -> BatchFeature:
         r"""
@@ -88,6 +92,14 @@ class CedFeatureExtractor(SequenceFeatureExtractor):
 
         Args:
             x: Input audio signal tensor.
+            sampling_rate (int, *optional*, defaults to `None`):
+                Sampling rate of the input audio signal.
+            max_length (int, *optional*, defaults to 16000):
+                Maximum length of the input audio signal.
+            truncation (bool, *optional*, defaults to `True`):
+                Whether to truncate the input signal to max_length.
+            return_tensors (str, *optional*, defaults to "pt"):
+                If set to "pt", the return type will be a PyTorch tensor.
 
         Returns:
             BatchFeature: A dictionary containing the extracted features.
@@ -96,9 +108,7 @@ class CedFeatureExtractor(SequenceFeatureExtractor):
             sampling_rate = self.sampling_rate
 
         if return_tensors != "pt":
-            raise NotImplementedError(
-                "Only return_tensors='pt' is currently supported."
-            )
+            raise NotImplementedError("Only return_tensors='pt' is currently supported.")
 
         mel_spectrogram = audio_transforms.MelSpectrogram(
             f_min=self.f_min,
@@ -112,10 +122,42 @@ class CedFeatureExtractor(SequenceFeatureExtractor):
         )
         amplitude_to_db = audio_transforms.AmplitudeToDB(top_db=120)
 
-
-
-
+        if isinstance(x, np.ndarray):
+            if x.ndim == 1:
+                x = x[np.newaxis, :]
+            if x.ndim != 2:
+                raise ValueError("np.ndarray input must be a 1D or 2D.")
+            x = torch.from_numpy(x)
+        elif isinstance(x, torch.Tensor):
+            if x.dim() == 1:
+                x = x.unsqueeze(0)
+            if x.dim() != 2:
+                raise ValueError("torch.Tensor input must be a 1D or 2D.")
+        elif isinstance(x, (list, tuple)):
+            longest_length = max(x_.shape[0] for x_ in x)
+            if not truncation and max_length < longest_length:
+                max_length = longest_length
+
+            if all(isinstance(x_, np.ndarray) for x_ in x):
+                if not all(x_.ndim == 1 for x_ in x):
+                    raise ValueError("All np.ndarray in a list must be 1D.")
+
+                x_trim = [x_[:max_length] for x_ in x]
+                x_pad = [np.pad(x_, (0, max_length - x_.shape[0]), mode="constant", constant_values=0) for x_ in x_trim]
+                x = torch.stack([torch.from_numpy(x_) for x_ in x_pad])
+            elif all(isinstance(x_, torch.Tensor) for x_ in x):
+                if not all(x_.dim() == 1 for x_ in x):
+                    raise ValueError("All torch.Tensor in a list must be 1D.")
+                x_pad = [torch.nn.functional.pad(x_, (0, max_length - x_.shape[0]), value=0) for x_ in x]
+                x = torch.stack(x_pad)
+            else:
+                raise ValueError("Input list must be numpy arrays or PyTorch tensors.")
+        else:
+            raise ValueError(
+                "Input must be a numpy array, a list of numpy arrays, a PyTorch tensor, or a list of PyTorch tensor."
+            )
 
+        x = x.float()
         x = mel_spectrogram(x)
         x = amplitude_to_db(x)
         return BatchFeature({"input_values": x})
```
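With the new list handling, variable-length clips can be batched directly: each clip is truncated to `max_length` samples (or, when `truncation=False`, `max_length` grows to the longest clip), zero-padded to a common length, stacked, and converted to a batch of log-mel spectrograms. A minimal usage sketch; the checkpoint id `mispeech/ced-tiny` is illustrative:

```python
import numpy as np
from ced_model.feature_extraction_ced import CedFeatureExtractor

# Illustrative checkpoint id; substitute any CED checkpoint.
extractor = CedFeatureExtractor.from_pretrained("mispeech/ced-tiny")

# Two mono clips of different lengths at 16 kHz.
clips = [
    np.random.randn(12000).astype(np.float32),
    np.random.randn(20000).astype(np.float32),
]

# Clips are trimmed/padded to max_length samples and stacked into one batch.
batch = extractor(clips, sampling_rate=16000, max_length=16000, truncation=True)
print(batch["input_values"].shape)  # (2, n_mels, frames)
```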
ced_model/modeling_ced.py
CHANGED

```diff
@@ -106,9 +106,7 @@ class CedAudioPatchEmbed(nn.Module):
         self.num_patches = self.grid_size[0] * self.grid_size[1]
         self.flatten = flatten
 
-        self.proj = nn.Conv2d(
-            in_chans, embed_dim, kernel_size=patch_size, stride=patch_stride
-        )
+        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_stride)
         self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
 
     def forward(self, x):
@@ -143,11 +141,7 @@ class CedAttention(nn.Module):
 
     def forward(self, x):
         B, N, C = x.shape
-        qkv = (
-            self.qkv(x)
-            .reshape(B, N, 3, self.num_heads, C // self.num_heads)
-            .permute(2, 0, 3, 1, 4)
-        )
+        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
         q, k, v = qkv.unbind(0)  # make torchscript happy (cannot use tensor as tuple)
 
         attn = (q @ k.transpose(-2, -1)) * self.scale
@@ -221,9 +215,7 @@ class DropPath(nn.Module):
         return f"drop_prob={round(self.drop_prob,3):0.3f}"
 
 
-def drop_path(
-    x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True
-):
+def drop_path(x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True):
     """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
 
     This is the same as the DropConnect impl I (https://github.com/rwightman) created for EfficientNet, etc networks,
@@ -236,9 +228,7 @@ def drop_path(
     if drop_prob == 0.0 or not training:
         return x
     keep_prob = 1 - drop_prob
-    shape = (x.shape[0],) + (1,) * (
-        x.ndim - 1
-    )  # work with diff dim tensors, not just 2D ConvNets
+    shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
    if keep_prob > 0.0 and scale_by_keep:
        random_tensor.div_(keep_prob)
@@ -373,17 +363,11 @@ class CedModel(CedPreTrainedModel):
             patch_stride=config.patch_stride,
         )
 
-        self.time_pos_embed = nn.Parameter(
-            torch.randn(1, config.embed_dim, 1, self.patch_embed.grid_size[1]) * 0.02
-        )
-        self.freq_pos_embed = nn.Parameter(
-            torch.randn(1, config.embed_dim, self.patch_embed.grid_size[0], 1) * 0.02
-        )
+        self.time_pos_embed = nn.Parameter(torch.randn(1, config.embed_dim, 1, self.patch_embed.grid_size[1]) * 0.02)
+        self.freq_pos_embed = nn.Parameter(torch.randn(1, config.embed_dim, self.patch_embed.grid_size[0], 1) * 0.02)
         norm_layer = partial(nn.LayerNorm, eps=1e-6)
         act_layer = nn.GELU
-        dpr = [
-            x.item() for x in torch.linspace(0, config.drop_path_rate, config.depth)
-        ]  # stochastic depth decay rule
+        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.depth)]  # stochastic depth decay rule
         self.pos_drop = nn.Dropout(p=config.drop_rate)
         self.blocks = nn.Sequential(
             *[
@@ -407,13 +391,16 @@ class CedModel(CedPreTrainedModel):
         # Initialize weights and apply final processing
         self.post_init()
 
+    def _freeze_parameters(self):
+        for param in self.parameters():
+            param.requires_grad = False
+        self._requires_grad = False
+
     def forward_features(self, x: torch.Tensor) -> torch.Tensor:
         x = self.patch_embed(x)
         _, _, _, t = x.shape
         x = x + self.time_pos_embed[:, :, :, :t]
-        x = (
-            x + self.freq_pos_embed[:, :, :, :]
-        )  # Just to support __getitem__ in posembed
+        x = x + self.freq_pos_embed[:, :, :, :]  # Just to support __getitem__ in posembed
 
         # x = rearrange(x, 'b c f t -> b (f t) c')
         x = torch.permute(torch.flatten(x, 2, 3), (0, 2, 1))
@@ -442,9 +429,7 @@ class CedModel(CedPreTrainedModel):
 
         if splits[-1].shape[-1] < self.maximal_allowed_length:
             if self.config.pad_last:
-                pad = torch.zeros(
-                    *x.shape[:-1], self.maximal_allowed_length, device=x.device
-                )
+                pad = torch.zeros(*x.shape[:-1], self.maximal_allowed_length, device=x.device)
                 pad[..., : splits[-1].shape[-1]] = splits[-1]
                 splits = torch.stack((*splits[:-1], pad), dim=0)
             else:
@@ -497,9 +482,7 @@ class CedForAudioClassification(CedPreTrainedModel):
         elif self.config.pooling == "dm":
             # Unpack using the frequency dimension, which is constant
             # 'b (f t) d -> b f t d', f=self.patch_embed.grid_size[0])
-            x = torch.reshape(
-                x, (x.shape[0], self.patch_embed.grid_size[0], -1, x.shape[3])
-            )
+            x = torch.reshape(x, (x.shape[0], self.patch_embed.grid_size[0], -1, x.shape[3]))
 
             # First poolin frequency, then sigmoid the (B T D) output
             x = self.outputlayer(x.mean(1)).sigmoid()
@@ -507,9 +490,10 @@ class CedForAudioClassification(CedPreTrainedModel):
         else:
             return x.mean(1)
 
-    @add_start_docstrings_to_model_forward(
-        CED_INPUTS_DOCSTRING.format("batch_size, sequence_length")
-    )
+    def freeze_encoder(self):
+        self.encoder._freeze_parameters()
+
+    @add_start_docstrings_to_model_forward(CED_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         checkpoint=_SEQ_CLASS_CHECKPOINT,
         output_type=SequenceClassifierOutput,
@@ -519,9 +503,7 @@ class CedForAudioClassification(CedPreTrainedModel):
         expected_output=_SEQ_CLASS_EXPECTED_OUTPUT,
         expected_loss=_SEQ_CLASS_EXPECTED_LOSS,
     )
-    def forward(
-        self, input_values: torch.Tensor, labels: Optional[torch.Tensor] = None
-    ):
+    def forward(self, input_values: torch.Tensor, labels: Optional[torch.Tensor] = None):
         """
         Runs a forward pass of the CED model for audio classification task.
 
@@ -554,14 +536,15 @@ class CedForAudioClassification(CedPreTrainedModel):
         logits = self.forward_head(last_hidden_states)
 
         if labels is not None:
-            loss_fct = nn.BCEWithLogitsLoss()
-            labels = nn.functional.one_hot(
-                labels, num_classes=self.config.outputdim
-            ).float()
+            if self.config.loss == "CE":
+                loss_fct = nn.CrossEntropyLoss()
+            elif self.config.loss == "BCE":
+                loss_fct = nn.BCEWithLogitsLoss()
+            else:
+                raise NotImplementedError("Need to set 'CE' or 'BCE' as config.loss.")
+            labels = nn.functional.one_hot(labels, num_classes=self.config.outputdim).float()
             loss = loss_fct(logits, labels)
         else:
             loss = None
 
-        return SequenceClassifierOutput(
-            logits=logits, loss=loss, hidden_states=last_hidden_states
-        )
+        return SequenceClassifierOutput(logits=logits, loss=loss, hidden_states=last_hidden_states)
```
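The labels branch now selects the criterion from `config.loss` and one-hot encodes integer class indices before either loss (PyTorch's `CrossEntropyLoss` accepts class-probability targets since 1.10). A standalone sketch of the same logic, with an illustrative batch of 4 and the 527-class AudioSet output dimension:

```python
import torch
import torch.nn as nn

logits = torch.randn(4, 527)          # (batch, config.outputdim)
labels = torch.randint(0, 527, (4,))  # integer class indices

loss_name = "BCE"                     # mirrors config.loss; "CE" picks CrossEntropyLoss
loss_fct = nn.BCEWithLogitsLoss() if loss_name == "BCE" else nn.CrossEntropyLoss()

# One-hot encode before the loss, as forward() now does for both settings.
targets = nn.functional.one_hot(labels, num_classes=527).float()
loss = loss_fct(logits, targets)
```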
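`freeze_encoder` delegates to the new `CedModel._freeze_parameters`, so only the classification head stays trainable, a common setup for lightweight fine-tuning. A minimal sketch; the checkpoint id is again illustrative:

```python
from ced_model.modeling_ced import CedForAudioClassification

model = CedForAudioClassification.from_pretrained("mispeech/ced-tiny")
model.freeze_encoder()  # sets requires_grad = False on every encoder parameter

# Only head parameters remain trainable.
trainable = [name for name, p in model.named_parameters() if p.requires_grad]
print(trainable)
```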