Spaces:

ASLP-lab
/

OSUM

Running on Zero

OSUM / wenet /e_branchformer /encoder.py

tomxxie

适配zeroGPU

568e264 12 days ago

6.36 kB

	# Copyright (c) 2022 Yifan Peng (Carnegie Mellon University)
	# 2023 Voicecomm Inc (Kai Li)
	# 2023 Lucky Wong
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	# Modified from ESPnet(https://github.com/espnet/espnet)
	"""Encoder definition."""

	import torch
	from typing import List, Optional, Union
	from wenet.branchformer.encoder import LayerDropModuleList

	from wenet.e_branchformer.encoder_layer import EBranchformerEncoderLayer
	from wenet.branchformer.cgmlp import ConvolutionalGatingMLP
	from wenet.transformer.encoder import ConformerEncoder
	from wenet.utils.class_utils import (
	WENET_ACTIVATION_CLASSES,
	WENET_ATTENTION_CLASSES,
	WENET_MLP_CLASSES,
	)


	class EBranchformerEncoder(ConformerEncoder):
	    """E-Branchformer encoder module.

	    Subclasses ``ConformerEncoder``: the parent constructor is invoked first
	    with the shared hyper-parameters, and ``self.encoders`` is then set to a
	    ``LayerDropModuleList`` holding ``num_blocks``
	    ``EBranchformerEncoderLayer`` instances.
	    """

	    def __init__(
	        self,
	        input_size: int,
	        output_size: int = 256,
	        attention_heads: int = 4,
	        linear_units: int = 2048,
	        selfattention_layer_type: str = "rel_selfattn",
	        pos_enc_layer_type: str = "rel_pos",
	        activation_type: str = "swish",
	        cgmlp_linear_units: int = 2048,
	        cgmlp_conv_kernel: int = 31,
	        use_linear_after_conv: bool = False,
	        gate_activation: str = "identity",
	        num_blocks: int = 12,
	        dropout_rate: float = 0.1,
	        positional_dropout_rate: float = 0.1,
	        attention_dropout_rate: float = 0.0,
	        input_layer: str = "conv2d",
	        stochastic_depth_rate: Union[float, List[float]] = 0.0,
	        static_chunk_size: int = 0,
	        use_dynamic_chunk: bool = False,
	        global_cmvn: Optional[torch.nn.Module] = None,
	        use_dynamic_left_chunk: bool = False,
	        causal: bool = False,
	        merge_conv_kernel: int = 3,
	        use_ffn: bool = True,
	        macaron_style: bool = True,
	        query_bias: bool = True,
	        key_bias: bool = True,
	        value_bias: bool = True,
	        conv_bias: bool = True,
	        gradient_checkpointing: bool = False,
	        use_sdpa: bool = False,
	        layer_norm_type: str = 'layer_norm',
	        norm_eps: float = 1e-5,
	        n_kv_head: Optional[int] = None,
	        head_dim: Optional[int] = None,
	        mlp_type: str = 'position_wise_feed_forward',
	        mlp_bias: bool = True,
	        n_expert: int = 8,
	        n_expert_activated: int = 2,
	    ):
	        """Build the encoder and its stack of E-Branchformer layers.

	        Args:
	            input_size, output_size, attention_heads, linear_units,
	            num_blocks, dropout_rate, positional_dropout_rate,
	            attention_dropout_rate, input_layer, pos_enc_layer_type,
	            static_chunk_size, use_dynamic_chunk, global_cmvn,
	            use_dynamic_left_chunk, macaron_style,
	            selfattention_layer_type, activation_type, query_bias,
	            key_bias, value_bias, conv_bias, gradient_checkpointing,
	            use_sdpa, layer_norm_type, norm_eps, n_kv_head, head_dim,
	            mlp_type, mlp_bias, n_expert, n_expert_activated:
	                Forwarded to ``ConformerEncoder.__init__``. Several of
	                them are reused below to build the per-layer modules.
	            selfattention_layer_type: Key into ``WENET_ATTENTION_CLASSES``
	                selecting the attention class built for each layer.
	            activation_type: Key into ``WENET_ACTIVATION_CLASSES``
	                selecting the feed-forward activation.
	            mlp_type: Key into ``WENET_MLP_CLASSES`` selecting the
	                feed-forward class.
	            cgmlp_linear_units, cgmlp_conv_kernel, use_linear_after_conv,
	            gate_activation: Passed, together with ``output_size``,
	                ``dropout_rate`` and ``causal``, to
	                ``ConvolutionalGatingMLP``.
	            stochastic_depth_rate: Either one number applied to every
	                block, or a sequence with exactly ``num_blocks`` entries.
	            causal: Passed to ``ConvolutionalGatingMLP`` and to every
	                ``EBranchformerEncoderLayer``.
	            merge_conv_kernel: Passed to every
	                ``EBranchformerEncoderLayer``.
	            use_ffn: When false, both feed-forward arguments of every
	                layer are ``None``.
	            macaron_style: When true (and ``use_ffn`` is true) a second
	                feed-forward module is built for every layer; otherwise
	                that argument is ``None``.

	        Raises:
	            ValueError: If ``stochastic_depth_rate`` is a sequence whose
	                length differs from ``num_blocks``.
	            KeyError: If ``mlp_type``, ``activation_type`` or
	                ``selfattention_layer_type`` is not a key of the
	                corresponding registry dict.
	        """
	        super().__init__(input_size,
	                         output_size,
	                         attention_heads,
	                         linear_units,
	                         num_blocks,
	                         dropout_rate,
	                         positional_dropout_rate,
	                         attention_dropout_rate,
	                         input_layer,
	                         pos_enc_layer_type,
	                         # NOTE(review): presumably `normalize_before`;
	                         # confirm against the ConformerEncoder signature.
	                         True,
	                         static_chunk_size,
	                         use_dynamic_chunk,
	                         global_cmvn,
	                         use_dynamic_left_chunk,
	                         # NOTE(review): presumably
	                         # `positionwise_conv_kernel_size`; confirm against
	                         # the ConformerEncoder signature.
	                         1,
	                         macaron_style,
	                         selfattention_layer_type,
	                         activation_type,
	                         query_bias=query_bias,
	                         key_bias=key_bias,
	                         value_bias=value_bias,
	                         conv_bias=conv_bias,
	                         gradient_checkpointing=gradient_checkpointing,
	                         use_sdpa=use_sdpa,
	                         layer_norm_type=layer_norm_type,
	                         norm_eps=norm_eps,
	                         n_kv_head=n_kv_head,
	                         head_dim=head_dim,
	                         mlp_type=mlp_type,
	                         mlp_bias=mlp_bias,
	                         n_expert=n_expert,
	                         n_expert_activated=n_expert_activated)

	        # Positional arguments for the self-attention class of each layer.
	        encoder_selfattn_layer_args = (
	            attention_heads,
	            output_size,
	            attention_dropout_rate,
	            query_bias,
	            key_bias,
	            value_bias,
	            use_sdpa,
	            n_kv_head,
	            head_dim,
	        )

	        # Convolutional gating MLP module definition.
	        cgmlp_layer = ConvolutionalGatingMLP
	        cgmlp_layer_args = (output_size, cgmlp_linear_units, cgmlp_conv_kernel,
	                            dropout_rate, use_linear_after_conv,
	                            gate_activation, causal)

	        # feed-forward module definition
	        mlp_class = WENET_MLP_CLASSES[mlp_type]
	        # A single activation instance is created here and handed to every
	        # feed-forward module built below.
	        activation = WENET_ACTIVATION_CLASSES[activation_type]()
	        positionwise_layer_args = (
	            output_size,
	            linear_units,
	            dropout_rate,
	            activation,
	            mlp_bias,
	            n_expert,
	            n_expert_activated,
	        )

	        # Broadcast a scalar rate to one entry per block. Integers are
	        # accepted as well as floats: a config value such as `0` arrives as
	        # an int and would otherwise reach len() below and raise TypeError.
	        if isinstance(stochastic_depth_rate, (int, float)):
	            stochastic_depth_rate = [float(stochastic_depth_rate)
	                                     ] * num_blocks
	        if len(stochastic_depth_rate) != num_blocks:
	            raise ValueError(
	                f"Length of stochastic_depth_rate ({len(stochastic_depth_rate)}) "
	                f"should be equal to num_blocks ({num_blocks})")

	        # NOTE(review): presumably this replaces the layer stack created by
	        # the parent constructor; confirm in ConformerEncoder.
	        self.encoders = LayerDropModuleList(
	            p=stochastic_depth_rate,
	            modules=[
	                EBranchformerEncoderLayer(
	                    output_size,
	                    WENET_ATTENTION_CLASSES[selfattention_layer_type](
	                        *encoder_selfattn_layer_args),
	                    cgmlp_layer(*cgmlp_layer_args),
	                    mlp_class(*positionwise_layer_args) if use_ffn else None,
	                    mlp_class(*positionwise_layer_args)
	                    if use_ffn and macaron_style else None,
	                    dropout_rate,
	                    merge_conv_kernel=merge_conv_kernel,
	                    causal=causal,
	                    stochastic_depth_rate=stochastic_depth_rate[lnum],
	                ) for lnum in range(num_blocks)
	            ])