# Copyright (c) 2022, Horizon Inc. Xingchen Song (sxc19@tsinghua.org.cn)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""NOTE(xcsong): Currently, we only support
1. specific conformer encoder architecture, see:
    encoder: conformer
    encoder_conf:
      activation_type: **must be** relu
      attention_heads: 2 or 4 or 8 or any number divisible by output_size
      causal: **must be** true
      cnn_module_kernel: 1 ~ 7
      cnn_module_norm: **must be** batch_norm
      input_layer: **must be** conv2d8
      linear_units: 1 ~ 2048
      normalize_before: **must be** true
      num_blocks: 1 ~ 12
      output_size: 1 ~ 512
      pos_enc_layer_type: **must be** no_pos
      selfattention_layer_type: **must be** selfattn
      use_cnn_module: **must be** true
      use_dynamic_chunk: **must be** true
      use_dynamic_left_chunk: **must be** true

2. specific decoding method: ctc_greedy_search
"""

from __future__ import print_function

import os
import sys
import copy
import math
import yaml
import logging
from typing import Tuple

import torch
import numpy as np

from wenet.transformer.embedding import NoPositionalEncoding
from wenet.utils.init_model import init_model
from wenet.bin.export_onnx_cpu import (get_args, to_numpy,
                                       print_input_output_info)

try:
    import onnx
    import onnxruntime
except ImportError:
    print('Please install onnx and onnxruntime!')
    sys.exit(1)

logger = logging.getLogger(__file__)
logger.setLevel(logging.INFO)


class BPULayerNorm(torch.nn.Module):
    """Refactor torch.nn.LayerNorm to meet 4-D dataflow."""

    def __init__(self, module, chunk_size=8, run_on_bpu=False):
        super().__init__()
        original = copy.deepcopy(module)
        self.hidden = module.weight.size(0)
        self.chunk_size = chunk_size
        self.run_on_bpu = run_on_bpu

        if self.run_on_bpu:
            self.weight = torch.nn.Parameter(
                module.weight.reshape(1, self.hidden, 1,
                                      1).repeat(1, 1, 1, chunk_size))
            self.bias = torch.nn.Parameter(
                module.bias.reshape(1, self.hidden, 1,
                                    1).repeat(1, 1, 1, chunk_size))
            self.negtive = torch.nn.Parameter(
                torch.ones((1, self.hidden, 1, chunk_size)) * -1.0)
            self.eps = torch.nn.Parameter(
                torch.zeros((1, self.hidden, 1, chunk_size)) + module.eps)
            self.mean_conv_1 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False)
            self.mean_conv_1.weight = torch.nn.Parameter(
                torch.ones(self.hidden, self.hidden, 1, 1) /
                (1.0 * self.hidden))
            self.mean_conv_2 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False)
            self.mean_conv_2.weight = torch.nn.Parameter(
                torch.ones(self.hidden, self.hidden, 1, 1) /
                (1.0 * self.hidden))
        else:
            self.norm = module

        self.check_equal(original)

    def check_equal(self, module):
        random_data = torch.randn(1, self.chunk_size, self.hidden)
        orig_out = module(random_data)
        new_out = self.forward(random_data.transpose(1, 2).unsqueeze(2))
        np.testing.assert_allclose(to_numpy(orig_out),
                                   to_numpy(
                                       new_out.squeeze(2).transpose(1, 2)),
                                   rtol=1e-02,
                                   atol=1e-03)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.run_on_bpu:
            u = self.mean_conv_1(x)  # (1, h, 1, c)
            numerator = x + u * self.negtive  # (1, h, 1, c)
            s = torch.pow(numerator, 2)  # (1, h, 1, c)
            s = self.mean_conv_2(s)  # (1, h, 1, c)
            denominator = torch.sqrt(s + self.eps)  # (1, h, 1, c)
            x = torch.div(numerator, denominator)  # (1, h, 1, c)
            x = x * self.weight + self.bias
        else:
            x = x.squeeze(2).transpose(1, 2).contiguous()
            x = self.norm(x)
            x = x.transpose(1, 2).contiguous().unsqueeze(2)
        return x


class BPUIdentity(torch.nn.Module):
    """Refactor torch.nn.Identity().
       For inserting BPU node whose input == output.
    """

    def __init__(self, channels):
        super().__init__()
        self.channels = channels
        self.identity_conv = torch.nn.Conv2d(channels,
                                             channels,
                                             1,
                                             groups=channels,
                                             bias=False)
        torch.nn.init.dirac_(self.identity_conv.weight.data, groups=channels)

        self.check_equal()

    def check_equal(self):
        random_data = torch.randn(1, self.channels, 1, 10)
        result = self.forward(random_data)
        np.testing.assert_allclose(to_numpy(random_data),
                                   to_numpy(result),
                                   rtol=1e-02,
                                   atol=1e-03)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Identity with 4-D dataflow, input == output.
        Args:
            x (torch.Tensor): (batch, in_channel, 1, time)

        Returns:
            (torch.Tensor): (batch, in_channel, 1, time).
        """
        return self.identity_conv(x)


class BPULinear(torch.nn.Module):
    """Refactor torch.nn.Linear or pointwise_conv"""

    def __init__(self, module, is_pointwise_conv=False):
        super().__init__()
        # Unchanged submodules and attributes
        original = copy.deepcopy(module)
        self.idim = module.weight.size(1)
        self.odim = module.weight.size(0)
        self.is_pointwise_conv = is_pointwise_conv

        # Modify weight & bias
        self.linear = torch.nn.Conv2d(self.idim, self.odim, 1, 1)
        if is_pointwise_conv:
            # (odim, idim, kernel=1) -> (odim, idim, 1, 1)
            self.linear.weight = torch.nn.Parameter(
                module.weight.unsqueeze(-1))
        else:
            # (odim, idim) -> (odim, idim, 1, 1)
            self.linear.weight = torch.nn.Parameter(
                module.weight.unsqueeze(2).unsqueeze(3))
        self.linear.bias = module.bias

        self.check_equal(original)

    def check_equal(self, module):
        random_data = torch.randn(1, 8, self.idim)
        if self.is_pointwise_conv:
            random_data = random_data.transpose(1, 2)
        original_result = module(random_data)
        if self.is_pointwise_conv:
            random_data = random_data.transpose(1, 2)
            original_result = original_result.transpose(1, 2)
        random_data = random_data.transpose(1, 2).unsqueeze(2)
        new_result = self.forward(random_data)
        np.testing.assert_allclose(to_numpy(original_result),
                                   to_numpy(
                                       new_result.squeeze(2).transpose(1, 2)),
                                   rtol=1e-02,
                                   atol=1e-03)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Linear with 4-D dataflow.
        Args:
            x (torch.Tensor): (batch, in_channel, 1, time)
        Returns:
            (torch.Tensor): (batch, out_channel, 1, time).
        """
        return self.linear(x)


class BPUGlobalCMVN(torch.nn.Module):
    """Refactor wenet/transformer/cmvn.py::GlobalCMVN"""

    def __init__(self, module):
        super().__init__()
        # Unchanged submodules and attributes
        self.norm_var = module.norm_var

        # NOTE(xcsong): Expand to 4-D tensor, (mel_dim) -> (1, 1, mel_dim, 1)
        self.mean = module.mean.unsqueeze(-1).unsqueeze(0).unsqueeze(0)
        self.istd = module.istd.unsqueeze(-1).unsqueeze(0).unsqueeze(0)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """CMVN with 4-D dataflow.
        Args:
            x (torch.Tensor): (batch, 1, mel_dim, time)
        Returns:
            (torch.Tensor): normalized feature with same shape.
        """
        x = x - self.mean
        if self.norm_var:
            x = x * self.istd
        return x


class BPUConv2dSubsampling8(torch.nn.Module):
    """Refactor wenet/transformer/subsampling.py::Conv2dSubsampling8

    NOTE(xcsong): Only support pos_enc_class == NoPositionalEncoding
    """

    def __init__(self, module):
        super().__init__()
        # Unchanged submodules and attributes
        original = copy.deepcopy(module)
        self.right_context = module.right_context
        self.subsampling_rate = module.subsampling_rate
        assert isinstance(module.pos_enc, NoPositionalEncoding)

        # 1. Modify self.conv
        # NOTE(xcsong): We change input shape from (1, 1, frames, mel_dim)
        #   to (1, 1, mel_dim, frames) for more efficient computation.
        self.conv = module.conv
        for idx in [0, 2, 4]:
            self.conv[idx].weight = torch.nn.Parameter(
                module.conv[idx].weight.transpose(2, 3))

        # 2. Modify self.linear
        # NOTE(xcsong): Split final projection to meet the requirment of
        #   maximum kernel_size (7 for XJ3)
        self.linear = torch.nn.ModuleList()
        odim = module.linear.weight.size(0)  # 512, in this case
        freq = module.linear.weight.size(1) // odim  # 4608 // 512 == 9
        self.odim, self.freq = odim, freq
        weight = module.linear.weight.reshape(
            odim, odim, freq,
            1)  # (odim, odim * freq) -> (odim, odim, freq, 1)
        self.split_size = []
        num_split = (freq - 1) // 7 + 1  # XJ3 requires kernel_size <= 7
        slice_begin = 0
        for idx in range(num_split):
            kernel_size = min(freq, (idx + 1) * 7) - idx * 7
            conv_ele = torch.nn.Conv2d(odim, odim, (kernel_size, 1),
                                       (kernel_size, 1))
            conv_ele.weight = torch.nn.Parameter(
                weight[:, :, slice_begin:slice_begin + kernel_size, :])
            conv_ele.bias = torch.nn.Parameter(torch.zeros_like(conv_ele.bias))
            self.linear.append(conv_ele)
            self.split_size.append(kernel_size)
            slice_begin += kernel_size
        self.linear[0].bias = torch.nn.Parameter(module.linear.bias)

        self.check_equal(original)

    def check_equal(self, module):
        random_data = torch.randn(1, 67, 80)
        mask = torch.zeros(1, 1, 67)
        original_result, _, _ = module(random_data, mask)  # (1, 8, 512)
        random_data = random_data.transpose(1,
                                            2).unsqueeze(0)  # (1, 1, 80, 67)
        new_result = self.forward(random_data)  # (1, 512, 1, 8)
        np.testing.assert_allclose(to_numpy(original_result),
                                   to_numpy(
                                       new_result.squeeze(2).transpose(1, 2)),
                                   rtol=1e-02,
                                   atol=1e-03)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Subsample x with 4-D dataflow.
        Args:
            x (torch.Tensor): Input tensor (#batch, 1, mel_dim, time).

        Returns:
            torch.Tensor: Subsampled tensor (#batch, odim, 1, time'),
                where time' = time // 8.
        """
        x = self.conv(x)  # (1, odim, freq, time')
        x_out = torch.zeros(x.size(0), self.odim, 1, x.size(3))
        x = torch.split(x, self.split_size, dim=2)
        for idx, (x_part, layer) in enumerate(zip(x, self.linear)):
            x_out += layer(x_part)
        return x_out


class BPUMultiHeadedAttention(torch.nn.Module):
    """Refactor wenet/transformer/attention.py::MultiHeadedAttention

    NOTE(xcsong): Only support attention_class == MultiHeadedAttention,
        we do not consider RelPositionMultiHeadedAttention currently.
    """

    def __init__(self, module, chunk_size, left_chunks):
        super().__init__()
        # Unchanged submodules and attributes
        original = copy.deepcopy(module)
        self.d_k = module.d_k
        self.h = module.h
        n_feat = self.d_k * self.h
        self.chunk_size = chunk_size
        self.left_chunks = left_chunks
        self.time = chunk_size * (left_chunks + 1)
        self.activation = torch.nn.Softmax(dim=-1)

        # 1. Modify self.linear_x
        self.linear_q = BPULinear(module.linear_q)
        self.linear_k = BPULinear(module.linear_k)
        self.linear_v = BPULinear(module.linear_v)
        self.linear_out = BPULinear(module.linear_out)
        # 2. denom
        self.register_buffer(
            "denom", torch.full((1, self.h, 1, 1), 1.0 / math.sqrt(self.d_k)))

        self.check_equal(original)

    def check_equal(self, module):
        random_data = torch.randn(1, self.chunk_size, self.d_k * self.h)
        mask = torch.ones((1, self.h, self.chunk_size, self.time),
                          dtype=torch.bool)
        cache = torch.zeros(1, self.h, self.chunk_size * self.left_chunks,
                            self.d_k * 2)
        original_out, original_cache = module(random_data, random_data,
                                              random_data, mask[:, 0, :, :],
                                              torch.empty(0), cache)
        random_data = random_data.transpose(1, 2).unsqueeze(2)
        cache = cache.reshape(1, self.h, self.d_k * 2,
                              self.chunk_size * self.left_chunks)
        new_out, new_cache = self.forward(random_data, random_data,
                                          random_data, mask, cache)
        np.testing.assert_allclose(to_numpy(original_out),
                                   to_numpy(
                                       new_out.squeeze(2).transpose(1, 2)),
                                   rtol=1e-02,
                                   atol=1e-03)
        np.testing.assert_allclose(to_numpy(original_cache),
                                   to_numpy(new_cache.transpose(2, 3)),
                                   rtol=1e-02,
                                   atol=1e-03)

    def forward(
        self,
        q: torch.Tensor,
        k: torch.Tensor,
        v: torch.Tensor,
        mask: torch.Tensor,
        cache: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Compute scaled dot product attention.

        Args:
            q (torch.Tensor): Query tensor (#batch, size, 1, chunk_size).
            k (torch.Tensor): Key tensor (#batch, size, 1, chunk_size).
            v (torch.Tensor): Value tensor (#batch, size, 1, chunk_size).
            mask (torch.Tensor): Mask tensor,
                (#batch, head, chunk_size, cache_t + chunk_size).
            cache (torch.Tensor): Cache tensor
                (1, head, d_k * 2, cache_t),
                where `cache_t == chunk_size * left_chunks`.


        Returns:
            torch.Tensor: Output tensor (#batch, size, 1, chunk_size).
            torch.Tensor: Cache tensor
                (1, head, d_k * 2, cache_t + chunk_size)
                where `cache_t == chunk_size * left_chunks`
        """
        # 1. Forward QKV
        q = self.linear_q(q)  # (1, d, 1, c) d == size, c == chunk_size
        k = self.linear_k(k)  # (1, d, 1, c)
        v = self.linear_v(v)  # (1, d, 1, c)
        q = q.view(1, self.h, self.d_k, self.chunk_size)
        k = k.view(1, self.h, self.d_k, self.chunk_size)
        v = v.view(1, self.h, self.d_k, self.chunk_size)
        q = q.transpose(2, 3)  # (batch, head, time1, d_k)
        k_cache, v_cache = torch.split(cache, cache.size(2) // 2, dim=2)
        k = torch.cat((k_cache, k), dim=3)
        v = torch.cat((v_cache, v), dim=3)
        new_cache = torch.cat((k, v), dim=2)
        # 2. (Q^T)K
        scores = torch.matmul(q, k) * self.denom  # (#b, n_head, time1, time2)
        # 3. Forward attention
        mask = mask.eq(0)
        scores = scores.masked_fill(mask, -float('inf'))
        attn = self.activation(scores).masked_fill(mask, 0.0)
        attn = attn.transpose(2, 3)
        x = torch.matmul(v, attn)
        x = x.view(1, self.d_k * self.h, 1, self.chunk_size)
        x_out = self.linear_out(x)
        return x_out, new_cache


class BPUConvolution(torch.nn.Module):
    """Refactor wenet/transformer/convolution.py::ConvolutionModule

    NOTE(xcsong): Only suport use_layer_norm == False
    """

    def __init__(self, module):
        super().__init__()
        # Unchanged submodules and attributes
        original = copy.deepcopy(module)
        self.lorder = module.lorder
        self.use_layer_norm = False
        self.activation = module.activation
        channels = module.pointwise_conv1.weight.size(1)
        self.channels = channels
        kernel_size = module.depthwise_conv.weight.size(2)
        assert module.use_layer_norm is False

        # 1. Modify self.pointwise_conv1
        self.pointwise_conv1 = BPULinear(module.pointwise_conv1, True)

        # 2. Modify self.depthwise_conv
        self.depthwise_conv = torch.nn.Conv2d(channels,
                                              channels, (1, kernel_size),
                                              stride=1,
                                              groups=channels)
        self.depthwise_conv.weight = torch.nn.Parameter(
            module.depthwise_conv.weight.unsqueeze(-2))
        self.depthwise_conv.bias = torch.nn.Parameter(
            module.depthwise_conv.bias)

        # 3. Modify self.norm, Only support batchnorm2d
        self.norm = torch.nn.BatchNorm2d(channels)
        self.norm.training = False
        self.norm.num_features = module.norm.num_features
        self.norm.eps = module.norm.eps
        self.norm.momentum = module.norm.momentum
        self.norm.weight = torch.nn.Parameter(module.norm.weight)
        self.norm.bias = torch.nn.Parameter(module.norm.bias)
        self.norm.running_mean = module.norm.running_mean
        self.norm.running_var = module.norm.running_var

        # 4. Modify self.pointwise_conv2
        self.pointwise_conv2 = BPULinear(module.pointwise_conv2, True)

        # 5. Identity conv, for running `concat` on BPU
        self.identity = BPUIdentity(channels)

        self.check_equal(original)

    def check_equal(self, module):
        random_data = torch.randn(1, 8, self.channels)
        cache = torch.zeros((1, self.channels, self.lorder))
        original_out, original_cache = module(random_data, cache=cache)
        random_data = random_data.transpose(1, 2).unsqueeze(2)
        cache = cache.unsqueeze(2)
        new_out, new_cache = self.forward(random_data, cache)
        np.testing.assert_allclose(to_numpy(original_out),
                                   to_numpy(
                                       new_out.squeeze(2).transpose(1, 2)),
                                   rtol=1e-02,
                                   atol=1e-03)
        np.testing.assert_allclose(to_numpy(original_cache),
                                   to_numpy(new_cache.squeeze(2)),
                                   rtol=1e-02,
                                   atol=1e-03)

    def forward(self, x: torch.Tensor,
                cache: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Compute convolution module.
        Args:
            x (torch.Tensor): Input tensor (#batch, channels, 1, chunk_size).
            cache (torch.Tensor): left context cache, it is only
                used in causal convolution (#batch, channels, 1, cache_t).
        Returns:
            torch.Tensor: Output tensor (#batch, channels, 1, chunk_size).
            torch.Tensor: Cache tensor (#batch, channels, 1, cache_t).
        """
        # Concat cache
        x = torch.cat((self.identity(cache), self.identity(x)), dim=3)
        new_cache = x[:, :, :, -self.lorder:]

        # GLU mechanism
        x = self.pointwise_conv1(x)  # (batch, 2*channel, 1, dim)
        x = torch.nn.functional.glu(x, dim=1)  # (b, channel, 1, dim)

        # Depthwise Conv
        x = self.depthwise_conv(x)
        x = self.activation(self.norm(x))
        x = self.pointwise_conv2(x)
        return x, new_cache


class BPUFFN(torch.nn.Module):
    """Refactor wenet/transformer/positionwise_feed_forward.py::PositionwiseFeedForward
    """

    def __init__(self, module):
        super().__init__()
        # Unchanged submodules and attributes
        original = copy.deepcopy(module)
        self.activation = module.activation

        # 1. Modify self.w_x
        self.w_1 = BPULinear(module.w_1)
        self.w_2 = BPULinear(module.w_2)

        self.check_equal(original)

    def check_equal(self, module):
        random_data = torch.randn(1, 8, self.w_1.idim)
        original_out = module(random_data)
        random_data = random_data.transpose(1, 2).unsqueeze(2)
        new_out = self.forward(random_data)
        np.testing.assert_allclose(to_numpy(original_out),
                                   to_numpy(
                                       new_out.squeeze(2).transpose(1, 2)),
                                   rtol=1e-02,
                                   atol=1e-03)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward function.

        Args:
            xs: input tensor (B, D, 1, L)
        Returns:
            output tensor, (B, D, 1, L)
        """
        return self.w_2(self.activation(self.w_1(x)))


class BPUConformerEncoderLayer(torch.nn.Module):
    """Refactor wenet/transformer/encoder_layer.py::ConformerEncoderLayer
    """

    def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False):
        super().__init__()
        # Unchanged submodules and attributes
        original = copy.deepcopy(module)
        self.size = module.size
        assert module.normalize_before is True
        assert module.concat_after is False

        # 1. Modify submodules
        self.feed_forward_macaron = BPUFFN(module.feed_forward_macaron)
        self.self_attn = BPUMultiHeadedAttention(module.self_attn, chunk_size,
                                                 left_chunks)
        self.conv_module = BPUConvolution(module.conv_module)
        self.feed_forward = BPUFFN(module.feed_forward)

        # 2. Modify norms
        self.norm_ff = BPULayerNorm(module.norm_ff, chunk_size, ln_run_on_bpu)
        self.norm_mha = BPULayerNorm(module.norm_mha, chunk_size,
                                     ln_run_on_bpu)
        self.norm_ff_macron = BPULayerNorm(module.norm_ff_macaron, chunk_size,
                                           ln_run_on_bpu)
        self.norm_conv = BPULayerNorm(module.norm_conv, chunk_size,
                                      ln_run_on_bpu)
        self.norm_final = BPULayerNorm(module.norm_final, chunk_size,
                                       ln_run_on_bpu)

        # 3. 4-D ff_scale
        self.register_buffer("ff_scale",
                             torch.full((1, self.size, 1, 1), module.ff_scale))

        self.check_equal(original)

    def check_equal(self, module):
        time1 = self.self_attn.chunk_size
        time2 = self.self_attn.time
        h, d_k = self.self_attn.h, self.self_attn.d_k
        random_x = torch.randn(1, time1, self.size)
        att_mask = torch.ones(1, h, time1, time2)
        att_cache = torch.zeros(1, h, time2 - time1, d_k * 2)
        cnn_cache = torch.zeros(1, self.size, self.conv_module.lorder)
        original_x, _, original_att_cache, original_cnn_cache = module(
            random_x,
            att_mask[:, 0, :, :],
            torch.empty(0),
            att_cache=att_cache,
            cnn_cache=cnn_cache)
        random_x = random_x.transpose(1, 2).unsqueeze(2)
        att_cache = att_cache.reshape(1, h, d_k * 2, time2 - time1)
        cnn_cache = cnn_cache.unsqueeze(2)
        new_x, new_att_cache, new_cnn_cache = self.forward(
            random_x, att_mask, att_cache, cnn_cache)
        np.testing.assert_allclose(to_numpy(original_att_cache),
                                   to_numpy(new_att_cache.transpose(2, 3)),
                                   rtol=1e-02,
                                   atol=1e-03)
        np.testing.assert_allclose(to_numpy(original_x),
                                   to_numpy(new_x.squeeze(2).transpose(1, 2)),
                                   rtol=1e-02,
                                   atol=1e-03)
        np.testing.assert_allclose(to_numpy(original_cnn_cache),
                                   to_numpy(new_cnn_cache.squeeze(2)),
                                   rtol=1e-02,
                                   atol=1e-03)

    def forward(
        self, x: torch.Tensor, att_mask: torch.Tensor, att_cache: torch.Tensor,
        cnn_cache: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Compute encoded features.

        Args:
            x (torch.Tensor): (#batch, size, 1, chunk_size)
            att_mask (torch.Tensor): Mask tensor for the input
                (#batch, head, chunk_size, cache_t1 + chunk_size),
            att_cache (torch.Tensor): Cache tensor of the KEY & VALUE
                (#batch=1, head, d_k * 2, cache_t1), head * d_k == size.
            cnn_cache (torch.Tensor): Convolution cache in conformer layer
                (#batch=1, size, 1, cache_t2)
        Returns:
            torch.Tensor: Output tensor (#batch, size, 1, chunk_size).
            torch.Tensor: att_cache tensor,
                (1, head, d_k * 2, cache_t1 + chunk_size).
            torch.Tensor: cnn_cahce tensor (#batch, size, 1, cache_t2).
        """
        # 1. ffn_macaron
        residual = x
        x = self.norm_ff_macron(x)
        x = residual + self.ff_scale * self.feed_forward_macaron(x)

        # 2. attention
        residual = x
        x = self.norm_mha(x)
        x_att, new_att_cache = self.self_attn(x, x, x, att_mask, att_cache)
        x = residual + x_att

        # 3. convolution
        residual = x
        x = self.norm_conv(x)
        x, new_cnn_cache = self.conv_module(x, cnn_cache)
        x = residual + x

        # 4. ffn
        residual = x
        x = self.norm_ff(x)
        x = residual + self.ff_scale * self.feed_forward(x)

        # 5. final post-norm
        x = self.norm_final(x)

        return x, new_att_cache, new_cnn_cache


class BPUConformerEncoder(torch.nn.Module):
    """Refactor wenet/transformer/encoder.py::ConformerEncoder
    """

    def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False):
        super().__init__()
        # Unchanged submodules and attributes
        original = copy.deepcopy(module)
        output_size = module.output_size()
        self._output_size = module.output_size()
        self.after_norm = module.after_norm
        self.chunk_size = chunk_size
        self.left_chunks = left_chunks
        self.head = module.encoders[0].self_attn.h
        self.layers = len(module.encoders)

        # 1. Modify submodules
        self.global_cmvn = BPUGlobalCMVN(module.global_cmvn)
        self.embed = BPUConv2dSubsampling8(module.embed)
        self.encoders = torch.nn.ModuleList()
        for layer in module.encoders:
            self.encoders.append(
                BPUConformerEncoderLayer(layer, chunk_size, left_chunks,
                                         ln_run_on_bpu))

        # 2. Auxiliary conv
        self.identity_cnncache = BPUIdentity(output_size)

        self.check_equal(original)

    def check_equal(self, module):
        time1 = self.encoders[0].self_attn.chunk_size
        time2 = self.encoders[0].self_attn.time
        layers = self.layers
        h, d_k = self.head, self.encoders[0].self_attn.d_k
        decoding_window = (self.chunk_size - 1) * \
            module.embed.subsampling_rate + \
            module.embed.right_context + 1
        lorder = self.encoders[0].conv_module.lorder
        random_x = torch.randn(1, decoding_window, 80)
        att_mask = torch.ones(1, h, time1, time2)
        att_cache = torch.zeros(layers, h, time2 - time1, d_k * 2)
        cnn_cache = torch.zeros(layers, 1, self._output_size, lorder)
        orig_x, orig_att_cache, orig_cnn_cache = module.forward_chunk(
            random_x,
            0,
            time2 - time1,
            att_mask=att_mask[:, 0, :, :],
            att_cache=att_cache,
            cnn_cache=cnn_cache)
        random_x = random_x.unsqueeze(0)
        att_cache = att_cache.reshape(1, h * layers, d_k * 2, time2 - time1)
        cnn_cache = cnn_cache.reshape(1, self._output_size, layers, lorder)
        new_x, new_att_cache, new_cnn_cache = self.forward(
            random_x, att_cache, cnn_cache, att_mask)
        caches = torch.split(new_att_cache, h, dim=1)
        caches = [c.transpose(2, 3) for c in caches]
        np.testing.assert_allclose(to_numpy(orig_att_cache),
                                   to_numpy(torch.cat(caches, dim=0)),
                                   rtol=1e-02,
                                   atol=1e-03)
        np.testing.assert_allclose(to_numpy(orig_x),
                                   to_numpy(new_x.squeeze(2).transpose(1, 2)),
                                   rtol=1e-02,
                                   atol=1e-03)
        np.testing.assert_allclose(
            to_numpy(orig_cnn_cache),
            to_numpy(new_cnn_cache.transpose(0, 2).transpose(1, 2)),
            rtol=1e-02,
            atol=1e-03)

    def forward(
        self, xs: torch.Tensor, att_cache: torch.Tensor,
        cnn_cache: torch.Tensor, att_mask: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """ Forward just one chunk

        Args:
            xs (torch.Tensor): chunk input, with shape (b=1, 1, time, mel-dim),
                where `time == (chunk_size - 1) * subsample_rate + \
                        subsample.right_context + 1`
            att_cache (torch.Tensor): cache tensor for KEY & VALUE in
                transformer/conformer attention, with shape
                (1, head * elayers, d_k * 2, cache_t1), where
                `head * d_k == hidden-dim` and
                `cache_t1 == chunk_size * left_chunks`.
            cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer,
                (1, hidden-dim, elayers, cache_t2), where
                `cache_t2 == cnn.lorder - 1`
            att_mask (torch.Tensor): Mask tensor for the input
                (#batch, head, chunk_size, cache_t1 + chunk_size),

        Returns:
            torch.Tensor: output of current input xs,
                with shape (b=1, hidden-dim, 1, chunk_size).
            torch.Tensor: new attention cache required for next chunk, with
                same shape as the original att_cache.
            torch.Tensor: new conformer cnn cache required for next chunk, with
                same shape as the original cnn_cache.
        """
        # xs: (B, 1, time, mel_dim) -> (B, 1, mel_dim, time)
        xs = xs.transpose(2, 3)
        xs = self.global_cmvn(xs)
        # xs: (B, 1, mel_dim, time) -> (B, hidden_dim, 1, chunk_size)
        xs = self.embed(xs)

        att_cache = torch.split(att_cache, self.head, dim=1)
        cnn_cache = self.identity_cnncache(cnn_cache)
        cnn_cache = torch.split(cnn_cache, 1, dim=2)
        r_att_cache = []
        r_cnn_cache = []
        for i, layer in enumerate(self.encoders):
            xs, new_att_cache, new_cnn_cache = layer(xs,
                                                     att_mask,
                                                     att_cache=att_cache[i],
                                                     cnn_cache=cnn_cache[i])
            r_att_cache.append(new_att_cache[:, :, :, self.chunk_size:])
            r_cnn_cache.append(new_cnn_cache)
        r_att_cache = torch.cat(r_att_cache, dim=1)
        r_cnn_cache = self.identity_cnncache(torch.cat(r_cnn_cache, dim=2))

        xs = xs.squeeze(2).transpose(1, 2).contiguous()
        xs = self.after_norm(xs)
        # NOTE(xcsong): 4D in, 4D out to meet the requirment of CTC input.
        xs = xs.transpose(1, 2).contiguous().unsqueeze(2)  # (B, C, 1, T)

        return (xs, r_att_cache, r_cnn_cache)


class BPUCTC(torch.nn.Module):
    """Refactor wenet/transformer/ctc.py::CTC
    """

    def __init__(self, module):
        super().__init__()
        # Unchanged submodules and attributes
        original = copy.deepcopy(module)
        self.idim = module.ctc_lo.weight.size(1)
        num_class = module.ctc_lo.weight.size(0)

        # 1. Modify self.ctc_lo, Split final projection to meet the
        #   requirment of maximum in/out channels (2048 for XJ3)
        self.ctc_lo = torch.nn.ModuleList()
        self.split_size = []
        num_split = (num_class - 1) // 2048 + 1
        for idx in range(num_split):
            out_channel = min(num_class, (idx + 1) * 2048) - idx * 2048
            conv_ele = torch.nn.Conv2d(self.idim, out_channel, 1, 1)
            self.ctc_lo.append(conv_ele)
            self.split_size.append(out_channel)
        orig_weight = torch.split(module.ctc_lo.weight, self.split_size, dim=0)
        orig_bias = torch.split(module.ctc_lo.bias, self.split_size, dim=0)
        for i, (w, b) in enumerate(zip(orig_weight, orig_bias)):
            w = w.unsqueeze(2).unsqueeze(3)
            self.ctc_lo[i].weight = torch.nn.Parameter(w)
            self.ctc_lo[i].bias = torch.nn.Parameter(b)

        self.check_equal(original)

    def check_equal(self, module):
        random_data = torch.randn(1, 100, self.idim)
        original_result = module.ctc_lo(random_data)
        random_data = random_data.transpose(1, 2).unsqueeze(2)
        new_result = self.forward(random_data)
        np.testing.assert_allclose(to_numpy(original_result),
                                   to_numpy(
                                       new_result.squeeze(2).transpose(1, 2)),
                                   rtol=1e-02,
                                   atol=1e-03)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """frame activations, without softmax.

        Args:
            Tensor x: 4d tensor (B, hidden_dim, 1, chunk_size)
        Returns:
            torch.Tensor: (B, num_class, 1, chunk_size)
        """
        out = []
        for i, layer in enumerate(self.ctc_lo):
            out.append(layer(x))
        out = torch.cat(out, dim=1)
        return out


def export_encoder(asr_model, args):
    logger.info("Stage-1: export encoder")
    decode_window, mel_dim = args.decoding_window, args.feature_size
    encoder = BPUConformerEncoder(asr_model.encoder, args.chunk_size,
                                  args.num_decoding_left_chunks,
                                  args.ln_run_on_bpu)
    encoder.eval()
    encoder_outpath = os.path.join(args.output_dir, 'encoder.onnx')

    logger.info("Stage-1.1: prepare inputs for encoder")
    chunk = torch.randn((1, 1, decode_window, mel_dim))
    required_cache_size = encoder.chunk_size * encoder.left_chunks
    kv_time = required_cache_size + encoder.chunk_size
    hidden, layers = encoder._output_size, len(encoder.encoders)
    head = encoder.encoders[0].self_attn.h
    d_k = hidden // head
    lorder = encoder.encoders[0].conv_module.lorder
    att_cache = torch.zeros(1, layers * head, d_k * 2, required_cache_size)
    att_mask = torch.ones((1, head, encoder.chunk_size, kv_time))
    att_mask[:, :, :, :required_cache_size] = 0
    cnn_cache = torch.zeros((1, hidden, layers, lorder))
    inputs = (chunk, att_cache, cnn_cache, att_mask)
    logger.info("chunk.size(): {} att_cache.size(): {} "
                "cnn_cache.size(): {} att_mask.size(): {}".format(
                    list(chunk.size()), list(att_cache.size()),
                    list(cnn_cache.size()), list(att_mask.size())))

    logger.info("Stage-1.2: torch.onnx.export")
    # NOTE(xcsong): Below attributes will be used in
    #   onnx2horizonbin.py::generate_config()
    attributes = {}
    attributes['input_name'] = "chunk;att_cache;cnn_cache;att_mask"
    attributes['output_name'] = "output;r_att_cache;r_cnn_cache"
    attributes['input_type'] = "featuremap;featuremap;featuremap;featuremap"
    attributes['norm_type'] = \
        "no_preprocess;no_preprocess;no_preprocess;no_preprocess"
    attributes['input_layout_train'] = "NCHW;NCHW;NCHW;NCHW"
    attributes['input_layout_rt'] = "NCHW;NCHW;NCHW;NCHW"
    attributes['input_shape'] = \
        "{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{}".format(
        chunk.size(0), chunk.size(1), chunk.size(2), chunk.size(3),
        att_cache.size(0), att_cache.size(1), att_cache.size(2),
        att_cache.size(3), cnn_cache.size(0), cnn_cache.size(1),
        cnn_cache.size(2), cnn_cache.size(3), att_mask.size(0),
        att_mask.size(1), att_mask.size(2), att_mask.size(3)
    )
    torch.onnx.export(  # NOTE(xcsong): only support opset==11
        encoder,
        inputs,
        encoder_outpath,
        opset_version=11,
        export_params=True,
        do_constant_folding=True,
        input_names=attributes['input_name'].split(';'),
        output_names=attributes['output_name'].split(';'),
        dynamic_axes=None,
        verbose=False)
    onnx_encoder = onnx.load(encoder_outpath)
    for k in vars(args):
        meta = onnx_encoder.metadata_props.add()
        meta.key, meta.value = str(k), str(getattr(args, k))
    for k in attributes:
        meta = onnx_encoder.metadata_props.add()
        meta.key, meta.value = str(k), str(attributes[k])
    onnx.checker.check_model(onnx_encoder)
    onnx.helper.printable_graph(onnx_encoder.graph)
    onnx.save(onnx_encoder, encoder_outpath)
    print_input_output_info(onnx_encoder, "onnx_encoder")
    logger.info('Export onnx_encoder, done! see {}'.format(encoder_outpath))

    logger.info("Stage-1.3: check onnx_encoder and torch_encoder")
    torch_output = []
    torch_chunk, torch_att_mask = copy.deepcopy(chunk), copy.deepcopy(att_mask)
    torch_att_cache = copy.deepcopy(att_cache)
    torch_cnn_cache = copy.deepcopy(cnn_cache)
    for i in range(10):
        logger.info("torch chunk-{}: {}, att_cache: {}, cnn_cache: {}"
                    ", att_mask: {}".format(i, list(torch_chunk.size()),
                                            list(torch_att_cache.size()),
                                            list(torch_cnn_cache.size()),
                                            list(torch_att_mask.size())))
        torch_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1
        out, torch_att_cache, torch_cnn_cache = encoder(
            torch_chunk, torch_att_cache, torch_cnn_cache, torch_att_mask)
        torch_output.append(out)
    torch_output = torch.cat(torch_output, dim=-1)

    onnx_output = []
    onnx_chunk, onnx_att_mask = to_numpy(chunk), to_numpy(att_mask)
    onnx_att_cache = to_numpy(att_cache)
    onnx_cnn_cache = to_numpy(cnn_cache)
    ort_session = onnxruntime.InferenceSession(encoder_outpath)
    input_names = [node.name for node in onnx_encoder.graph.input]
    for i in range(10):
        logger.info("onnx  chunk-{}: {}, att_cache: {}, cnn_cache: {},"
                    " att_mask: {}".format(i, onnx_chunk.shape,
                                           onnx_att_cache.shape,
                                           onnx_cnn_cache.shape,
                                           onnx_att_mask.shape))
        onnx_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)):] = 1
        ort_inputs = {
            'chunk': onnx_chunk,
            'att_cache': onnx_att_cache,
            'cnn_cache': onnx_cnn_cache,
            'att_mask': onnx_att_mask,
        }
        ort_outs = ort_session.run(None, ort_inputs)
        onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2]
        onnx_output.append(ort_outs[0])
    onnx_output = np.concatenate(onnx_output, axis=-1)

    np.testing.assert_allclose(to_numpy(torch_output),
                               onnx_output,
                               rtol=1e-03,
                               atol=1e-04)
    meta = ort_session.get_modelmeta()
    logger.info("custom_metadata_map={}".format(meta.custom_metadata_map))
    logger.info("Check onnx_encoder, pass!")
    return encoder, ort_session


def export_ctc(asr_model, args):
    logger.info("Stage-2: export ctc")
    ctc = BPUCTC(asr_model.ctc).eval()
    ctc_outpath = os.path.join(args.output_dir, 'ctc.onnx')

    logger.info("Stage-2.1: prepare inputs for ctc")
    hidden = torch.randn((1, args.output_size, 1, args.chunk_size))

    logger.info("Stage-2.2: torch.onnx.export")
    # NOTE(xcsong): Below attributes will be used in
    #   onnx2horizonbin.py::generate_config()
    attributes = {}
    attributes['input_name'], attributes['input_type'] = "hidden", "featuremap"
    attributes['norm_type'] = "no_preprocess"
    attributes['input_layout_train'] = "NCHW"
    attributes['input_layout_rt'] = "NCHW"
    attributes['input_shape'] = "{}x{}x{}x{}".format(
        hidden.size(0),
        hidden.size(1),
        hidden.size(2),
        hidden.size(3),
    )
    torch.onnx.export(ctc,
                      hidden,
                      ctc_outpath,
                      opset_version=11,
                      export_params=True,
                      do_constant_folding=True,
                      input_names=['hidden'],
                      output_names=['probs'],
                      dynamic_axes=None,
                      verbose=False)
    onnx_ctc = onnx.load(ctc_outpath)
    for k in vars(args):
        meta = onnx_ctc.metadata_props.add()
        meta.key, meta.value = str(k), str(getattr(args, k))
    for k in attributes:
        meta = onnx_ctc.metadata_props.add()
        meta.key, meta.value = str(k), str(attributes[k])
    onnx.checker.check_model(onnx_ctc)
    onnx.helper.printable_graph(onnx_ctc.graph)
    onnx.save(onnx_ctc, ctc_outpath)
    print_input_output_info(onnx_ctc, "onnx_ctc")
    logger.info('Export onnx_ctc, done! see {}'.format(ctc_outpath))

    logger.info("Stage-2.3: check onnx_ctc and torch_ctc")
    torch_output = ctc(hidden)
    ort_session = onnxruntime.InferenceSession(ctc_outpath)
    onnx_output = ort_session.run(None, {'hidden': to_numpy(hidden)})

    np.testing.assert_allclose(to_numpy(torch_output),
                               onnx_output[0],
                               rtol=1e-03,
                               atol=1e-04)
    meta = ort_session.get_modelmeta()
    logger.info("custom_metadata_map={}".format(meta.custom_metadata_map))
    logger.info("Check onnx_ctc, pass!")
    return ctc, ort_session


def export_decoder(asr_model, args):
    logger.info("Currently, Decoder is not supported.")


if __name__ == '__main__':
    torch.manual_seed(777)
    args = get_args()
    args.ln_run_on_bpu = False
    # NOTE(xcsong): XJ3 BPU only support static shapes
    assert args.chunk_size > 0
    assert args.num_decoding_left_chunks > 0
    os.system("mkdir -p " + args.output_dir)
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

    with open(args.config, 'r') as fin:
        configs = yaml.load(fin, Loader=yaml.FullLoader)

    model, configs = init_model(args, configs)
    model.eval()
    print(model)

    args.feature_size = configs['input_dim']
    args.output_size = model.encoder.output_size()
    args.decoding_window = (args.chunk_size - 1) * \
        model.encoder.embed.subsampling_rate + \
        model.encoder.embed.right_context + 1

    export_encoder(model, args)
    export_ctc(model, args)
    export_decoder(model, args)