import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.init import trunc_normal_
from openrec.modeling.common import Block
class RCTCDecoder(nn.Module):
    """CTC decoder head that collapses a 2D feature map into a width-wise
    sequence via a single learned character query attending over the
    height axis, then classifies each column.

    Args:
        in_channels: channel dimension C of the incoming feature map.
        out_channels: size of the character vocabulary (logit dimension).
        return_feats: if True (and in training mode), return the pooled
            features alongside the logits.
    """

    def __init__(self,
                 in_channels,
                 out_channels=6625,
                 return_feats=False,
                 **kwargs):
        super().__init__()
        # Learned query token; one shared query attends over every column.
        self.char_token = nn.Parameter(
            torch.zeros([1, 1, in_channels], dtype=torch.float32),
            requires_grad=True,
        )
        trunc_normal_(self.char_token, mean=0, std=0.02)
        # Final per-column classifier over the vocabulary.
        self.fc = nn.Linear(in_channels, out_channels, bias=True)
        # Single projection producing both keys and values (2*C channels).
        self.fc_kv = nn.Linear(in_channels, 2 * in_channels, bias=True)
        # Row-wise transformer block: self-attention along the width axis.
        self.w_atten_block = Block(dim=in_channels,
                                   num_heads=in_channels // 32,
                                   mlp_ratio=4.0,
                                   qkv_bias=False)
        self.out_channels = out_channels
        self.return_feats = return_feats

    def forward(self, x, data=None):
        """Decode a (B, C, H, W) feature map into (B, W, out_channels) logits
        (training) or per-step softmax probabilities (eval)."""
        B, C, H, W = x.shape
        # Treat each of the B*H rows as an independent length-W sequence
        # and run self-attention along the width axis.
        rows = x.permute(0, 2, 3, 1).reshape(-1, W, C)
        x = self.w_atten_block(rows).reshape(B, H, W, C).permute(0, 3, 1, 2)
        # Joint key/value projection, laid out as (2, B, C, H*W).
        kv = self.fc_kv(x.flatten(2).transpose(1, 2))
        kv = kv.reshape(B, H * W, 2, C).permute(2, 0, 3, 1)
        keys, values = kv.unbind(0)  # each (B, C, H*W)
        query = self.char_token.tile([B, 1, 1])  # (B, 1, C)
        # Attention logits of the character query over all positions.
        attn = (query @ keys).reshape([-1, 1, H, W])  # (B, 1, H, W)
        attn = F.softmax(attn, 2)  # normalize over the height axis only
        attn = attn.permute(0, 3, 1, 2)  # (B, W, 1, H)
        values = values.reshape(B, C, H, W)
        # Height-weighted pooling per column: (B, W, 1, H) @ (B, W, H, C)
        # gives (B, W, 1, C); drop the singleton to get (B, W, C).
        feats = (attn @ values.permute(0, 3, 2, 1)).squeeze(2)
        predicts = self.fc(feats)
        result = (feats, predicts) if self.return_feats else predicts
        if not self.training:
            # Inference path emits probabilities only (feats are dropped
            # here even when return_feats is set, matching training-free use).
            result = F.softmax(predicts, dim=2)
        return result