Spaces:

topdu
/

OpenOCR-Demo

Running

App Files Files Community

OpenOCR-Demo / openrec /modeling /decoders /rctc_decoder.py

topdu

openocr demo

29f689c 7 months ago

raw

history blame

2.32 kB

	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	from torch.nn.init import trunc_normal_

	from openrec.modeling.common import Block


	class RCTCDecoder(nn.Module):
	    """Pool a 2D feature map into one feature per column and classify it.

	    The forward pass works on a ``(B, C, H, W)`` map in three steps:

	    1. ``w_atten_block`` (a project ``Block``) is applied to every row of
	       the map, i.e. to ``B * H`` sequences of length ``W``.
	    2. ``fc_kv`` projects each spatial position to a key and a value. A
	       single learned ``char_token`` scores every position against its
	       key, the scores are soft-maxed over the height axis, and the values
	       are averaged over height with those weights, leaving ``(B, W, C)``.
	    3. ``fc`` maps every column feature to ``out_channels`` logits.

	    NOTE(review): the class name suggests the logits feed a CTC
	    loss/decoder; nothing in this class depends on that.

	    Args:
	        in_channels: Channel count ``C`` of the incoming feature map.
	        out_channels: Size of the classifier output (defaults to 6625).
	        return_feats: When True, ``forward`` returns ``(feats, predicts)``
	            while the module is in training mode. Ignored in eval mode.
	        **kwargs: Accepted and ignored.
	    """

	    def __init__(self,
	                 in_channels: int,
	                 out_channels: int = 6625,
	                 return_feats: bool = False,
	                 **kwargs):
	        super().__init__()

	        # Learned query shared by every sample; scores each spatial position.
	        self.char_token = nn.Parameter(
	            torch.zeros([1, 1, in_channels], dtype=torch.float32),
	            requires_grad=True,
	        )
	        trunc_normal_(self.char_token, mean=0, std=0.02)

	        # Per-column classifier.
	        self.fc = nn.Linear(in_channels, out_channels, bias=True)
	        # Joint key/value projection: first C outputs are keys, last C values.
	        self.fc_kv = nn.Linear(in_channels, 2 * in_channels, bias=True)

	        # Block run along the width axis of each row (see forward).
	        self.w_atten_block = Block(
	            dim=in_channels,
	            num_heads=self._num_heads(in_channels),
	            mlp_ratio=4.0,
	            qkv_bias=False,
	        )

	        self.out_channels = out_channels
	        self.return_feats = return_feats

	    @staticmethod
	    def _num_heads(in_channels: int) -> int:
	        """Return one attention head per 32 channels, but never fewer than 1.

	        A plain ``in_channels // 32`` yields 0 heads for narrow inputs
	        (``in_channels < 32``), which is not a meaningful head count.
	        """
	        return max(1, in_channels // 32)

	    def forward(self, x: torch.Tensor, data=None):
	        """Decode a feature map into per-column class scores.

	        Args:
	            x: Feature map of shape ``(B, C, H, W)``; ``C`` must equal the
	                ``in_channels`` the module was built with.
	            data: Unused.

	        Returns:
	            * eval mode: soft-maxed class probabilities ``(B, W, out_channels)``
	              (``return_feats`` is ignored).
	            * training mode, ``return_feats`` True: the tuple
	              ``(feats, predicts)`` with ``feats`` of shape ``(B, W, C)`` and
	              ``predicts`` the raw logits ``(B, W, out_channels)``.
	            * training mode otherwise: the raw logits only.
	        """
	        B, C, H, W = x.shape

	        # Treat every row as a length-W sequence for the width-wise block,
	        # then restore the channel-first layout.
	        rows = x.permute(0, 2, 3, 1).reshape(-1, W, C)  # b*h, w, c
	        x = self.w_atten_block(rows).reshape(B, H, W, C)
	        x = x.permute(0, 3, 1, 2)  # b, c, h, w

	        # Project every spatial position to a key and a value.
	        x_kv = self.fc_kv(x.flatten(2).transpose(1, 2))  # b, hw, 2c
	        x_kv = x_kv.reshape(B, H * W, 2, C).permute(2, 0, 3, 1)  # 2, b, c, hw
	        x_k, x_v = x_kv.unbind(0)  # each b, c, hw

	        # (1, 1, c) @ (b, c, hw): matmul broadcasts the token over the batch,
	        # so no explicit per-sample copy of the token is needed.
	        attn_ctc2d = (self.char_token @ x_k).reshape([-1, 1, H, W])
	        # Normalise over height: each column gets a distribution over rows.
	        attn_ctc2d = F.softmax(attn_ctc2d, 2)  # b, 1, h, w
	        attn_ctc2d = attn_ctc2d.permute(0, 3, 1, 2)  # b, w, 1, h

	        x_v = x_v.reshape(B, C, H, W).permute(0, 3, 2, 1)  # b, w, h, c
	        # Weighted sum of the values over height -> one feature per column.
	        feats = (attn_ctc2d @ x_v).squeeze(2)  # b, w, c

	        predicts = self.fc(feats)  # b, w, out_channels

	        # Eval mode always yields probabilities, regardless of return_feats.
	        if not self.training:
	            return F.softmax(predicts, dim=2)
	        if self.return_feats:
	            return feats, predicts
	        return predicts