# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" | |
This code is refer from: | |
https://github.com/roatienza/deep-text-recognition-benchmark/blob/master/modules/vitstr.py | |
""" | |
import numpy as np
import paddle
import paddle.nn as nn

from ppocr.modeling.backbones.rec_svtrnet import Block, PatchEmbed, zeros_, trunc_normal_, ones_

# Model scale -> [embedding dimension, number of attention heads]
scale_dim_heads = {'tiny': [192, 3], 'small': [384, 6], 'base': [768, 12]}


class ViTSTR(nn.Layer):
    def __init__(self,
                 img_size=[224, 224],
                 in_channels=1,
                 scale='tiny',
                 seqlen=27,
                 patch_size=[16, 16],
                 embed_dim=None,
                 depth=12,
                 num_heads=None,
                 mlp_ratio=4,
                 qkv_bias=True,
                 qk_scale=None,
                 drop_path_rate=0.,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 norm_layer='nn.LayerNorm',
                 act_layer='nn.GELU',
                 epsilon=1e-6,
                 out_channels=None,
                 **kwargs):
        super().__init__()
        self.seqlen = seqlen
        embed_dim = embed_dim if embed_dim is not None else scale_dim_heads[
            scale][0]
        num_heads = num_heads if num_heads is not None else scale_dim_heads[
            scale][1]
        out_channels = out_channels if out_channels is not None else embed_dim
        self.patch_embed = PatchEmbed(
            img_size=img_size,
            in_channels=in_channels,
            embed_dim=embed_dim,
            patch_size=patch_size,
            mode='linear')
        num_patches = self.patch_embed.num_patches
        # Learnable positional embedding (patches + class token) and the
        # class token itself.
        self.pos_embed = self.create_parameter(
            shape=[1, num_patches + 1, embed_dim], default_initializer=zeros_)
        self.add_parameter("pos_embed", self.pos_embed)
        self.cls_token = self.create_parameter(
            shape=[1, 1, embed_dim], default_initializer=zeros_)
        self.add_parameter("cls_token", self.cls_token)
        self.pos_drop = nn.Dropout(p=drop_rate)
        # Stochastic depth: drop-path rate increases linearly across blocks.
        dpr = np.linspace(0, drop_path_rate, depth)

        self.blocks = nn.LayerList([
            Block(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr[i],
                norm_layer=norm_layer,
                act_layer=eval(act_layer),
                epsilon=epsilon,
                prenorm=False) for i in range(depth)
        ])
        self.norm = eval(norm_layer)(embed_dim, epsilon=epsilon)
        self.out_channels = out_channels

        trunc_normal_(self.pos_embed)
        trunc_normal_(self.cls_token)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight)
            if isinstance(m, nn.Linear) and m.bias is not None:
                zeros_(m.bias)
        elif isinstance(m, nn.LayerNorm):
            zeros_(m.bias)
            ones_(m.weight)

    def forward_features(self, x):
        B = x.shape[0]
        x = self.patch_embed(x)
        # Prepend the class token, add positional embeddings, then run the
        # transformer blocks.
        cls_tokens = paddle.tile(self.cls_token, repeat_times=[B, 1, 1])
        x = paddle.concat((cls_tokens, x), axis=1)
        x = x + self.pos_embed
        x = self.pos_drop(x)
        for blk in self.blocks:
            x = blk(x)
        x = self.norm(x)
        return x

    def forward(self, x):
        x = self.forward_features(x)
        # Keep only the first `seqlen` tokens and reshape to
        # [N, embed_dim, 1, seqlen] for the recognition head.
        x = x[:, :self.seqlen]
        return x.transpose([0, 2, 1]).unsqueeze(2)
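

# --- Usage sketch (illustrative, not part of the original module) ----------
# A minimal example of driving this backbone on a dummy grayscale image,
# assuming the `ppocr` package is importable so that `Block`/`PatchEmbed`
# resolve. With the defaults above (224x224 input, 16x16 patches, 'tiny'
# scale, seqlen=27), the output shape should be [N, embed_dim, 1, seqlen],
# i.e. [1, 192, 1, 27] for a single image.
if __name__ == "__main__":
    model = ViTSTR(scale='tiny')             # embed_dim=192, num_heads=3
    dummy = paddle.randn([1, 1, 224, 224])   # [N, C, H, W], single channel
    out = model(dummy)
    print(out.shape)                         # expected: [1, 192, 1, 27]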