import argparse

import numpy as np
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class CTCLabelConverter(object):
    """ Convert between text-label and text-index """

    def __init__(self, character):
        # character (str): set of the possible characters.
        dict_character = list(character)

        self.dict = {}
        for i, char in enumerate(dict_character):
            # NOTE: 0 is reserved for the '[CTCblank]' token required by CTCLoss.
            self.dict[char] = i + 1

        self.character = ['[CTCblank]'] + dict_character  # dummy '[CTCblank]' token for CTCLoss (index 0)

    def encode(self, text, batch_max_length=25):
        """ Convert text-label into text-index.

        input:
            text: text labels of each image. [batch_size]
            batch_max_length: max length of text label in the batch. 25 by default

        output:
            text: text index for CTCLoss. [batch_size, batch_max_length]
            length: length of each text. [batch_size]
        """
        length = [len(s) for s in text]

        # The index used for padding (=0) does not affect the CTC loss calculation.
        batch_text = torch.LongTensor(len(text), batch_max_length).fill_(0)
        for i, t in enumerate(text):
            indices = [self.dict[char] for char in t]
            batch_text[i][:len(indices)] = torch.LongTensor(indices)
        return (batch_text.to(device), torch.IntTensor(length).to(device))

    def decode(self, text_index, length):
        """ Convert text-index into text-label. """
        texts = []
        for index, l in enumerate(length):
            t = text_index[index, :]

            char_list = []
            for i in range(l):
                # Remove blanks and repeated characters.
                if t[i] != 0 and (not (i > 0 and t[i - 1] == t[i])):
                    char_list.append(self.character[t[i]])
            text = ''.join(char_list)

            texts.append(text)
        return texts


class CTCLabelConverterForBaiduWarpctc(object):
    """ Convert between text-label and text-index for Baidu warp-ctc """

    def __init__(self, character):
        # character (str): set of the possible characters.
        dict_character = list(character)

        self.dict = {}
        for i, char in enumerate(dict_character):
            # NOTE: 0 is reserved for the '[CTCblank]' token required by CTCLoss.
            self.dict[char] = i + 1

        self.character = ['[CTCblank]'] + dict_character  # dummy '[CTCblank]' token for CTCLoss (index 0)

    def encode(self, text, batch_max_length=25):
        """ Convert text-label into text-index.

        input:
            text: text labels of each image. [batch_size]

        output:
            text: concatenated text index for CTCLoss.
                  [sum(text_lengths)] = [text_index_0 + text_index_1 + ... + text_index_(n - 1)]
            length: length of each text. [batch_size]
        """
        length = [len(s) for s in text]
        text = ''.join(text)
        text = [self.dict[char] for char in text]

        return (torch.IntTensor(text), torch.IntTensor(length))

    def decode(self, text_index, length):
        """ Convert text-index into text-label. """
        texts = []
        index = 0
        for l in length:
            t = text_index[index:index + l]

            char_list = []
            for i in range(l):
                # Remove blanks and repeated characters.
                if t[i] != 0 and (not (i > 0 and t[i - 1] == t[i])):
                    char_list.append(self.character[t[i]])
            text = ''.join(char_list)

            texts.append(text)
            index += l
        return texts
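# Usage sketch (illustrative, not part of the original module). The charset and
# labels below are hypothetical. Note that decode() expects the per-timestep
# argmax path predicted by a CTC model, not the encoded labels: a naive
# encode -> decode round trip collapses repeated characters ('hello' -> 'helo'),
# because in CTC only a blank separates genuine duplicates.
#
#   converter = CTCLabelConverter('0123456789abcdefghijklmnopqrstuvwxyz')
#   batch_text, length = converter.encode(['hello', 'ab'], batch_max_length=25)
#   # batch_text: LongTensor [2, 25], zero-padded (0 is the CTC blank index)
#   # length:     IntTensor([5, 2])
#   # decode() of the path [h, e, l, blank, l, o] (length 6) yields 'hello'.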
class AttnLabelConverter(object):
    """ Convert between text-label and text-index """

    def __init__(self, character):
        # character (str): set of the possible characters.
        # [GO] for the start token of the attention decoder. [s] for the end-of-sentence token.
        list_token = ['[GO]', '[s]']  # ['[s]', '[UNK]', '[PAD]', '[GO]']
        list_character = list(character)
        self.character = list_token + list_character

        self.dict = {}
        for i, char in enumerate(self.character):
            # print(i, char)
            self.dict[char] = i

    def encode(self, text, batch_max_length=25):
        """ Convert text-label into text-index.

        input:
            text: text labels of each image. [batch_size]
            batch_max_length: max length of text label in the batch. 25 by default

        output:
            text: the input of the attention decoder. [batch_size x (max_length + 2)]
                  +1 for the [GO] token and +1 for the [s] token.
                  text[:, 0] is the [GO] token, and text is padded with the [GO] token after the [s] token.
            length: the length of the output of the attention decoder, which also counts the [s] token.
                    e.g. [3, 7, ....] [batch_size]
        """
        length = [len(s) + 1 for s in text]  # +1 for [s] at the end of the sentence.
        # batch_max_length = max(length)  # this is not allowed in a multi-GPU setting
        batch_max_length += 1  # additional +1 for [GO] at the first step.
        # batch_text is padded with the [GO] token (index 0) after the [s] token.
        batch_text = torch.LongTensor(len(text), batch_max_length + 1).fill_(0)
        for i, t in enumerate(text):
            chars = list(t)
            chars.append('[s]')
            indices = [self.dict[char] for char in chars]
            batch_text[i][1:1 + len(indices)] = torch.LongTensor(indices)  # batch_text[:, 0] = [GO] token
        return (batch_text.to(device), torch.IntTensor(length).to(device))

    def decode(self, text_index, length):
        """ Convert text-index into text-label. """
        texts = []
        for index, l in enumerate(length):
            text = ''.join([self.character[i] for i in text_index[index, :]])
            texts.append(text)
        return texts


class TokenLabelConverter(object):
    """ Convert between text-label and text-index """

    def __init__(self, opt):
        # character (str): set of the possible characters.
        # [GO] for the start token of the attention decoder. [s] for the end-of-sentence token.
        self.SPACE = '[s]'
        self.GO = '[GO]'
        # self.MASK = '[MASK]'
        # self.list_token = [self.GO, self.SPACE, self.MASK]
        self.list_token = [self.GO, self.SPACE]
        self.character = self.list_token + list(opt.character)

        self.dict = {word: i for i, word in enumerate(self.character)}
        self.batch_max_length = opt.batch_max_length + len(self.list_token)

    def encode(self, text):
        """ Convert text-label into text-index. """
        length = [len(s) + len(self.list_token) for s in text]  # +2 for [GO] and [s] at the end of the sentence.
        batch_text = torch.LongTensor(len(text), self.batch_max_length).fill_(self.dict[self.GO])
        for i, t in enumerate(text):
            txt = [self.GO] + list(t) + [self.SPACE]
            txt = [self.dict[char] for char in txt]
            # Masked-token training (disabled); kept for reference:
            # prob = np.random.uniform()
            # mask_len = round(len(list(t)) * 0.15)
            # if is_train and mask_len > 0:
            #     for m in range(mask_len):
            #         index = np.random.randint(1, len(t) + 1)
            #         prob = np.random.uniform()
            #         if prob > 0.2:
            #             text[index] = self.dict[self.MASK]
            #             batch_weights[i][index] = 1.
            #         elif prob > 0.1:
            #             char_index = np.random.randint(len(self.list_token), len(self.character))
            #             text[index] = self.dict[self.character[char_index]]
            #             batch_weights[i][index] = 1.
            batch_text[i][:len(txt)] = torch.LongTensor(txt)  # batch_text[:, 0] = [GO] token
        return batch_text.to(device)

    def decode(self, text_index, length):
        """ Convert text-index into text-label. """
        texts = []
        for index, l in enumerate(length):
            text = ''.join([self.character[i] for i in text_index[index, :]])
            texts.append(text)
        return texts
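# Usage sketch (illustrative, not part of the original module). The opt object
# below is a hypothetical stand-in for the argparse namespace from get_args():
#
#   from types import SimpleNamespace
#   opt = SimpleNamespace(character='0123456789abcdefghijklmnopqrstuvwxyz',
#                         batch_max_length=25)
#   converter = TokenLabelConverter(opt)
#   target = converter.encode(['hello'])
#   # target: LongTensor [1, 27] laid out as [GO] h e l l o [s] [GO] [GO] ...,
#   # i.e. batch_max_length (25) plus the two special tokens.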
class SRNConverter(object):
    """ Convert between text-label and text-index """

    def __init__(self, character, PAD=36):
        # character (str): set of the possible characters.
        # list_token = ['[GO]', '[s]']  # ['[s]', '[UNK]', '[PAD]', '[GO]']
        list_character = list(character)
        self.character = list_character
        self.PAD = PAD

        self.dict = {}
        for i, char in enumerate(self.character):
            # print(i, char)
            self.dict[char] = i

    def encode(self, text, batch_max_length=25):
        """ Convert text-label into text-index.

        input:
            text: text labels of each image. [batch_size]
            batch_max_length: max length of text label in the batch. 25 by default

        output:
            text: text index padded with self.PAD after the [EOS] token. [batch_size x (max_length + 1)]
            length: the length of each text, which also counts the [EOS] token. [batch_size]
        """
        length = [len(s) + 1 for s in text]  # +1 for [EOS] at the end of the sentence.
        batch_text = torch.LongTensor(len(text), batch_max_length + 1).fill_(self.PAD)
        # mask_text = torch.LongTensor(len(text), batch_max_length).fill_(0)
        for i, t in enumerate(text):
            chars = list(t + self.character[-2])  # append the [EOS] character (second to last in the charset).
            indices = [self.dict[char] for char in chars]
            # t_mask = [1 for i in range(len(indices) + 1)]
            batch_text[i][0:len(indices)] = torch.LongTensor(indices)  # batch_text[:, len_text] = [EOS] token
            # mask_text[i][0:len(indices) + 1] = torch.LongTensor(t_mask)
        return (batch_text.to(device), torch.IntTensor(length).to(device))

    def decode(self, text_index, length):
        """ Convert text-index into text-label. """
        texts = []
        for index, l in enumerate(length):
            text = ''.join([self.character[i] for i in text_index[index, :]])
            idx = text.find('$')  # '$' serves as the [EOS] symbol in the charset.
            texts.append(text[:idx])
        return texts


class Averager(object):
    """ Compute average for torch.Tensor, used for loss average. """

    def __init__(self):
        self.reset()

    def add(self, v):
        count = v.data.numel()
        v = v.data.sum()
        self.n_count += count
        self.sum += v

    def reset(self):
        self.n_count = 0
        self.sum = 0

    def val(self):
        res = 0
        if self.n_count != 0:
            res = self.sum / float(self.n_count)
        return res


class AccuracyMeter(object):
    def __init__(self):
        self.hit = 0
        self.total = 0
        self.reset()

    # Important: call this after calling getAccuracy().
    def reset(self):
        self.hit = 0
        self.total = 0

    # boolVal - if the condition is hit (True), count it as a hit.
    def applyHit(self, boolVal):
        if boolVal:
            self.hit += 1
        self.total += 1

    def getAccuracy(self):
        """ Returns accuracy in the range [0, 1], or -1 if the number of items is 0. """
        if self.total == 0:
            return -1
        return float(self.hit) / self.total


def get_device(verbose=True):
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if verbose:
        print("Device:", device)
    return device
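# Usage sketch (illustrative, not part of the original module):
#
#   meter = AccuracyMeter()
#   for pred, gt in [('hello', 'hello'), ('he1lo', 'hello')]:
#       meter.applyHit(pred == gt)
#   print(meter.getAccuracy())  # 0.5
#   meter.reset()  # reset before reusing the meter
#
#   avg = Averager()
#   avg.add(torch.tensor([2.0, 4.0]))  # accumulates sum=6.0 over 2 elements
#   print(avg.val())  # tensor(3.)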
def get_args(is_train=True, model=None):
    parser = argparse.ArgumentParser(description='STR')

    # for test
    parser.add_argument('--eval_data', help='path to evaluation dataset')
    parser.add_argument('--benchmark_all_eval', action='store_true', help='evaluate on the 10 benchmark evaluation datasets')
    parser.add_argument('--calculate_infer_time', action='store_true', help='calculate inference timing')
    parser.add_argument('--flops', action='store_true', help='calculate approximate FLOPS (may not work)')

    # for train
    parser.add_argument('--exp_name', help='where to store logs and models')
    parser.add_argument('--train_data', required=is_train, help='path to training dataset')
    parser.add_argument('--valid_data', required=is_train, help='path to validation dataset')
    parser.add_argument('--manualSeed', type=int, default=1111, help='for random seed setting')
    parser.add_argument('--workers', type=int, default=4, help='number of data loading workers. Use -1 to use all cores.')
    parser.add_argument('--batch_size', type=int, default=192, help='input batch size')
    parser.add_argument('--num_iter', type=int, default=300000, help='number of iterations to train for')
    parser.add_argument('--valInterval', type=int, default=2000, help='interval between each validation')
    parser.add_argument('--saved_model', default='', help='path to model to continue training')
    parser.add_argument('--FT', action='store_true', help='whether to do fine-tuning')
    parser.add_argument('--sgd', action='store_true', help='whether to use SGD (default is Adadelta)')
    parser.add_argument('--adam', action='store_true', help='whether to use Adam (default is Adadelta)')
    parser.add_argument('--lr', type=float, default=1, help='learning rate, default=1.0 for Adadelta')
    parser.add_argument('--beta1', type=float, default=0.9, help='beta1 for Adam. default=0.9')
    parser.add_argument('--rho', type=float, default=0.95, help='decay rate rho for Adadelta. default=0.95')
    parser.add_argument('--eps', type=float, default=1e-8, help='eps for Adadelta. default=1e-8')
    parser.add_argument('--grad_clip', type=float, default=5, help='gradient clipping value. default=5')
    parser.add_argument('--baiduCTC', action='store_true', help='use Baidu warp-ctc for the CTC loss')

    """ Data processing """
    parser.add_argument('--select_data', type=str, default='MJ-ST',
                        help='select training data (default is MJ-ST, which means MJ and ST are used as training data)')
    parser.add_argument('--batch_ratio', type=str, default='0.5-0.5',
                        help='assign ratio for each selected data in the batch')
    parser.add_argument('--total_data_usage_ratio', type=str, default='1.0',
                        help='total data usage ratio; this ratio is multiplied by the total number of data.')
    parser.add_argument('--inf_outdir', type=str, default='outdir', help='output directory of the influence function')
    parser.add_argument('--inf_mode', type=str, default='Normal', help='Normal, VanGrad, SHAP')
    parser.add_argument('--shap_pkl_root', type=str, default='',
                        help='required argument when --inf_mode is SHAP. Remove the trailing forward slash.')
    parser.add_argument('--char_contrib_amnt', type=float, default=2.0,
                        help='multiplier on the first character for contribution calculation. Min: 1.0. Set to -1.0 to deactivate.')
    # If --scorer is NA, the STR scorer will just output the single-char-index one-hot.
    parser.add_argument('--scorer', type=str, default='mean', help='see STRScore: cumprod | mean')
    parser.add_argument('--blackbg', action='store_true',
                        help='if set, the background color for covering features will be black (0)')
    parser.add_argument('--shap_eval', action='store_true', help='always set this if you want to run test_shap.py')
    parser.add_argument('--influence_train', action='store_true',
                        help='if set, trains the pretrained model with influence harmful/helpful')
    parser.add_argument('--selective_sample_str', type=str, default='',
                        help="if not '', only sample images with a string matching this (see --sensitive for case sensitivity)")
    parser.add_argument('--max_selective_list', type=int, default=-1,
                        help='if the selective sample list has more elements than this, auto-clear the list for batch selection')
    parser.add_argument('--confidence_mode', type=int, default=0, help='0 - sum of argmax; 1 - edit distance')
    parser.add_argument('--batch_max_length', type=int, default=25, help='maximum label length')
    parser.add_argument('--imgH', type=int, default=32, help='the height of the input image')
    parser.add_argument('--imgW', type=int, default=100, help='the width of the input image')
    parser.add_argument('--rgb', action='store_true', help='use rgb input')
    parser.add_argument('--character', type=str, default='0123456789abcdefghijklmnopqrstuvwxyz', help='character label')
    parser.add_argument('--sensitive', action='store_true', help='for case-sensitive character mode')
    parser.add_argument('--ignore_case_sensitivity', action='store_true', help='use this only for SHAP testing')
    parser.add_argument('--PAD', action='store_true', help='whether to keep the aspect ratio then pad for image resize')
    parser.add_argument('--data_filtering_off', action='store_true', help='for data_filtering_off mode')

    """ Model Architecture """
    parser.add_argument('--Transformer', action='store_true', help='use the end-to-end transformer')
    choices = ["vitstr_tiny_patch16_224", "vitstr_small_patch16_224", "vitstr_base_patch16_224",
               "vitstr_tiny_distilled_patch16_224", "vitstr_small_distilled_patch16_224"]
    parser.add_argument('--TransformerModel', default=choices[0], help='which ViT/DeiT transformer model', choices=choices)
    parser.add_argument('--Transformation', type=str, help='Transformation stage. None|TPS')
    parser.add_argument('--FeatureExtraction', type=str, help='FeatureExtraction stage. VGG|RCNN|ResNet')
    parser.add_argument('--SequenceModeling', type=str, help='SequenceModeling stage. None|BiLSTM')
    parser.add_argument('--Prediction', type=str, help='Prediction stage. None|CTC|Attn')
    parser.add_argument('--num_fiducial', type=int, default=20, help='number of fiducial points of TPS-STN')
    parser.add_argument('--input_channel', type=int, default=1, help='the number of input channels of the feature extractor')
    parser.add_argument('--output_channel', type=int, default=512, help='the number of output channels of the feature extractor')
    parser.add_argument('--hidden_size', type=int, default=256, help='the size of the LSTM hidden state')

    # selective augmentation (individual)
    # can choose specific data augmentations
    parser.add_argument('--issel_aug', action='store_true', help='select augs')
    parser.add_argument('--sel_prob', type=float, default=1., help='probability of applying augmentation')
    parser.add_argument('--pattern', action='store_true', help='Pattern group')
    parser.add_argument('--warp', action='store_true', help='Warp group')
    parser.add_argument('--geometry', action='store_true', help='Geometry group')
    parser.add_argument('--weather', action='store_true', help='Weather group')
    parser.add_argument('--noise', action='store_true', help='Noise group')
    parser.add_argument('--blur', action='store_true', help='Blur group')
    parser.add_argument('--camera', action='store_true', help='Camera group')
    parser.add_argument('--process', action='store_true', help='image processing routines')
    parser.add_argument('--min_rand', type=int, default=0, help='minimum magnitude for aug (inclusive)')
    parser.add_argument('--max_rand', type=int, default=3, help='maximum magnitude for aug (exclusive)')

    # use cosine learning rate decay
    parser.add_argument('--scheduler', action='store_true', help='use lr scheduler')
    parser.add_argument('--intact_prob', type=float, default=0.5, help='probability of not applying augmentation')
    parser.add_argument('--isrand_aug', action='store_true', help='use RandAug')
    parser.add_argument('--isshap_aug', action='store_true', help='use SHAPAug')
    parser.add_argument('--augs_num', type=int, default=3, help='number of data augmentation groups to apply. 1 to 8.')
    parser.add_argument('--augs_mag', type=int, default=None, help='magnitude of data augmentation groups to apply. None if random.')

    # for comparison to other augmentations
    parser.add_argument('--issemantic_aug', action='store_true', help='use Semantic')
    parser.add_argument('--isrotation_aug', action='store_true', help='use Rotation')
    parser.add_argument('--isscatter_aug', action='store_true', help='use Scatter')
    parser.add_argument('--islearning_aug', action='store_true', help='use Learning-based augmentation')

    # the original paper uses this for fast benchmarking
    parser.add_argument('--fast_acc', action='store_true', help='fast average accuracy computation')

    args = parser.parse_args()
    return args
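# Usage sketch (illustrative, not part of the original module). A hypothetical
# training invocation; the script name and dataset paths are placeholders:
#
#   python train.py --train_data data_lmdb/training --valid_data data_lmdb/validation \
#       --Transformer --TransformerModel vitstr_tiny_patch16_224 \
#       --imgH 224 --imgW 224 --batch_max_length 25
#
# In code, the returned namespace feeds the converters defined above, e.g.:
#
#   opt = get_args(is_train=True)
#   if opt.Transformer:
#       converter = TokenLabelConverter(opt)
#   elif opt.Prediction == 'CTC':
#       converter = CTCLabelConverter(opt.character)
#   else:
#       converter = AttnLabelConverter(opt.character)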